def et(et_dataloader, max_et_unseen_acc, et_label_list,
       et_hypo_seen_str_indicator, et_hypo_2_type_index):
    model.eval()
    et_loss, et_step, preds = 0, 0, []
    for input_ids, input_mask, segment_ids, label_ids in et_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)
        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask, labels=None)[0]
        tmp_et_loss = CrossEntropyLoss()(logits.view(-1, num_labels),
                                         label_ids.view(-1))
        et_loss += tmp_et_loss.mean().item()
        et_step += 1
        if len(preds) == 0:
            # detach() cuts the tensor out of the autograd graph, so
            # backpropagation stops here; cpu() copies the data from GPU to
            # host memory (the counterpart of cuda()).
            preds.append(logits.detach().cpu().numpy())
        else:
            preds[0] = np.append(preds[0],
                                 logits.detach().cpu().numpy(),
                                 axis=0)
    et_loss = et_loss / et_step
    preds = preds[0]
    '''
    preds: size*2 (entail, not_entail)
    wenpeng added a softmax so that each row is a prob vec
    '''
    pred_probs = softmax(preds, axis=1)[:, 0]
    pred_binary_labels_harsh, pred_binary_labels_loose = [], []
    for i in range(preds.shape[0]):
        # Label 0 = entail. "Harsh" requires the entail score to win by a
        # 0.1 margin; "loose" only requires it to win.
        pred_binary_labels_harsh.append(
            0 if preds[i][0] > preds[i][1] + 0.1 else 1)
        pred_binary_labels_loose.append(
            0 if preds[i][0] > preds[i][1] else 1)

    seen_acc, unseen_acc = evaluate_emotion_zeroshot_TwpPhasePred(
        pred_probs, pred_binary_labels_harsh, pred_binary_labels_loose,
        et_label_list, et_hypo_seen_str_indicator, et_hypo_2_type_index,
        seen_types)
    # result = compute_metrics('F1', preds, all_label_ids.numpy())
    loss = train_loss / train_step if args.do_train else None
    # test_acc = mean_f1#result.get("f1")
    if unseen_acc > max_et_unseen_acc:
        max_et_unseen_acc = unseen_acc
    print('seen_f1:{} unseen_f1:{} max_unseen_f1:{}'.format(
        seen_acc, unseen_acc, max_et_unseen_acc))
    return max_et_unseen_acc
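The per-row loop above can be collapsed into two vectorized comparisons. A minimal numpy sketch (binarize_entailment is our name, not part of the original code; the 0.1 margin is taken from the loop above):

import numpy as np


def binarize_entailment(preds, margin=0.1):
    """Return (harsh, loose) 0/1 labels for an N x 2 (entail, not_entail) array.

    Label 0 = entail. "Harsh" demands the entail score beat the not-entail
    score by `margin`; "loose" only demands that it win.
    """
    harsh = np.where(preds[:, 0] > preds[:, 1] + margin, 0, 1)
    loose = np.where(preds[:, 0] > preds[:, 1], 0, 1)
    return harsh.tolist(), loose.tolist()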
Example #2
    def do_eval(eval_features, eval_examples):
        """Do evaluation on the current model."""

        # Log some information.
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)

        # Get the eval data and create a sequential dataloader.
        eval_data = create_tensor_dataset(eval_features)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # Set the model to eval mode (disable dropout)
        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        preds = []
        out_label_ids = None

        # Iterate over the evaluation data.
        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            # Forward pass with deactivated autograd engine.
            with torch.no_grad():
                logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)

            # Calculate eval loss.
            tmp_eval_loss = CrossEntropyLoss()(logits.view(-1, num_labels), label_ids.view(-1))
            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
                out_label_ids = label_ids.detach().cpu().numpy()
            else:
                preds[0] = np.append(
                    preds[0], logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(
                    out_label_ids, label_ids.detach().cpu().numpy(), axis=0)

        # Calculate the mean loss and get all predictions.
        eval_loss = eval_loss / nb_eval_steps
        loss = tr_loss / global_step if args.do_train else None
        preds = preds[0]
        preds = np.argmax(preds, axis=1)
        # Compute the metrics for the given task
        result = compute_metrics(task_name, preds, out_label_ids)

        # Save additional information in the result dict.
        result['eval_loss'] = eval_loss
        result['global_step'] = global_step
        result['loss'] = loss
        # Save all settings for external evaluation
        result['_task'] = task_name
        result['_input_mode'] = args.input_to_use
        result['_learning_rate'] = args.learning_rate
        result['_bert-model'] = args.bert_model
        result['_batch_size'] = args.train_batch_size
        result['_warmup'] = args.warmup_proportion
        result['_num_epochs'] = args.num_train_epochs
        result['_seq_len'] = args.max_seq_length
        result['_seed'] = args.seed
        result['_gradient_acc'] = args.gradient_accumulation_steps

        return result, preds
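create_tensor_dataset is referenced above but not shown in this excerpt. A plausible implementation, assuming the same InputFeatures fields used throughout this file (input_ids, input_mask, segment_ids, label_id):

import torch
from torch.utils.data import TensorDataset


def create_tensor_dataset(features):
    """Pack a list of InputFeatures into a TensorDataset (assumed helper)."""
    return TensorDataset(
        torch.tensor([f.input_ids for f in features], dtype=torch.long),
        torch.tensor([f.input_mask for f in features], dtype=torch.long),
        torch.tensor([f.segment_ids for f in features], dtype=torch.long),
        torch.tensor([f.label_id for f in features], dtype=torch.long))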
Example #3
def compute_td_loss(current_model, target_model, batch_size, replay_buffer,
                    per, use_cpp_buffer, use_async_rb, optimizer, gamma,
                    memory_mgr, robust, **kwargs):
    t = time.time()
    dtype = kwargs['dtype']
    if per:
        buffer_beta = kwargs['buffer_beta']
        if use_async_rb:
            if not replay_buffer.sample_available():
                replay_buffer.async_sample(batch_size, buffer_beta)
            res = replay_buffer.wait_sample()
            replay_buffer.async_sample(batch_size, buffer_beta)
        else:
            res = replay_buffer.sample(batch_size, buffer_beta)
        if use_cpp_buffer:
            state, action, reward, next_state, done, indices, weights = res[
                'obs'], res['act'], res['rew'], res['next_obs'], res[
                    'done'], res['indexes'], res['weights']
        else:
            state, action, reward, next_state, done, weights, indices = res[
                0], res[1], res[2], res[3], res[4], res[5], res[6]
    else:
        if use_async_rb:
            if not replay_buffer.sample_available():  # prime the async pipeline (mirrors the PER branch)
                replay_buffer.async_sample(batch_size)
            res = replay_buffer.wait_sample()
            replay_buffer.async_sample(batch_size)
        else:
            res = replay_buffer.sample(batch_size)
        if use_cpp_buffer:
            state, action, reward, next_state, done = res['obs'], res[
                'act'], res['rew'], res['next_obs'], res['done']
        else:
            state, action, reward, next_state, done = res[0], res[1], res[
                2], res[3], res[4]
    if use_cpp_buffer and not use_async_rb:
        action = action.transpose()[0].astype(int)
        reward = reward.transpose()[0].astype(int)
        done = done.transpose()[0].astype(int)
    log_time('sample_time', time.time() - t)

    t = time.time()
    weights_norm = 0.0  # stays 0.0 unless prioritized replay (per) is enabled
    if per:
        numpy_weights = weights  # numpy copy kept for norm logging below
        state, next_state, action, reward, done, weights = memory_mgr.get_cuda_tensors(
            state, next_state, action, reward, done, weights)
    else:
        state, next_state, action, reward, done = memory_mgr.get_cuda_tensors(
            state, next_state, action, reward, done)

    bound_solver = kwargs.get('bound_solver', 'cov')
    optimizer.zero_grad()

    state = state.to(torch.float)
    next_state = next_state.to(torch.float)
    # Normalize pixel inputs to [0, 1]
    if dtype in UINTS:
        state /= 255
        next_state /= 255
        state_max = 1.0
        state_min = 0.0
    else:
        state_max = float('inf')
        state_min = float('-inf')
    beta = kwargs.get('beta', 0)

    if robust and bound_solver != 'pgd':
        cur_q_logits = current_model(state, method_opt="forward")
        tgt_next_q_logits = target_model(next_state, method_opt="forward")
    else:
        cur_q_logits = current_model(state)
        tgt_next_q_logits = target_model(next_state)
    if robust:
        eps = kwargs['eps']
    cur_q_value = cur_q_logits.gather(1, action.unsqueeze(1)).squeeze(1)

    tgt_next_q_value = tgt_next_q_logits.max(1)[0]
    expected_q_value = reward + gamma * tgt_next_q_value * (1 - done)
    '''
    # Merge two states into one batch
    state = state.to(torch.float)
    if dtype in UINTS:
        state /= 255
    state_and_next_state = torch.cat((state, next_state), 0)
    logits = current_model(state_and_next_state)
    cur_q_logits = logits[:state.size(0)]
    cur_next_q_logits = logits[state.size(0):]
    tgt_next_q_value  = tgt_next_q_logits.gather(1, torch.max(cur_next_q_logits, 1)[1].unsqueeze(1)).squeeze(1)
    '''
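    # The commented-out block above is the Double DQN target: the current
    # network picks the argmax action and the target network values it, which
    # reduces the overestimation bias of the plain max() target used for
    # tgt_next_q_value above.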

    if kwargs['natural_loss_fn'] == 'huber':
        loss_fn = torch.nn.SmoothL1Loss(reduction='none')
        loss = loss_fn(cur_q_value, expected_q_value.detach())
    else:
        loss = (cur_q_value - expected_q_value.detach()).pow(2)
    if per:
        loss = loss * weights
        prios = loss + 1e-5
        weights_norm = np.linalg.norm(numpy_weights)

    batch_cur_q_value = torch.mean(cur_q_value)
    batch_exp_q_value = torch.mean(expected_q_value)
    loss = loss.mean()
    td_loss = loss.clone()

    if robust:
        if eps < np.finfo(np.float32).tiny:
            reg_loss = torch.zeros(state.size(0))
            if USE_CUDA:
                reg_loss = reg_loss.cuda()
            if bound_solver == 'pgd':
                labels = torch.argmax(cur_q_logits, dim=1).clone().detach()
                adv_margin = ori_margin = logits_margin(
                    current_model.forward(state), labels)
                optimizer.zero_grad()
        else:
            if bound_solver != 'pgd':
                sa = kwargs.get('sa', None)
                pred = cur_q_logits
                labels = torch.argmax(pred, dim=1).clone().detach()
                c = torch.eye(current_model.num_actions).type_as(
                    state)[labels].unsqueeze(1) - torch.eye(
                        current_model.num_actions).type_as(state).unsqueeze(0)
                I = (~(labels.data.unsqueeze(1) == torch.arange(
                    current_model.num_actions).type_as(
                        labels.data).unsqueeze(0)))
                c = (c[I].view(state.size(0), current_model.num_actions - 1,
                               current_model.num_actions))
                sa_labels = sa[labels]
                lb_s = torch.zeros(state.size(0), current_model.num_actions)
                if USE_CUDA:
                    labels = labels.cuda()
                    c = c.cuda()
                    sa_labels = sa_labels.cuda()
                    lb_s = lb_s.cuda()
                env_id = kwargs.get('env_id', '')
                if env_id == 'Acrobot-v1':
                    eps_v = get_acrobot_eps(eps)
                    if USE_CUDA:
                        eps_v = eps_v.cuda()
                else:
                    eps_v = eps
                state_ub = torch.clamp(state + eps_v, max=state_max)
                state_lb = torch.clamp(state - eps_v, min=state_min)

                lb = get_logits_lower_bound(current_model, state, state_ub,
                                            state_lb, eps_v, c, beta)

                hinge = kwargs.get('hinge', False)
                if hinge:
                    reg_loss, _ = torch.min(lb, dim=1)
                    hinge_c = kwargs.get('hinge_c', 1)
                    reg_loss = torch.clamp(reg_loss, max=hinge_c)
                    reg_loss = -reg_loss
                else:
                    lb = lb_s.scatter(1, sa_labels, lb)
                    reg_loss = CrossEntropyLoss()(-lb, labels)
            else:
                labels = torch.argmax(cur_q_logits, dim=1).clone().detach()
                hinge_c = kwargs.get('hinge_c', 1)
                adv_state = attack(current_model, state,
                                   kwargs['attack_config'], logits_margin)
                optimizer.zero_grad()
                adv_margin = logits_margin(current_model.forward(adv_state),
                                           labels)
                ori_margin = logits_margin(current_model.forward(state),
                                           labels)
                reg_loss = torch.clamp(adv_margin, min=-hinge_c)

        if per:
            reg_loss = reg_loss * weights
        reg_loss = reg_loss.mean()
        kappa = kwargs['kappa']
        loss += kappa * reg_loss

    loss.backward()

    # Gradient clipping.
    grad_norm = 0.0
    max_norm = kwargs['grad_clip']
    if max_norm > 0:
        parameters = list(current_model.parameters())  # list(): a bare generator would be exhausted after the first loop
        for p in parameters:
            grad_norm += p.grad.data.norm(2).item()**2
        grad_norm = np.sqrt(grad_norm)
        clip_coef = max_norm / (grad_norm + 1e-6)
        if clip_coef < 1:
            for p in parameters:
                p.grad.data.mul_(clip_coef)
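    # Note: the loop above reimplements what PyTorch ships as
    # torch.nn.utils.clip_grad_norm_; an equivalent call (up to the 1e-6
    # fudge factor) would be:
    #     grad_norm = torch.nn.utils.clip_grad_norm_(current_model.parameters(), max_norm)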

    # update weights
    optimizer.step()

    nn_time = time.time() - t
    log_time('nn_time', nn_time)
    t = time.time()
    if per:
        replay_buffer.update_priorities(indices, prios.data.cpu().numpy())
    log_time('reweight_time', time.time() - t)

    res = (loss, grad_norm, weights_norm, td_loss, batch_cur_q_value,
           batch_exp_q_value)
    if robust:
        if bound_solver == 'pgd':
            res += (ori_margin, adv_margin)
        res += (reg_loss, )
    return res
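logits_margin (and the attack helper) are referenced above but not defined in this excerpt. One common formulation of the margin, given here as an assumption rather than the authors' exact code, is the gap between the best non-labeled logit and the labeled logit; the PGD attack maximizes it to flip the greedy action:

import torch


def logits_margin(logits, labels):
    """Per-example margin of the runner-up logit over the labeled logit (assumed helper)."""
    # Mask the labeled entry with -inf so max() returns the best *other* logit.
    masked = logits.scatter(1, labels.unsqueeze(1), float('-inf'))
    runner_up = masked.max(dim=1).values
    labeled = logits.gather(1, labels.unsqueeze(1)).squeeze(1)
    return runner_up - labeled  # negative while the labeled action still wins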
Example #4
def distill(args,
            output_model_file,
            processor,
            label_list,
            tokenizer,
            device,
            n_gpu,
            tensorboard_logger,
            eval_data=None):
    assert args.kd_policy is not None
    model = args.kd_policy.student
    args.kd_policy.teacher.eval()
    num_labels = len(args.labels)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    save_best_model = eval_data is not None and args.eval_interval > 0

    train_examples = processor.get_train_examples(args.data_dir)
    num_train_steps = int(
        len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
    optimizer, t_total = get_optimizer(args, model, num_train_steps)

    train_data = prepare(args, processor, label_list, tokenizer, 'train')
    logger.info("***** Running distillation *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    train_steps = 0
    best_eval_accuracy = 0
    for epoch in trange(int(args.num_train_epochs), desc="Epoch", dynamic_ncols=True):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        args.kd_policy.on_epoch_begin(model, None, None)

        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", dynamic_ncols=True)):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            model.train()
            logits = args.kd_policy.forward(input_ids, segment_ids, input_mask)
            loss = CrossEntropyLoss()(logits.view(-1, num_labels), label_ids.view(-1))
            loss = args.kd_policy.before_backward_pass(model, epoch, None, None, loss, None).overall_loss
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()

            train_steps += 1
            tensorboard_logger.add_scalar('distillation_train_loss', loss.item(), train_steps)

            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                # modify learning rate with special warm up BERT uses
                lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

            if save_best_model and train_steps % args.eval_interval == 0:
                eval_loss, eval_accuracy, _ = eval(args, model, eval_data, device, verbose=False)
                tensorboard_logger.add_scalar('distillation_dev_loss', eval_loss, train_steps)
                tensorboard_logger.add_scalar('distillation_dev_accuracy', eval_accuracy, train_steps)
                if eval_accuracy > best_eval_accuracy:
                    save_model(model, output_model_file)
                    best_eval_accuracy = eval_accuracy

        args.kd_policy.on_epoch_end(model, None, None)

    if save_best_model:
        eval_loss, eval_accuracy, _ = eval(args, model, eval_data, device, verbose=False)
        if eval_accuracy > best_eval_accuracy:
            save_model(model, output_model_file)
    else:
        save_model(model, output_model_file)

    return global_step, tr_loss / nb_tr_steps
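warmup_linear, used in the learning-rate update above, is not defined in this excerpt. The schedule from the original pytorch-pretrained-bert examples, which this code appears to follow, ramps linearly up to the peak rate and then decays linearly:

def warmup_linear(x, warmup=0.002):
    """x is the fraction of training completed: linear warmup, then linear decay."""
    if x < warmup:
        return x / warmup
    return 1.0 - x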
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="输入数据dir。应该包含任务的.tsv文件(或其他数据文件)。")
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help=
        "Bert pre-trained model selected in the list: bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese."
    )
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="训练任务的名称")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="将写入模型预测和checkpoints的输出目录。 ")
    parser.add_argument("--cache_dir",
                        default="",
                        type=str,
                        help="您希望将从s3下载的预训练模型存储在何处")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="WordPiece tokenization 后输入序列的最大总长度,大于这个的序列将被截断,小于的padded")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="如果您使用的是uncased模型,请设置此标志。")
    parser.add_argument("--train_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=256,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% of training."
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.0 (default value): dynamic loss scaling.Positive power of 2: static loss scaling value.\n"
    )
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    # if args.server_ip and args.server_port:
    #     # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
    #     import ptvsd
    #     print("Waiting for debugger attach")
    #     ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
    #     ptvsd.wait_for_attach()

    processors = {
        # "cola": ColaProcessor,
        # "mnli": MnliProcessor,
        # "mnli-mm": MnliMismatchedProcessor,
        # "mrpc": MrpcProcessor,
        # "sst-2": Sst2Processor,
        # "sts-b": StsbProcessor,
        # "qqp": QqpProcessor,
        # "qnli": QnliProcessor,
        "rte": RteProcessor
        # "wnli": WnliProcessor,
    }

    output_modes = {
        # "cola": "classification",
        # "mnli": "classification",
        # "mrpc": "classification",
        # "sst-2": "classification",
        # "sts-b": "regression",
        # "qqp": "classification",
        # "qnli": "classification",
        "rte": "classification"
        # "wnli": "classification",
    }

    if args.local_rank == -1 or args.no_cuda:  # no distributed rank given, or CUDA disabled
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:  # distributed training
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1  # each process drives a single GPU in distributed mode
        # Initializes the distributed backend, which takes care of synchronizing nodes/GPUs.
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # Gradient accumulation trades GPU memory for extra forward passes. Example:
    # with batch size 10 and 1000 examples, training takes 100 steps and 100
    # gradient updates. Setting gradient_accumulation_steps=2 shrinks the
    # per-step batch to 10/2=5, so two passes are needed per 10 examples; the
    # number of gradient updates stays 100 while train_steps becomes 200.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
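    # A minimal sketch of the accumulation pattern this enables (hypothetical
    # names; accum stands for args.gradient_accumulation_steps):
    #
    #     for step, batch in enumerate(loader):
    #         loss = compute_loss(batch) / accum  # scale so summed grads match
    #         loss.backward()                     # gradients accumulate in .grad
    #         if (step + 1) % accum == 0:
    #             optimizer.step()                # one update per accum micro-batches
    #             optimizer.zero_grad()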

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if n_gpu > 0:  # seed every visible GPU
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()  # RteProcessor
    output_mode = output_modes[task_name]  # "classification"

    label_list = processor.get_labels()  # ["entailment", "not_entailment"]
    num_labels = len(label_list)

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_TRANSFORMERS_CACHE), 'distributed_{}'.format(
            args.local_rank))
    # model = BertForSequenceClassification.from_pretrained(args.bert_model,
    #           cache_dir=cache_dir,
    #           num_labels=num_labels)
    # tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased', num_labels=num_labels)  # two labels
    if args.fp16:
        model.half()
    model.to(device)
    if n_gpu > 1:  # multi-GPU
        model = torch.nn.DataParallel(model)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=args.do_lower_case)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']  # exempt from weight decay
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    # A parameter p lands in the no-decay group if any no_decay substring nd occurs in its name n.
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    if args.do_train:
        num_train_steps = None
        # train_examples = processor.get_train_examples_wenpeng('/home/wyin3/Datasets/glue_data/RTE/train.tsv')
        train_examples, seen_types = processor.get_examples_Wikipedia_train(
            '/home/zut_csi/tomding/zs/BenchmarkingZeroShotData/tokenized_wiki2categories.txt',
            100000)
        # /export/home/Dataset/wikipedia/parsed_output/tokenized_wiki/tokenized_wiki2categories.txt', 100000) #train_pu_half_v1.txt
        # seen_classes=[0,2,4,6,8]
        eval_examples, eval_label_list, eval_hypo_seen_str_indicator, eval_hypo_2_type_index = processor.get_examples_emotion_test(
            '/home/zut_csi/tomding/zs/BenchmarkingZeroShot/emotion/dev.txt',
            seen_types)
        # /export/home/Dataset/Stuttgart_Emotion/unify-emotion-datasets-master/zero-shot-split/dev.txt', seen_types)
        test_examples, test_label_list, test_hypo_seen_str_indicator, test_hypo_2_type_index = processor.get_examples_emotion_test(
            '/home/zut_csi/tomding/zs/BenchmarkingZeroShot/emotion/test.txt',
            seen_types)
        # /export/home/Dataset/Stuttgart_Emotion/unify-emotion-datasets-master/zero-shot-split/test.txt', seen_types)

        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
        eval_features = convert_examples_to_features(
            eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
        test_features = convert_examples_to_features(
            test_examples, label_list, args.max_seq_length, tokenizer, output_mode)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)

        eval_all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        eval_all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        eval_all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        eval_all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

        test_all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
        test_all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
        test_all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
        test_all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        eval_data = TensorDataset(eval_all_input_ids, eval_all_input_mask,
                                  eval_all_segment_ids, eval_all_label_ids)
        test_data = TensorDataset(test_all_input_ids, test_all_input_mask,
                                  test_all_segment_ids, test_all_label_ids)

        train_sampler = RandomSampler(train_data)
        eval_sampler = SequentialSampler(eval_data)
        test_sampler = SequentialSampler(test_data)

        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)
        test_dataloader = DataLoader(test_data, sampler=test_sampler,
                                     batch_size=args.eval_batch_size)

        # NOTE: train_batch_size was already divided by gradient_accumulation_steps above, so this divides by it twice.
        num_train_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_steps = num_train_steps // torch.distributed.get_world_size()  # world size = total number of processes

        max_test_unseen_acc, max_dev_unseen_acc, max_dev_seen_acc, max_overall_acc = 0.0, 0.0, 0.0, 0.0
        logger.info(
            '******************************************************  Running_training  ***************************************************'
        )
        logger.info("Num_examples:{} Batch_size:{} Num_steps:{}".format(
            len(train_examples), args.train_batch_size, num_train_steps))
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            train_loss = 0
            for train_step, batch_data in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                model.train()

                batch_data = tuple(b.to(device) for b in batch_data)
                input_ids, input_mask, segment_ids, label_ids = batch_data
                logits = model(input_ids, segment_ids, input_mask,
                               labels=None)[0]
                tmp_train_loss = CrossEntropyLoss()(logits.view(
                    -1, num_labels), label_ids.view(-1))
                if n_gpu > 1:  # multi-GPU
                    tmp_train_loss = tmp_train_loss.mean(
                    )  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    tmp_train_loss = tmp_train_loss / args.gradient_accumulation_steps
                tmp_train_loss.backward()
                train_loss += tmp_train_loss.item()

                # Step only on accumulation boundaries, so the scaled losses
                # above are actually accumulated (matches distill() above).
                if (train_step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()

                if (train_step + 1) % 200 == 0:  # evaluate on the dev and test sets every 200 steps

                    def et(et_dataloader, max_et_unseen_acc, et_label_list,
                           et_hypo_seen_str_indicator, et_hypo_2_type_index):
                        model.eval()
                        et_loss, et_step, preds = 0, 0, []
                        for input_ids, input_mask, segment_ids, label_ids in et_dataloader:
                            input_ids, input_mask, segment_ids, label_ids = input_ids.to(
                                device), input_mask.to(device), segment_ids.to(
                                    device), label_ids.to(device)
                            with torch.no_grad():
                                logits = model(input_ids,
                                               segment_ids,
                                               input_mask,
                                               labels=None)[0]
                            tmp_et_loss = CrossEntropyLoss()(logits.view(
                                -1, num_labels), label_ids.view(-1))
                            et_loss += tmp_et_loss.mean().item()
                            et_step += 1
                            if len(preds) == 0:
                                # detach() cuts the tensor out of the autograd
                                # graph, so backpropagation stops here; cpu()
                                # copies the data from GPU to host memory (the
                                # counterpart of cuda()).
                                preds.append(logits.detach().cpu().numpy())
                            else:
                                preds[0] = np.append(
                                    preds[0],
                                    logits.detach().cpu().numpy(),
                                    axis=0)
                        et_loss = et_loss / et_step
                        preds = preds[0]
                        '''
                        preds: size*2 (entail, not_entail)
                        wenpeng added a softmax so that each row is a prob vec
                        '''
                        pred_probs = softmax(preds, axis=1)[:, 0]
                        pred_binary_labels_harsh, pred_binary_labels_loose = [], []
                        for i in range(preds.shape[0]):
                            # Label 0 = entail. "Harsh" requires the entail
                            # score to win by a 0.1 margin; "loose" only
                            # requires it to win.
                            pred_binary_labels_harsh.append(
                                0 if preds[i][0] > preds[i][1] + 0.1 else 1)
                            pred_binary_labels_loose.append(
                                0 if preds[i][0] > preds[i][1] else 1)

                        seen_acc, unseen_acc = evaluate_emotion_zeroshot_TwpPhasePred(
                            pred_probs, pred_binary_labels_harsh,
                            pred_binary_labels_loose, et_label_list,
                            et_hypo_seen_str_indicator, et_hypo_2_type_index,
                            seen_types)
                        # result = compute_metrics('F1', preds, all_label_ids.numpy())
                        loss = train_loss / train_step if args.do_train else None
                        # test_acc = mean_f1#result.get("f1")
                        if unseen_acc > max_et_unseen_acc:
                            max_et_unseen_acc = unseen_acc
                        print(
                            'seen_f1:{} unseen_f1:{} max_unseen_f1:{}'.format(
                                seen_acc, unseen_acc, max_et_unseen_acc))
                        return max_et_unseen_acc

                    # if seen_acc+unseen_acc > max_overall_acc:
                    #     max_overall_acc = seen_acc + unseen_acc
                    # if seen_acc > max_dev_seen_acc:
                    #     max_dev_seen_acc = seen_acc

                    logger.info(
                        '*********************  Running evaluation  *********************'
                    )
                    logger.info("Num_examples:{} Batch_size:{}".format(
                        len(eval_examples), args.eval_batch_size))
                    max_dev_unseen_acc = et(eval_dataloader,
                                            max_dev_unseen_acc,
                                            eval_label_list,
                                            eval_hypo_seen_str_indicator,
                                            eval_hypo_2_type_index)
                    logger.info(
                        '*********************    Running testing   *********************'
                    )
                    logger.info("Num_examples:{} Batch_size:{}".format(
                        len(test_examples), args.eval_batch_size))
                    max_test_unseen_acc = et(test_dataloader,
                                             max_test_unseen_acc,
                                             test_label_list,
                                             test_hypo_seen_str_indicator,
                                             test_hypo_2_type_index)