Example #1
def meta_train(args, gold_ratio):
    """ Train the model """
    best_acc = 0.
    best_f1 = 0.
    best_loss_val = 100000
    val_acc_and_f1 = 0.
    best_cm = ""
    fake_acc_and_f1 = 0.
    fake_best_f1 = 0.
    fake_best_acc = 0.
    writer = None
    tokenizer, model = build_model(args)
    g_dataset = load_fake_news(args,
                               tokenizer,
                               evaluate=False,
                               train_path=args.gold_train_path)
    s_dataset = load_fake_news(args,
                               tokenizer,
                               evaluate=False,
                               train_path=args.silver_train_path,
                               is_weak=True,
                               weak_type=args.weak_type)
    val_dataset = load_fake_news(args,
                                 tokenizer,
                                 evaluate=False,
                                 train_path=args.val_path)

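    # Keep an untouched copy of the validation set for the periodic evaluate()
    # calls; val_dataset itself is duplicated below and used to build the
    # meta ("gold") dataloader.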
    eval_dataset = copy.deepcopy(val_dataset)

    # Duplicate each split so the gold, silver, and validation sets are roughly
    # the same size as the largest (weakly labelled) source.
    max_length = max(len(g_dataset), len(s_dataset), len(val_dataset))
    g_dataset = torch.utils.data.ConcatDataset(
        [g_dataset] * int(max_length / len(g_dataset)))
    s_dataset = torch.utils.data.ConcatDataset(
        [s_dataset] * int(max_length / len(s_dataset)))
    val_dataset = torch.utils.data.ConcatDataset(
        [val_dataset] * int(max_length / len(val_dataset)))

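    # Naming note: the "gold" (meta) dataloader below draws from the duplicated
    # validation split, train_dataloader iterates the duplicated gold training
    # split, and the silver dataloader carries the weakly labelled data.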
    g_sampler = RandomSampler(val_dataset)
    g_dataloader = DataLoader(val_dataset,
                              sampler=g_sampler,
                              batch_size=args.g_train_batch_size)

    train_sampler = RandomSampler(g_dataset)
    train_dataloader = DataLoader(g_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.g_train_batch_size)

    s_sampler = RandomSampler(s_dataset)
    s_dataloader = DataLoader(s_dataset,
                              sampler=s_sampler,
                              batch_size=args.s_train_batch_size)

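    # The training loop zips the gold, silver, and train dataloaders, so each
    # epoch runs for the length of the shortest loader; t_total mirrors that.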
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(g_dataloader)) + 1
    else:
        if gold_ratio == 0:
            t_total = min(len(g_dataloader),
                          len(s_dataloader)) * args.num_train_epochs
        else:
            t_total = min(len(g_dataloader), len(train_dataloader),
                          len(s_dataloader)) * args.num_train_epochs

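    # Non-CNN (transformer) classifiers use AdamW with weight-decay grouping and
    # a linear warmup schedule; the CNN baseline falls back to SGD with cosine
    # annealing.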
    if args.clf_model != "cnn":
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay) and p.requires_grad
                ],
                "weight_decay":
                args.weight_decay,
            },
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay) and p.requires_grad
                ],
                "weight_decay":
                0.0
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_steps,
            num_training_steps=t_total)
    else:
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=args.learning_rate,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
        # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, "min")
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, t_total / args.num_train_epochs)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    if args.use_group_weight or args.use_group_net:
        # GroupWeightModel learns one scalar weight per weak-labelling head,
        # whereas FullWeightModel (use_group_net) appears, from its constructor
        # arguments, to also condition on the hidden_size-dim instance
        # representation.
        if args.use_group_weight:
            group_weight = GroupWeightModel(n_groups=args.multi_head)
        else:
            group_weight = FullWeightModel(n_groups=args.multi_head,
                                           hidden_size=args.hidden_size)
        group_weight = group_weight.to(args.device)
        parameters = [i for i in group_weight.parameters() if i.requires_grad]
        if "adam" in args.group_opt.lower():

            if "w" in args.group_opt.lower():
                group_optimizer = AdamW(parameters,
                                        lr=args.group_lr,
                                        eps=args.group_adam_epsilon,
                                        weight_decay=args.group_weight_decay)
            else:
                group_optimizer = torch.optim.Adam(
                    parameters,
                    lr=args.group_lr,
                    eps=args.group_adam_epsilon,
                    weight_decay=args.group_weight_decay)

            group_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                group_optimizer, t_total / args.num_train_epochs)
            # group_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
            #                                                   num_training_steps=t_total)
        elif args.group_opt.lower() == "sgd":
            group_optimizer = torch.optim.SGD(
                parameters,
                lr=args.group_lr,
                momentum=args.group_momentum,
                weight_decay=args.group_weight_decay)
            group_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                group_optimizer, 'min')

        if args.fp16:
            group_weight, group_optimizer = amp.initialize(
                group_weight, group_optimizer, opt_level=args.fp16_opt_level)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num Gold examples = %d, Silver Examples = %d",
                len(val_dataset), len(s_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Gold / silver train batch size = %d / %d",
                args.g_train_batch_size, args.s_train_batch_size)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint

    g_loss, logging_g_loss, logging_s_loss, s_loss = 0.0, 0.0, 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch")
    set_seed(args)  # Added here for reproducibility
    temp_output = open(args.flat_output_file + "_step", "w+", 1)
    for _ in train_iterator:
        be_changed = False
        for step, (g_batch, s_batch, train_batch) in enumerate(
                zip(g_dataloader, s_dataloader, train_dataloader)):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            g_batch = tuple(t.to(args.device) for t in g_batch)
            g_input = {
                "input_ids": g_batch[0],
                "attention_mask": g_batch[1],
                "labels": g_batch[2]
            }

            s_batch = tuple(t.to(args.device) for t in s_batch)
            s_input = {
                "input_ids": s_batch[0],
                "attention_mask": s_batch[1],
                "labels": s_batch[2],
                "reduction": 'none'
            }

            train_batch = tuple(t.to(args.device) for t in train_batch)
            train_input = {
                "input_ids": train_batch[0],
                "attention_mask": train_batch[1],
                "labels": train_batch[2]
            }
            # NOTE: RoBERTa does not use token type ids, so none are passed here.
            if args.multi_head > 1:
                s_input.update({"is_gold": False})

            if (global_step + 1) % args.logging_steps == 0:
                step_input = global_step
            else:
                step_input = None
            info = {"gold_ratio": gold_ratio, "step": step_input}

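            # One learning-to-reweight update. Both step functions consume the
            # gold (meta), silver (weak), and train batches and return the gold
            # and silver losses; the group-net variant additionally returns the
            # per-instance weights it assigned to the silver batch.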
            if args.use_group_net:
                outputs = step_l2w_group_net(model, optimizer, scheduler,
                                             g_input, s_input, train_input,
                                             args, group_weight,
                                             group_optimizer, group_scheduler,
                                             gold_ratio)

                loss_g, loss_s, instance_weight = outputs
            else:
                outputs = step_l2w(model, optimizer, scheduler, g_input,
                                   s_input, train_input, args, gold_ratio)
                loss_g, loss_s = outputs

            g_loss += loss_g.item()
            s_loss += loss_s.item()
            global_step += 1

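            # Periodic logging/evaluation: track the best validation acc/F1,
            # re-run the test evaluation whenever it improves, and optionally
            # checkpoint the base model together with the weighting network.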
            if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                logs = {}
                results = {}
                # NOTE: the trailing "or True" makes evaluation run at every
                # logging step regardless of args.evaluate_during_training.
                if args.evaluate_during_training or True:

                    results = evaluate(args,
                                       model,
                                       tokenizer,
                                       gold_ratio,
                                       eval_dataset=eval_dataset)
                    results = {
                        key + "_val": value
                        for key, value in results.items()
                    }
                    results.update({"type": "val"})
                    print(json.dumps(results))
                    if val_acc_and_f1 < results['acc_and_f1_val']:
                        be_changed = True
                        best_loss_val = results['loss_val']
                        val_acc_and_f1 = results['acc_and_f1_val']
                        test_results = evaluate(args, model, tokenizer,
                                                gold_ratio)
                        best_acc = test_results['acc']
                        best_f1 = test_results['f1']
                        best_cm = test_results['c_m']
                        best_acc_and_f1 = test_results["acc_and_f1"]
                        temp_output.write(
                            "Step: {}, Test F1: {}, Test ACC: {}; Val Acc_and_F1: {}, Val Loss: {}\n"
                            .format(global_step, best_f1, best_acc,
                                    val_acc_and_f1, best_loss_val))
                        temp_output.flush()
                        # save the model
                        if args.save_model:
                            save_path = args.flat_output_file + "_save_model"
                            save_dic = {
                                "BaseModel": model,
                                "LWN": group_weight,
                                "step": global_step,
                                "tokenizer": tokenizer
                            }
                            torch.save(save_dic, save_path)

                        test_results = {
                            key + "_test": value
                            for key, value in test_results.items()
                        }
                        test_results.update({"type": "test"})
                        print(json.dumps(test_results))
                    for key, value in results.items():
                        eval_key = "eval_{}".format(key)
                        logs[eval_key] = value

                loss_scalar = (g_loss - logging_g_loss) / args.logging_steps
                learning_rate_scalar = optimizer.defaults.get("lr", 0)
                logs["train_learning_rate"] = learning_rate_scalar
                logs["train_g_loss"] = loss_scalar
                logs["train_s_loss"] = (s_loss -
                                        logging_s_loss) / args.logging_steps
                logging_g_loss = g_loss
                logging_s_loss = s_loss

                # writer.add_scalar("Loss/g_train_{}".format(gold_ratio), logs['train_g_loss'], global_step)
                # writer.add_scalar("Loss/s_train_{}".format(gold_ratio), logs['train_s_loss'], global_step)
                # writer.add_scalar("Loss/val_train_{}".format(gold_ratio), results['loss_val'], global_step)

                if args.use_group_weight:
                    try:
                        eta_group = group_optimizer.get_lr()
                    except AttributeError:
                        # Plain torch optimizers do not expose get_lr().
                        eta_group = group_optimizer.defaults.get("lr", 0)
                    # writer.add_scalar("Loss/group_lr_{}".format(gold_ratio), eta_group, global_step)

                print(json.dumps({**{"step": global_step}, **logs}))

            if args.max_steps > 0 and global_step > args.max_steps:
                break
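        # Restart the cosine annealing schedules at the end of every epoch; note
        # that this also replaces the scheduler created for the main model above.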
        if (args.use_group_net or args.use_group_weight) and isinstance(
                group_scheduler, torch.optim.lr_scheduler.CosineAnnealingLR):
            group_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                group_optimizer, t_total / args.num_train_epochs)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, t_total / args.num_train_epochs)

        print("EPOCH Finish")
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break
    temp_output.close()
    # return cache_instance_weight
    return global_step, g_loss / global_step, (best_f1, best_acc, best_cm)
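
# Hypothetical usage of meta_train, assuming `args` supplies the data paths and
# training flags referenced above (gold_train_path, silver_train_path, etc.):
#     global_step, avg_gold_loss, (f1, acc, cm) = meta_train(args, gold_ratio=0.1)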


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name',
                        type=str,
                        default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)

    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load the tokenizer and model, then register the task-specific special
    # tokens; their newly added embeddings will be fine-tuned on the ROCStories
    # dataset. (The token-registration calls below assume the `transformers`
    # API, consistent with AdamW / get_linear_schedule_with_warmup used above.)
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name)
    tokenizer.add_tokens(special_tokens)
    special_tokens_ids = [
        tokenizer.convert_tokens_to_ids(token) for token in special_tokens
    ]
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name)
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)

    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)
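    # Illustrative behaviour (hypothetical input): tokenize_and_encode(("a story", 3))
    # returns [[...token ids for "a story"...], 3], recursing through nested containers.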

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    input_length = max(
        len(story[:max_length]) +
        max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
        for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)
    # Max size of input for the pre-trained model
    input_length = min(input_length, model.config.n_positions)

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length,
                                           max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[
        0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        num_train_optimization_steps = len(
            train_dataloader) * args.num_train_epochs
        # The transformers AdamW does not handle warmup, gradient clipping or
        # total-step scheduling itself, so a linear warmup schedule is created
        # here and gradients are clipped in the training loop below.
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          weight_decay=args.weight_decay)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(args.warmup_proportion *
                                 num_train_optimization_steps),
            num_training_steps=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                # Keyword arguments avoid relying on a particular positional
                # order of the double-heads forward signature.
                outputs = model(input_ids,
                                mc_token_ids=mc_token_ids,
                                lm_labels=lm_labels,
                                mc_labels=mc_labels)
                # outputs[0] is the LM loss, outputs[1] the multiple-choice loss.
                loss = args.lm_coef * outputs[0] + outputs[1]
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = (loss.item() if exp_average_loss is None else
                                    0.7 * exp_average_loss + 0.3 * loss.item())
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss,
                    scheduler.get_last_lr()[0])

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir)
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                # Index the outputs rather than tuple-unpacking so the code does
                # not depend on how many extra tensors the model returns.
                loss_outputs = model(input_ids,
                                     mc_token_ids=mc_token_ids,
                                     lm_labels=lm_labels,
                                     mc_labels=mc_labels)
                mc_loss = loss_outputs[1]
                logit_outputs = model(input_ids, mc_token_ids=mc_token_ids)
                mc_logits = logit_outputs[1]

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'train_loss': train_loss
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
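

if __name__ == "__main__":
    main()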