# Example #1
def main(args):
    """Fine-tune BERT with NSP-based data augmentation on a GLUE-style task.

    The training set is re-augmented at the start of every epoch (with a
    growing ``aug_ratio``), the model is trained on the augmented data, and
    after each epoch the dev set may be evaluated; the best checkpoint (by
    acc/mcc/corr) is saved under ``args.output_dir``.  Test evaluation runs
    each epoch when ``args.do_test`` is set.

    Args:
        args: parsed command-line namespace carrying the usual fine-tuning
            hyper-parameters (data_dir, output_dir, task_name, batch sizes,
            learning rate, aug_loss_weight, ...).
    """
    args.data_dir = os.path.join(args.data_dir, args.task_name)
    args.output_dir = os.path.join(args.output_dir, args.task_name)
    logger.info("args = %s", args)

    # Task name -> processor that reads the task's train/dev/test files.
    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "sst-2": Sst2Processor,
        "sts-b": StsbProcessor,
        "qqp": QqpProcessor,
        "qnli": QnliProcessor,
        "rte": RteProcessor,
        "wnli": WnliProcessor,
        "emo": EmoProcessor,
    }

    # Task name -> head type; STS-B is the only regression task.
    output_modes = {
        "cola": "classification",
        "mnli": "classification",
        "mrpc": "classification",
        "sst-2": "classification",
        "sts-b": "regression",
        "qqp": "classification",
        "qnli": "classification",
        "rte": "classification",
        "wnli": "classification",
        "emo": "classification"
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs.
        torch.distributed.init_process_group(backend='nccl')

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        logger.info("Output directory already exists and is not empty.")
    if not os.path.exists(args.output_dir):
        try:
            os.makedirs(args.output_dir)
        except OSError:
            # BUGFIX: was a bare `except: pass` that swallowed every
            # exception (including KeyboardInterrupt).  A concurrent worker
            # may have created the directory already, which is harmless.
            logger.info("could not create %s (may already exist)",
                        args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model)

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(
            args.local_rank))

    # use bert to aug train_examples
    ori_train_examples = processor.get_train_examples(args.data_dir)
    eval_examples = processor.get_dev_examples(args.data_dir)
    test_examples = processor.get_test_examples(args.data_dir)

    num_train_optimization_steps = int(
        len(ori_train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps) * args.num_train_epochs

    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
        )

    if args.use_saved == 1:
        # Resume from a previously saved fine-tuned checkpoint.
        bert_saved_dir = args.ckpt
        model = BertForNSPAug.from_pretrained(bert_saved_dir,
                                              cache_dir=args.ckpt_cache_dir,
                                              num_labels=num_labels,
                                              args=args)
    else:
        model = BertForNSPAug.from_pretrained(args.bert_model,
                                              cache_dir=cache_dir,
                                              num_labels=num_labels,
                                              args=args)
    # BUGFIX: was `model.cuda()`, which crashes on CPU-only hosts even though
    # `device` correctly falls back to "cpu" above.
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.do_train:
        # Prepare optimizer: no weight decay on biases and LayerNorm params.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

        global_step = 0
        best_val_acc = 0.0
        first_time = time.time()

        logger.info(
            "*********************************** Running training ***********************************"
        )
        logger.info("  Num original examples = %d", len(ori_train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        model.train()
        aug_ratio = 0.0
        aug_seed = np.random.randint(0, 1000)
        for epoch in range(int(args.num_train_epochs)):
            logger.info("epoch=%d,  aug_ratio = %f,  aug_seed=%d", epoch,
                        aug_ratio, aug_seed)
            # Re-generate the augmented training set for this epoch; the
            # augmentation fraction grows by aug_ratio_each per epoch.
            train_examples = Aug_each_ckpt(ori_train_examples,
                                           label_list,
                                           model,
                                           tokenizer,
                                           args=args,
                                           num_show=args.num_show,
                                           output_mode=output_mode,
                                           seed=aug_seed,
                                           aug_ratio=aug_ratio,
                                           use_bert=False)
            if aug_ratio + args.aug_ratio_each < 1.0:
                aug_ratio += args.aug_ratio_each
            aug_seed += 1

            train_features = convert_examples_to_features(
                train_examples,
                label_list,
                args.max_seq_length,
                tokenizer,
                num_show=args.num_show,
                output_mode=output_mode,
                args=args)
            logger.info(
                "*********************************** Done convert features ***********************************"
            )
            all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                         dtype=torch.long)
            all_input_mask = torch.tensor(
                [f.input_mask for f in train_features], dtype=torch.long)
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in train_features], dtype=torch.long)
            if output_mode == "classification":
                all_label_ids = torch.tensor(
                    [f.label_id for f in train_features], dtype=torch.long)
            elif output_mode == "regression":
                all_label_ids = torch.tensor(
                    [f.label_id for f in train_features], dtype=torch.float)

            token_real_label = torch.tensor(
                [f.token_real_label for f in train_features], dtype=torch.long)
            train_data = TensorDataset(all_input_ids, all_input_mask,
                                       all_segment_ids, all_label_ids,
                                       token_real_label)
            if args.local_rank == -1:
                train_sampler = RandomSampler(train_data)
            else:
                train_sampler = DistributedSampler(train_data)
            train_dataloader = DataLoader(train_data,
                                          sampler=train_sampler,
                                          batch_size=args.train_batch_size)

            logger.info(
                "*********************************** begin training ***********************************"
            )
            tr_loss, tr_seq_loss, tr_aug_loss, train_seq_accuracy, train_aug_accuracy = 0, 0, 0, 0, 0
            nb_tr_examples, nb_tr_steps, nb_tr_tokens = 0, 0, 0
            preds = []
            all_labels = []
            for step, batch in enumerate(train_dataloader):
                # BUGFIX: was `t.cuda()`; honor the device selected above so
                # CPU runs work.
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, token_real_label = batch
                seq_logits, aug_logits, aug_loss = model(
                    input_ids,
                    segment_ids,
                    input_mask,
                    labels=None,
                    token_real_label=token_real_label)
                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    seq_loss = loss_fct(seq_logits.view(-1, num_labels),
                                        label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    seq_loss = loss_fct(seq_logits.view(-1),
                                        label_ids.view(-1))

                token_real_label = token_real_label.detach().cpu().numpy()

                # Blend the sequence-classification loss with the
                # augmentation (token-level) loss.
                w = args.aug_loss_weight
                loss = (1 - w) * seq_loss + w * aug_loss

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()

                # Effectively a no-op clip (threshold 10000); kept to report
                # the gradient norm in the log line below.
                total_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), 10000.0)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                batch_loss = seq_loss.mean().item()
                tr_seq_loss += seq_loss.mean().item()
                seq_logits = seq_logits.detach().cpu().numpy()
                label_ids = label_ids.detach().cpu().numpy()
                if len(preds) == 0:
                    preds.append(seq_logits)
                    all_labels.append(label_ids)
                else:
                    preds[0] = np.append(preds[0], seq_logits, axis=0)
                    all_labels[0] = np.append(all_labels[0], label_ids, axis=0)

                aug_logits = aug_logits.detach().cpu().numpy()
                tmp_train_aug_accuracy, tmp_tokens = accuracy(aug_logits,
                                                              token_real_label,
                                                              type="aug")
                train_aug_accuracy += tmp_train_aug_accuracy
                nb_tr_tokens += tmp_tokens
                tr_aug_loss += aug_loss.mean().item()

                # Periodic progress logging (running averages since epoch
                # start, not per-batch values).
                if global_step % 20 == 0:
                    loss = tr_loss / nb_tr_steps
                    seq_loss = tr_seq_loss / nb_tr_steps
                    aug_loss = tr_aug_loss / nb_tr_steps
                    tmp_pred = preds[0]
                    tmp_labels = all_labels[0]
                    if output_mode == "classification":
                        tmp_pred = np.argmax(tmp_pred, axis=1)
                    elif output_mode == "regression":
                        tmp_pred = np.squeeze(tmp_pred)
                    res = accuracy(tmp_pred, tmp_labels, task_name=task_name)

                    if nb_tr_tokens != 0:
                        aug_avg = train_aug_accuracy / nb_tr_tokens
                    else:
                        aug_avg = 0.0
                    log_string = ""
                    log_string += "epoch={:<5d}".format(epoch)
                    log_string += " step={:<9d}".format(global_step)
                    log_string += " total_loss={:<9.7f}".format(loss)
                    log_string += " seq_loss={:<9.7f}".format(seq_loss)
                    log_string += " aug_loss={:<9.7f}".format(aug_loss)
                    log_string += " batch_loss={:<9.7f}".format(batch_loss)
                    log_string += " lr={:<9.7f}".format(optimizer.get_lr()[0])
                    log_string += " |g|={:<9.7f}".format(total_norm)
                    log_string += " tr_aug_acc={:<9.7f}".format(aug_avg)
                    log_string += " mins={:<9.2f}".format(
                        float(time.time() - first_time) / 60)
                    for key in sorted(res.keys()):
                        log_string += "  " + key + "= " + str(res[key])
                    logger.info(log_string)

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

            train_loss = tr_loss / nb_tr_steps

            logger.info(
                "*********************************** training epoch done ***********************************"
            )

            if args.do_eval and (args.local_rank == -1
                                 or torch.distributed.get_rank()
                                 == 0) and epoch % 1 == 0:
                tot_time = float(time.time() - first_time) / 60
                eval_loss, eval_seq_loss, eval_aug_loss, eval_res, eval_aug_accuracy, res_parts=\
                 do_evaluate(args, processor, label_list, tokenizer, model, epoch, output_mode, num_labels, task_name, eval_examples, type="dev")

                eval_res["tot_time"] = tot_time
                # Headline metric used for model selection: accuracy when
                # present, else Matthews corr (CoLA), else Pearson/Spearman
                # corr (STS-B).
                if "acc" in eval_res:
                    tmp_acc = eval_res["acc"]
                elif "mcc" in eval_res:
                    tmp_acc = eval_res["mcc"]
                else:
                    tmp_acc = eval_res["corr"]

                result = {
                    'eval_total_loss': eval_loss,
                    'eval_seq_loss': eval_seq_loss,
                    'eval_aug_loss': eval_aug_loss,
                    'eval_aug_accuracy': eval_aug_accuracy,
                    'global_step': global_step,
                    'train_loss': train_loss,
                    'train_batch_size': args.train_batch_size,
                    'args': args
                }

                if tmp_acc >= best_val_acc:
                    best_val_acc = tmp_acc
                    dev_test = "dev"
                    result.update({'best_epoch': epoch})

                    model_to_save = model.module if hasattr(
                        model,
                        'module') else model  # Only save the model it-self
                    output_model_dir = os.path.join(args.output_dir,
                                                    "dev_" + str(tmp_acc))
                    if not os.path.exists(output_model_dir):
                        os.makedirs(output_model_dir)
                    output_model_file = os.path.join(output_model_dir,
                                                     WEIGHTS_NAME)
                    torch.save(model_to_save.state_dict(), output_model_file)
                    output_config_file = os.path.join(output_model_dir,
                                                      CONFIG_NAME)
                    with open(output_config_file, 'w') as f:
                        f.write(model_to_save.config.to_json_string())

                result.update(eval_res)
                result.update(res_parts)

                logger.info(
                    "****************************** eval results ***********************************"
                )
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
            else:
                # BUGFIX: this branch previously rebuilt `result` from
                # eval_loss / eval_seq_loss / eval_res / res_parts, which are
                # only bound when the evaluation branch above runs -- taking
                # this path raised NameError.  Only training-side statistics
                # exist here.
                result = {
                    'global_step': global_step,
                    'train_loss': train_loss,
                    'train_batch_size': args.train_batch_size,
                    'args': args
                }
                logger.info(
                    "****************************** eval results ***********************************"
                )
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))

            # write test results
            if args.do_test:
                logger.info(
                    "*********************************** Running test ***********************************"
                )
                logger.info("  Num examples = %d", len(test_examples))
                logger.info("  Batch size = %d", args.eval_batch_size)

                test_loss, test_seq_loss, test_aug_loss, test_res, test_aug_accuracy, res_parts=\
                 do_evaluate(args, processor, label_list, tokenizer, model, epoch, output_mode, num_labels, task_name, test_examples, type="test")
                result = {
                    'test_total_loss': test_loss,
                    'test_seq_loss': test_seq_loss,
                    'test_aug_loss': test_aug_loss,
                    'test_aug_accuracy': test_aug_accuracy,
                    'global_step': global_step,
                    'args': args
                }
                result.update(test_res)
                result.update(res_parts)

                logger.info(
                    "****************************** test results ***********************************"
                )
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))

                logger.info(
                    "*********************************** test done ***********************************"
                )
def main():
    """Fine-tune BERT on SQuAD: parse CLI args, optionally train, predict.

    When ``--do_train`` is set the model is trained and its weights saved to
    ``--output_dir``; when ``--do_predict`` is set (rank 0 only under
    distributed training), n-best span predictions for ``--predict_file`` are
    written as JSON files in the output directory.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--bert_model",
        default='pretrained/bert-base-uncased',
        type=str,
        required=False,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument(
        "--output_dir",
        default='tasks/QuestionAnswering/squad_output',
        type=str,
        required=False,
        help=
        "The output directory where the model checkpoints and predictions will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--train_file",
        default='tasks/QuestionAnswering/squad_data/train-v1.1.json',
        type=str,
        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument(
        "--predict_file",
        default='tasks/QuestionAnswering/squad_data/dev-v1.1.json',
        type=str,
        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json"
    )
    parser.add_argument("--vocab_size",
                        default=30522,
                        type=int,
                        help="The size of vocabulary.")
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.")
    parser.add_argument(
        "--do_train", default=1, type=int,
        help="Whether to run training.")  # , action='store_true'
    parser.add_argument(
        "--do_predict",
        default=1,
        type=int,
        help="Whether to run eval on the dev set.")  # , action='store_true'
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=2.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    # BUGFIX: `args.num_train_steps` is read below but was never registered
    # with the parser, so it raised AttributeError.  When omitted it is
    # derived from the training-set size after the data is loaded.
    parser.add_argument(
        "--num_train_steps",
        default=None,
        type=int,
        help="Total number of optimization steps. Derived from the training "
        "set size and number of epochs when omitted.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        # BUGFIX: argparse applies %-formatting to help strings, so a lone
        # "%" must be escaped as "%%" or `--help` raises.
        "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
        "of training.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json "
        "output file.")
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument(
        "--verbose_logging",
        action='store_true',
        help=
        "If true, all of the warnings related to squad_data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=83,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        "--do_lower_case",
        default=1,
        type=int,
        # action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs.
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}".format(
        device, n_gpu, bool(args.local_rank != -1)))

    args.train_batch_size = int(args.train_batch_size)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        # BUGFIX: the message had an empty "()" placeholder; include the
        # offending path so the error is actionable.
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model)

    train_data, dev_data = load_dataset(args)

    # Prepare model
    config = json.load(open(os.path.join(args.bert_model, BERT_CONFIG), "r"))
    model = BertQA(args.vocab_size, **config)
    model.load(os.path.join(args.bert_model, MODEL_NAME))

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove pooler, which is not used
    # thus it produce None grad that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    # No weight decay on biases and LayerNorm parameters.
    no_decay = ['bias', 'LayerNorm.a_2', 'LayerNorm.b_2']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    t_total = args.num_train_steps
    if t_total is None:
        # One optimizer step per batch below, so steps ~= batches per epoch
        # times epochs.  NOTE(review): assumes load_dataset returns a sized
        # dataset (len() works) -- confirm against load_dataset.
        t_total = int(
            len(train_data) / args.train_batch_size * args.num_train_epochs)
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)

    global_step = 0
    if args.do_train:
        criterion = nn.CrossEntropyLoss()

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(
                    t.to(device)
                    for t in batch)  # multi-gpu does scattering it-self
                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
                logits = model(x=input_ids,
                               segment_info=segment_ids,
                               mask=input_mask)
                logits_start = logits['pred_start']
                logits_end = logits['pred_end']

                # Clamp out-of-span answer positions onto the ignored index.
                ignored_index = logits_start.size(1)
                start_positions.clamp_(0, ignored_index)
                end_positions.clamp_(0, ignored_index)

                loss = (criterion(logits_start, start_positions) +
                        criterion(logits_end, end_positions)) / 2

                if n_gpu > 1:
                    loss = loss.mean()

                loss.backward()
                # `% 1` keeps the original behavior (step every batch); the
                # modulus is a placeholder for gradient accumulation.
                if (step + 1) % 1 == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    # Save a trained model
    model_to_save = model.module if hasattr(
        model, 'module') else model  # Only save the model it-self
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    torch.save(model_to_save.state_dict(), output_model_file)

    # Load a trained model that you have fine-tuned
    model.load(output_model_file)
    model.to(device)

    if args.do_predict and (args.local_rank == -1
                            or torch.distributed.get_rank() == 0):
        eval_examples = read_squad_examples(input_file=args.predict_file,
                                            is_training=False)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        # Run prediction for full squad_data
        eval_sampler = SequentialSampler(dev_data)
        eval_dataloader = DataLoader(dev_data,
                                     sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(
                eval_dataloader, desc="Evaluating"):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                logits = model(x=input_ids,
                               segment_info=segment_ids,
                               mask=input_mask)
                batch_start_logits = logits['pred_start']
                batch_end_logits = logits['pred_end']

            # NOTE(review): example_indices are assumed to index into
            # eval_features in the same order dev_data was built -- confirm
            # against load_dataset.
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(
                    RawResult(unique_id=unique_id,
                              start_logits=start_logits,
                              end_logits=end_logits))
        output_prediction_file = os.path.join(args.output_dir,
                                              "predictions.json")
        output_nbest_file = os.path.join(args.output_dir,
                                         "nbest_predictions.json")
        write_predictions(eval_examples, eval_features, all_results,
                          args.n_best_size, args.max_answer_length,
                          args.do_lower_case, output_prediction_file,
                          output_nbest_file, args.verbose_logging)
def main():
    """Fine-tune a BertMC multiple-choice model on SWAG and evaluate it.

    Workflow: parse CLI options, set up the (optionally distributed) device
    and random seeds, load train/dev sets via ``load_dataset``, train with
    BertAdam plus manual linear LR warmup, save the fine-tuned weights to
    ``<output_dir>/pytorch_model.bin``, and (with ``--do_eval``) write eval
    loss/accuracy to ``<output_dir>/eval_results.txt``.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default="../bert_pytorch/tasks/MultipleChoice/swag_data/",
        type=str,
        required=False,
        help=
        "The input squad_data dir. Should contain the .csv files (or other squad_data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default='converted/base-uncased',
        type=str,
        required=False,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument(
        "--output_dir",
        default='tasks/MultipleChoice/swag_output/',
        type=str,
        required=False,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=80,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--vocab_size",
                        default=30522,
                        type=int,
                        help="The size of vocabulary.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=4,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    # BUGFIX: the body below reads ``args.num_train_steps`` but the option
    # was never declared, so every run crashed with AttributeError.  Declare
    # it; when omitted (None) the total is derived from the dataset size.
    parser.add_argument(
        '--num_train_steps',
        type=int,
        default=None,
        help="Total number of optimization steps. If omitted, it is derived "
        "from the training set size, batch size and number of epochs.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=4,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    ###### config setting ######

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # The effective per-step batch is the requested batch split across the
    # accumulation steps.
    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    ###### fastNLP.DataSet loading ######

    train_data, dev_data = load_dataset(args)

    ###### model initializing ######

    # Use a context manager so the config file handle is not leaked.
    with open(os.path.join(args.bert_model, BERT_CONFIG), "r") as f:
        config = json.load(f)
    model = BertMC(args.vocab_size, num_choices=4, **config)
    model.load(os.path.join(args.bert_model, MODEL_NAME))
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    ###### optimizer initializing ######

    param_optimizer = list(model.named_parameters())
    # No weight decay for biases and LayerNorm parameters.
    no_decay = ['bias', 'LayerNorm.a_2', 'LayerNorm.b_2']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    # BUGFIX: derive the schedule length when --num_train_steps is not given
    # (the original unconditionally read the missing attribute).
    t_total = args.num_train_steps
    if t_total is None:
        t_total = int(
            len(train_data) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)

    global_step = 0
    # BUGFIX: define the loss and running counters unconditionally so the
    # eval branch below works in eval-only runs (previously ``criterion``
    # raised NameError when --do_train was not set).
    criterion = nn.CrossEntropyLoss()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    if args.do_train:
        train_dataloader = DataLoader(train_data,
                                      sampler=RandomSampler(train_data),
                                      batch_size=args.train_batch_size)
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):

            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                logits = model(x=input_ids,
                               segment_info=segment_ids,
                               mask=input_mask)['pred']

                loss = criterion(logits, label_ids)
                if n_gpu > 1:
                    # DataParallel returns one loss per GPU; average them.
                    loss = loss.mean()
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                loss.backward()

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    # Save a trained model
    model_to_save = model.module if hasattr(
        model, 'module') else model  # Only save the model it-self
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    if args.do_train:
        torch.save(model_to_save.state_dict(), output_model_file)

    # Reload the fine-tuned weights when a checkpoint exists.  BUGFIX: the
    # original loaded the file unconditionally, crashing eval-only runs in
    # which nothing had been saved yet.
    if os.path.exists(output_model_file):
        model.load(output_model_file)
        model.to(device)

    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):

        eval_dataloader = DataLoader(dev_data,
                                     sampler=SequentialSampler(dev_data),
                                     batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model(x=input_ids,
                               segment_info=segment_ids,
                               mask=input_mask)['pred']
                tmp_eval_loss = criterion(logits, label_ids)
                if n_gpu > 1:
                    tmp_eval_loss = tmp_eval_loss.mean()

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        # BUGFIX: also guard against an empty training loop (nb_tr_steps==0).
        loss = tr_loss / nb_tr_steps if args.do_train and nb_tr_steps else None
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'global_step': global_step,
            'loss': loss
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
# Example #4 (score: 0)
def main():
    """Train/evaluate a BERT n-gram classifier that generates corrected text.

    With ``--do_train``: load word vectors, convert the training examples to
    n-gram features, fine-tune ``BertForNgramClassification`` and save one
    checkpoint per epoch.  With ``--do_eval``: for each checkpoint, predict
    embeddings for the flawed token positions, map them back to words via a
    nearest-neighbour index, and write the corrected sentences to
    ``<data_dir>/epoch<N>gnrt_outputs.tsv``.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument("--word_embedding_file",
                        default='emb/crawl-300d-2M.vec',
                        type=str,
                        help="The input directory of word embeddings.")
    parser.add_argument("--index_path",
                        default='emb/p_index.bin',
                        type=str,
                        help="The input directory of word embedding index.")
    parser.add_argument("--word_embedding_info",
                        default='emb/vocab_info.txt',
                        type=str,
                        help="The input directory of word embedding info.")
    parser.add_argument("--data_file",
                        default='',
                        type=str,
                        help="The input directory of input data file.")

    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--max_ngram_length",
                        default=16,
                        type=int,
                        help="The maximum total ngram sequence")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--embedding_size",
                        default=300,
                        type=int,
                        help="Dimensionality of the word embeddings.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    # DOC FIX: the help texts of --num_eval_epochs and --single were
    # copy-paste duplicates of unrelated options.
    parser.add_argument(
        '--num_eval_epochs',
        type=int,
        default=0,
        help="Number of saved epoch checkpoints to evaluate.")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--single',
        action='store_true',
        help="Evaluate only the single checkpoint selected by "
        "--num_eval_epochs instead of all checkpoints up to it.")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    logger.info("loading embeddings ... ")
    if args.do_train:
        # Training needs the raw vectors; also persist the vocab info so
        # eval-only runs can restore it without reloading the vectors.
        emb_dict, emb_vec, vocab_list, emb_vocab_size = load_vectors(
            args.word_embedding_file)
        write_vocab_info(args.word_embedding_info, emb_vocab_size, vocab_list)
    if args.do_eval:
        emb_vocab_size, vocab_list = load_vocab_info(args.word_embedding_info)
        #emb_dict, emb_vec, vocab_list, emb_vocab_size = load_vectors(args.word_embedding_file)
        #write_vocab_info(args.word_embedding_info, emb_vocab_size, vocab_list)
    logger.info("loading p index ...")
    if not os.path.exists(args.index_path):
        # BUGFIX: building the index needs ``emb_vec``, which is only loaded
        # in the --do_train branch above; previously an eval-only run with a
        # missing index died with a bare NameError.
        if not args.do_train:
            raise ValueError(
                "Embedding index %s not found; run with --do_train first or "
                "provide a prebuilt index." % args.index_path)
        p = load_embeddings_and_save_index(range(emb_vocab_size), emb_vec,
                                           args.index_path)
    else:
        p = load_embedding_index(args.index_path,
                                 emb_vocab_size,
                                 num_dim=args.embedding_size)

    train_examples = None
    num_train_optimization_steps = None
    w2i, i2w, vocab_size = {}, {}, 1
    if args.do_train:

        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

        train_features, w2i, i2w, vocab_size = convert_examples_to_features_gnrt_train(\
            train_examples, label_list, args.max_seq_length, args.max_ngram_length, tokenizer, emb_dict)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Num token vocab = %d", vocab_size)
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_ngram_ids = torch.tensor([f.ngram_ids for f in train_features],
                                     dtype=torch.long)
        all_ngram_labels = torch.tensor(
            [f.ngram_labels for f in train_features], dtype=torch.long)
        all_ngram_masks = torch.tensor([f.ngram_masks for f in train_features],
                                       dtype=torch.long)
        all_ngram_embeddings = torch.tensor(
            [f.ngram_embeddings for f in train_features], dtype=torch.float)

        # Prepare model
        cache_dir = args.cache_dir if args.cache_dir else os.path.join(
            PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(
                args.local_rank))
        model = BertForNgramClassification.from_pretrained(
            args.bert_model,
            cache_dir=cache_dir,
            num_labels=num_labels,
            embedding_size=args.embedding_size,
            max_seq_length=args.max_seq_length,
            max_ngram_length=args.max_ngram_length)
        model.to(device)
        if args.local_rank != -1:
            try:
                from apex.parallel import DistributedDataParallel as DDP
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            model = DDP(model)
        elif n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # Prepare optimizer: no weight decay for biases / LayerNorm params.
        param_optimizer = list(model.named_parameters())

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

        global_step = 0
        nb_tr_steps = 0
        tr_loss = 0

        train_data = TensorDataset(all_ngram_ids, all_ngram_labels,
                                   all_ngram_masks, all_ngram_embeddings)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for ind in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0

            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                ngram_ids, ngram_labels, ngram_masks, ngram_embeddings = batch
                # The model returns the loss directly in training mode.
                loss = model(ngram_ids, ngram_masks, ngram_embeddings)
                if n_gpu > 1:
                    # DataParallel returns one loss per GPU; average them.
                    loss = loss.mean()

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()
                tr_loss += loss.item()

                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:

                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

            # BUGFIX: guard against an empty dataloader (nb_tr_steps == 0)
            # instead of re-testing args.do_train, which is always True here.
            loss = tr_loss / nb_tr_steps if nb_tr_steps else None
            result = {
                'loss': loss,
            }

            output_eval_file = os.path.join(args.output_dir,
                                            "train_results.txt")
            with open(output_eval_file, "a") as writer:
                writer.write("epoch" + str(ind) + '\n')
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
                writer.write('\n')

            # Save one checkpoint per epoch so each can be evaluated later.
            model_to_save = model.module if hasattr(model, 'module') else model
            output_model_file = os.path.join(args.output_dir,
                                             "epoch" + str(ind) + WEIGHTS_NAME)
            torch.save(model_to_save.state_dict(), output_model_file)
            output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
            with open(output_config_file, 'w') as f:
                f.write(model_to_save.config.to_json_string())

    # Load a trained model and config that you have fine-tuned
    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):

        eval_examples = processor.get_gnrt_dev_examples(args.data_file)
        eval_features, w2i, i2w, vocab_size = convert_examples_to_features_gnrt_eval(
            eval_examples, label_list, args.max_seq_length,
            args.max_ngram_length, tokenizer, w2i, i2w, vocab_size)

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Num token vocab = %d", vocab_size)
        logger.info("  Batch size = %d", args.eval_batch_size)

        all_token_ids = torch.tensor([f.token_ids for f in eval_features],
                                     dtype=torch.long)
        # all_flaw_labels: indexes of wrong words predicted by disc
        all_flaw_labels = torch.tensor([f.flaw_labels for f in eval_features],
                                       dtype=torch.long)
        all_ngram_ids = torch.tensor([f.ngram_ids for f in eval_features],
                                     dtype=torch.long)
        all_ngram_mask = torch.tensor([f.ngram_mask for f in eval_features],
                                      dtype=torch.long)
        all_ngram_labels = torch.tensor(
            [f.ngram_labels for f in eval_features], dtype=torch.long)
        all_label_id = torch.tensor([f.label_id for f in eval_features],
                                    dtype=torch.long)

        eval_data = TensorDataset(all_token_ids, all_ngram_ids, all_ngram_mask,
                                  all_ngram_labels, all_label_id,
                                  all_flaw_labels)

        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        if args.single:
            # Evaluate only the checkpoint of epoch --num_eval_epochs.
            eval_range = trange(int(args.num_eval_epochs),
                                int(args.num_eval_epochs + 1),
                                desc="Epoch")
        else:
            eval_range = trange(int(args.num_eval_epochs), desc="Epoch")

        for epoch in eval_range:

            # (Re)create the output file with a header for this epoch.
            output_file = os.path.join(
                args.data_dir, "epoch" + str(epoch) + "gnrt_outputs.tsv")
            with open(output_file, "w") as csv_file:
                writer = csv.writer(csv_file, delimiter='\t')
                writer.writerow(["sentence", "label"])

            output_model_file = os.path.join(
                args.output_dir, "epoch" + str(epoch) + WEIGHTS_NAME)
            output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
            config = BertConfig(output_config_file)
            model = BertForNgramClassification(
                config,
                num_labels=num_labels,
                embedding_size=args.embedding_size,
                max_seq_length=args.max_seq_length,
                max_ngram_length=args.max_ngram_length)
            model.load_state_dict(torch.load(output_model_file))
            model.to(device)
            model.eval()

            for token_ids, ngram_ids, ngram_mask, ngram_labels, label_id, flaw_labels in tqdm(
                    eval_dataloader, desc="Evaluating"):

                ngram_ids = ngram_ids.to(device)
                ngram_mask = ngram_mask.to(device)

                with torch.no_grad():
                    logits = model(ngram_ids, ngram_mask)

                logits = logits.detach().cpu().numpy()
                flaw_labels = flaw_labels.to('cpu').numpy()
                label_id = label_id.to('cpu').numpy()
                token_ids = token_ids.to('cpu').numpy()
                masks = ngram_mask.to('cpu').numpy()

                with open(output_file, "a") as csv_file:

                    for i in range(len(label_id)):

                        # Map predicted embeddings back to words, then splice
                        # them into the flawed positions of the sentence.
                        correct_tokens = look_up_words(logits[i], masks[i],
                                                       vocab_list, p)
                        token_new = replace_token(token_ids[i], flaw_labels[i],
                                                  correct_tokens, i2w)
                        token_new = ' '.join(token_new)
                        label = str(label_id[i])
                        writer = csv.writer(csv_file, delimiter='\t')
                        writer.writerow([token_new, label])
# Example #5 (score: 0)
def main():
    """CLI entry point: fine-tune and/or run prediction with a 3-way BERT
    sequence classifier.

    Flags drive the behaviour:
      --do_train    fine-tune on --train_file, evaluating on --eval_file every
                    1000 steps and checkpointing when eval loss improves;
      --do_predict  write a prediction CSV (--predict_result_file), reading
                    examples from --eval_file (with --do_eval) or
                    --predict_file otherwise.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--train_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The train file path")
    parser.add_argument("--eval_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The dev file path")
    parser.add_argument("--predict_file",
                        default=None,
                        type=str,
                        required=False,
                        help="The predict file path")
    parser.add_argument("--predict_result_file",
                        default='datas/result.csv',
                        type=str,
                        required=False,
                        help="The predict result file path")
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help=
        "The config json file corresponding to the pre-trained BERT model. \n"
        "This specifies the model architecture.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )
    parser.add_argument(
        "--init_checkpoint",
        default=None,
        type=str,
        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument(
        "--do_lower_case",
        default=False,
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument(
        "--max_seq_length",
        default=250,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--load_checkpoint",
                        default=False,
                        action='store_true',
                        help="Whether to run load checkpoint.")
    parser.add_argument("--num_labels",
                        default=1,
                        type=int,
                        help="mapping classify nums")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--epoches",
                        default=6,
                        type=int,
                        help="Total epoch numbers for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=6.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )

    args = parser.parse_args()

    # Mirror the validation done by the sibling entry point: a zero/negative
    # accumulation value would silently zero the batch size below.
    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    vocab_path = os.path.join(args.bert_model, VOCAB_NAME)
    data_processor = DataProcessor()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # The per-step micro-batch; gradients are accumulated back to the
    # requested effective batch size.
    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_path,
                                           do_lower_case=args.do_lower_case)
    # num_labels is fixed to 3 here (the task is 3-way), intentionally
    # ignoring --num_labels; the flag is kept for CLI compatibility.
    model = BertForSequenceClassification.from_pretrained(args.bert_model,
                                                          num_labels=3)
    # NOTE: the previous debug loop printed `v.grad` for state_dict() values,
    # but state_dict tensors never carry gradients, so it was removed.
    model.to(device)

    def evaluating(model, eval_dataloader):
        """Run a full pass over `eval_dataloader`.

        Returns (mean loss per batch, predicted class ids, gold label ids).
        """
        model.eval()
        eval_loss = 0.0
        num_steps = 0
        logits, labels = [], []
        for step, batch in enumerate(eval_dataloader):
            input_ids, input_mask, segment_ids, label_ids = [
                b.to(device) for b in batch
            ]
            with torch.no_grad():
                loss, logit = model(input_ids, segment_ids, input_mask,
                                    label_ids)
                loss = loss.mean()
            # Undo the accumulation scaling so the reported loss is per batch.
            eval_loss += loss.item() * args.gradient_accumulation_steps
            num_steps += 1
            logit = torch.argmax(logit, dim=-1)
            logits.extend(logit.tolist())
            labels.extend(label_ids.tolist())
        # max(...) guards the single-/zero-batch case; the old code divided by
        # the last step index, which was off by one and crashed on one batch.
        return (eval_loss / max(num_steps, 1), logits, labels)

    def predicting(model, dataloader):
        """Predict class ids for every batch.

        Returns (predicted class ids, example ids) — for prediction data the
        label slot of each feature carries the example id.
        """
        model.eval()
        logits, example_ids = [], []
        for step, batch in enumerate(dataloader):
            if step % 100 == 0:
                print(f'当前预测进度: {step}/{len(dataloader)}')
            input_ids, input_mask, segment_ids, label_ids = [
                b.to(device) for b in batch
            ]
            with torch.no_grad():
                logit = model(input_ids, segment_ids, input_mask)
            logit = torch.argmax(logit, dim=-1)
            logits.extend(logit.tolist())
            example_ids.extend(label_ids.tolist())
        return logits, example_ids

    def eval_meric(model, data_loader):
        """Evaluate, log accuracy + average loss, and return the loss."""
        eval_loss, all_logits, all_labels = evaluating(model, data_loader)
        accuracy(all_labels, all_logits)
        logger.info(f'Average eval loss = {eval_loss}')
        return eval_loss

    def write_predict_file(model, data_loader, file_path):
        """Write predictions to a CSV, one row per example.

        Columns: id, pred, time, match (pred == gold), autors, like_counts,
        passage, pulled from data_processor.eval_dict keyed by example id.
        """
        logits, ids = predicting(model, data_loader)
        assert len(ids) == len(logits)
        logger.info(
            f'zero nums {logits.count(0)}, one nums {logits.count(1)}, two nums {logits.count(2)}'
        )
        # eval_dict[id] layout (see DataProcessor): (passage, label, author,
        # like_count, time) — presumably; confirm against DataProcessor.
        labels = [
            data_processor.eval_dict[id][1] for id, logit in zip(ids, logits)
        ]
        assert len(labels) == len(logits)
        passages = [
            data_processor.eval_dict[id][0] for id, logit in zip(ids, logits)
        ]
        autors = [
            data_processor.eval_dict[id][2] for id, logit in zip(ids, logits)
        ]
        like_counts = [
            data_processor.eval_dict[id][3] for id, logit in zip(ids, logits)
        ]
        times = [
            data_processor.eval_dict[id][4] for id, logit in zip(ids, logits)
        ]

        assert len(labels) == len(passages)
        match_array = np.array(logits) == np.array(labels)
        match_list = match_array.tolist()
        data_df = pd.DataFrame({
            'id': ids,
            'pred': logits,
            'time': times,
            # Previously hard-coded to '' even though match_list was computed.
            'match': match_list,
            'autors': autors,
            'like_counts': like_counts,
            'passage': passages
        })
        data_df.to_csv(file_path, index=None)

    eval_examples = data_processor.get_examples(args.eval_file,
                                                data_type='eval')

    eval_features = convert_examples_to_features(args, eval_examples,
                                                 args.max_seq_length,
                                                 tokenizer)
    eval_loader = ParaDataloader(eval_features)
    eval_loader = DataLoader(eval_loader,
                             shuffle=False,
                             batch_size=args.eval_batch_size)

    # The training branch was dead-coded behind `if 0:` even though the
    # --do_train flag was parsed; honour the flag.
    if args.do_train:
        # Load and featurize the training data.
        train_examples = data_processor.get_examples(args.train_file,
                                                     data_type='train')
        train_features = convert_examples_to_features(args, train_examples,
                                                      args.max_seq_length,
                                                      tokenizer)

        num_train_steps = int(
            len(train_features) // args.train_batch_size //
            args.gradient_accumulation_steps * args.num_train_epochs)

        train_loader = ParaDataloader(train_features)
        train_loader = DataLoader(train_loader,
                                  shuffle=True,
                                  batch_size=args.train_batch_size)

        model.zero_grad()
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        param_optimizer = list(model.named_parameters())
        # Substring match against no_decay: the old exact-membership test
        # (`n not in no_decay`) never matched full parameter names such as
        # 'bert.encoder...LayerNorm.weight', so everything was decayed.
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate': 0.01
        }, {
            'params': [
                p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate': 0.0
        }]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps)
        tr_loss = None
        # Track the best eval loss across ALL epochs; it used to be reset each
        # epoch and never updated, so every eval checkpointed unconditionally.
        min_eval_loss = float('inf')
        for epoch in range(args.epoches):
            model.train()
            for step, batch in enumerate(train_loader):
                input_ids, input_mask, segment_ids, label_ids = [
                    b.to(device) for b in batch
                ]

                loss, _ = model(input_ids, segment_ids, input_mask, label_ids)
                loss = loss.mean()
                print(f'loss = {loss}')
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss = loss * args.gradient_accumulation_steps if step == 0 else tr_loss + loss * args.gradient_accumulation_steps
                # Step only at accumulation boundaries; the old code stepped
                # every batch, defeating gradient accumulation.
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                if step % 1000 == 1:
                    eval_loss = eval_meric(model, eval_loader)
                    model.train()  # evaluating() left the model in eval mode
                    if eval_loss < min_eval_loss:
                        min_eval_loss = eval_loss
                        save_checkpoint(model, epoch, args.output_dir)

    if args.do_predict:
        if args.load_checkpoint:
            state_dict = torch.load('output/pytorch_model-0004.bin')
            model.load_state_dict(state_dict)
        logger.info(f'Start to predict......')
        if args.do_eval:
            predict_examples = data_processor.get_eval_examples(args.eval_file)
        else:
            predict_examples = data_processor.get_predict_examples(
                args.predict_file)

        predict_features = convert_examples_to_features(
            args, predict_examples, args.max_seq_length, tokenizer)
        predict_loader = ParaDataloader(predict_features)
        predict_loader = DataLoader(predict_loader,
                                    shuffle=False,
                                    batch_size=args.eval_batch_size)
        write_predict_file(model, predict_loader, args.predict_result_file)
# Example #6 (score: 0)
def train(train_batch_size,
          roberta_model,
          hidden_size=768,
          learning_rate=3e-5,
          warmup_proportion=0.1,
          seed=23,
          num_epochs=4):
    """Fine-tune a RoBERTa span-extraction model on pre-pickled features.

    Loads features from 'preprocess/trainFeatures.pkl', trains for
    `num_epochs` epochs with BertAdam, and checkpoints roughly three times
    per epoch under 'model/<epoch>_<step>/'.

    Args:
        train_batch_size: per-step batch size.
        roberta_model: pre-trained model name/path handed to `basemodel`.
        hidden_size: hidden dimension of the classification head.
        learning_rate: initial Adam learning rate.
        warmup_proportion: linear LR warmup fraction of total steps.
        seed: RNG seed for python/numpy/torch.
        num_epochs: number of training epochs (was hard-coded to 4).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
    print(device)
    n_gpu = torch.cuda.device_count()
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    print('loading train features.')
    with open('preprocess/trainFeatures.pkl', 'rb') as f:
        trainFeatures = pickle.load(f)
    print('train features have been loaded.')
    nums = len(trainFeatures)
    print('训练集大小:', nums)

    # One optimizer step per batch, over all epochs (needed for LR schedule).
    num_train_optimization_steps = num_epochs * int(
        len(trainFeatures) / train_batch_size)

    model = basemodel(roberta_model, hidden_size)
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Exclude the unused pooler, and apply weight decay to everything except
    # biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=num_train_optimization_steps)

    all_input_ids = torch.tensor([f.input_ids for f in trainFeatures],
                                 dtype=torch.long)
    all_input_masks = torch.tensor([f.input_mask for f in trainFeatures],
                                   dtype=torch.long)
    all_start_positions = torch.tensor(
        [f.start_position for f in trainFeatures], dtype=torch.long)
    all_end_positions = torch.tensor([f.end_position for f in trainFeatures],
                                     dtype=torch.long)
    all_answer_choices = torch.tensor([f.ans_choice for f in trainFeatures],
                                      dtype=torch.long)
    train_data = TensorDataset(all_input_ids, all_input_masks,
                               all_start_positions, all_end_positions,
                               all_answer_choices)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=train_batch_size)

    # Checkpoint ~3 times per epoch; max(1, ...) guards against a
    # ZeroDivisionError when the dataset is smaller than 3 batches.
    save_every = max(1, int(nums / train_batch_size) // 3)

    model.train()
    print('training')
    for epoch in range(num_epochs):
        for step, batch in enumerate(
                tqdm(train_dataloader, desc='Iteration', disable=False)):
            if n_gpu == 1:
                # With DataParallel (n_gpu > 1) inputs are scattered for us;
                # on CPU (n_gpu == 0) the tensors are already on `device`.
                batch = tuple(t.to(device) for t in batch)
            input_ids, input_masks, start_positions, end_positions, answer_choices = batch
            loss = model(input_ids, input_masks, start_positions,
                         end_positions, answer_choices)
            if n_gpu > 1:
                loss = loss.mean()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            if step % save_every == 0:
                ckpt_dir = os.path.join('model',
                                        str(epoch) + '_' + str(step))
                # exist_ok avoids crashing on re-runs (os.mkdir raised
                # FileExistsError if the directory was already there).
                os.makedirs(ckpt_dir, exist_ok=True)
                # Unwrap DataParallel so we save the bare model weights.
                model_to_save = model.module if hasattr(
                    model, 'module') else model
                output_model_file = os.path.join(ckpt_dir, 'pytorch_model.bin')
                output_config_file = os.path.join(ckpt_dir, 'config.json')
                output_m_file = os.path.join(ckpt_dir, 'model.pt')
                torch.save(model_to_save.state_dict(), output_m_file)
                torch.save(model_to_save.roberta.state_dict(),
                           output_model_file)
                model_to_save.roberta.config.to_json_file(output_config_file)
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument("--word_embedding_file",
                        default='./emb/wiki-news-300d-1M.vec',
                        type=str,
                        help="The input directory of word embeddings.")
    parser.add_argument("--index_path",
                        default='./emb/p_index.bin',
                        type=str,
                        help="The input directory of word embedding index.")
    parser.add_argument("--word_embedding_info",
                        default='./emb/vocab_info.txt',
                        type=str,
                        help="The input directory of word embedding info.")
    parser.add_argument("--data_file",
                        default='',
                        type=str,
                        help="The input directory of input data file.")

    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--max_ngram_length",
                        default=16,
                        type=int,
                        help="The maximum total ngram sequence")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--embedding_size",
                        default=300,
                        type=int,
                        help="Total batch size for embeddings.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--num_eval_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of eval epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--single',
                        action='store_true',
                        help="Whether only evaluate a single epoch")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()
    # Comment the if else block for no CUDA
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    #device = torch.device("cpu") # uncomment this for no GPU
    logger.info(
        "device: {} , distributed training: {}, 16-bits training: {}".format(
            device, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:  # Comment this to No GPU
        torch.cuda.manual_seed_all(args.seed)  # Comment this for No GPU

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    w2i, i2w, vocab_size = {}, {}, 1
    if args.do_train:

        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

        train_features, w2i, i2w, vocab_size = convert_examples_to_features_disc_train(
            train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Num token vocab = %d", vocab_size)
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_tokens = torch.tensor([f.token_ids for f in train_features],
                                  dtype=torch.long)
        all_label_id = torch.tensor([f.label_id for f in train_features],
                                    dtype=torch.long)

    # load embeddings sa
    if args.do_train:
        logger.info("Loading word embeddings ... ")
        emb_dict, emb_vec, vocab_list, emb_vocab_size = load_vectors(
            args.word_embedding_file)
        if not os.path.exists(args.index_path):

            write_vocab_info(args.word_embedding_info, emb_vocab_size,
                             vocab_list)
            p = load_embeddings_and_save_index(range(emb_vocab_size), emb_vec,
                                               args.index_path)
        else:
            #emb_vocab_size, vocab_list = load_vocab_info(args.word_embedding_info)
            p = load_embedding_index(args.index_path,
                                     emb_vocab_size,
                                     num_dim=args.embedding_size)
        #emb_dict, emb_vec, vocab_list, emb_vocab_size, p = None, None, None, None, None

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(
            args.local_rank))
    model = BertForDiscriminator.from_pretrained(args.bert_model,
                                                 cache_dir=cache_dir,
                                                 num_labels=num_labels)
    model.to(device)

    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:  # Comment this for NO GPU
        model = torch.nn.DataParallel(model)  # Comment this for NO GPU

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 1
    tr_loss = 0
    if args.do_train:

        train_data = TensorDataset(all_tokens, all_label_id)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for ind in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            flaw_eval_f1 = []
            flaw_eval_recall = []
            flaw_eval_precision = []
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                tokens, _ = batch  #, label_id, ngram_ids, ngram_labels, ngram_masks

                # module1: learn a discriminator
                tokens = tokens.to('cpu').numpy()
                #print("PRINTING TOKENS!!!!!!!!! ", len(tokens[0]))
                train_features = convert_examples_to_features_flaw(
                    tokens, args.max_seq_length, args.max_ngram_length,
                    tokenizer, i2w, emb_dict, p, vocab_list)

                flaw_mask = torch.tensor([f.flaw_mask for f in train_features],
                                         dtype=torch.long).to(
                                             device)  # [1, 1, 1, 1, 0,0,0,0]
                flaw_ids = torch.tensor([f.flaw_ids for f in train_features],
                                        dtype=torch.long).to(
                                            device)  # [12,25,37,54,0,0,0,0]
                flaw_labels = torch.tensor(
                    [f.flaw_labels for f in train_features],
                    dtype=torch.long).to(device)  # [0, 1, 1, 1, 0,0,0,0]

                loss, logits = model(flaw_ids, flaw_mask, flaw_labels)
                logits = logits.detach().cpu().numpy()

                if n_gpu > 1:  # Comment this for NO GPU
                    loss = loss.mean()  # Comment this for NO GPU

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()

                tr_loss += loss.item()

                nb_tr_examples += flaw_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:

                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                # eval during training
                flaw_labels = flaw_labels.to('cpu').numpy()

                flaw_tmp_eval_f1, flaw_tmp_eval_recall, flaw_tmp_eval_precision = f1_3d(
                    logits, flaw_labels)
                flaw_eval_f1.append(flaw_tmp_eval_f1)
                flaw_eval_recall.append(flaw_tmp_eval_recall)
                flaw_eval_precision.append(flaw_tmp_eval_precision)

                nb_eval_examples += flaw_ids.size(0)
                nb_eval_steps += 1

            flaw_f1 = sum(flaw_eval_f1) / len(flaw_eval_f1)
            flaw_recall = sum(flaw_eval_recall) / len(flaw_eval_recall)
            flaw_precision = sum(flaw_eval_precision) / len(
                flaw_eval_precision)
            loss = tr_loss / nb_tr_steps if args.do_train else None
            result = {
                'flaw_f1': flaw_f1,
                "flaw_recall": flaw_recall,
                "flaw_precision": flaw_precision,
                'loss': loss,
            }

            output_eval_file = os.path.join(args.output_dir,
                                            "train_results.txt")
            with open(output_eval_file, "a") as writer:
                #logger.info("***** Training results *****")
                writer.write("epoch" + str(ind) + '\n')
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
                writer.write('\n')

            model_to_save = model.module if hasattr(model, 'module') else model
            output_model_file = os.path.join(args.output_dir,
                                             "epoch" + str(ind) + WEIGHTS_NAME)
            torch.save(model_to_save.state_dict(), output_model_file)
            output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
            with open(output_config_file, 'w') as f:
                f.write(model_to_save.config.to_json_string())

        os.rename(
            output_model_file,
            os.path.join(args.output_dir, "disc_trained_" + WEIGHTS_NAME))
        current_path = os.path.join(args.output_dir,
                                    "disc_trained_" + WEIGHTS_NAME)
        new_path = os.path.join('./models', "disc_trained_" + WEIGHTS_NAME)
        new_path_config = os.path.join('./models' + CONFIG_NAME)
        shutil.move(current_path, new_path)
        shutil.move(output_config_file, new_path_config)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank()
                         == 0):  # for trouble-shooting

        eval_examples = processor.get_disc_dev_examples(args.data_file)
        eval_features, w2i, i2w, vocab_size = convert_examples_to_features_disc_eval(
            eval_examples, label_list, args.max_seq_length, tokenizer, w2i,
            i2w, vocab_size)

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Num token vocab = %d", vocab_size)
        logger.info("  Batch size = %d", args.eval_batch_size)

        all_token_ids = torch.tensor([f.token_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_flaw_labels = torch.tensor([f.flaw_labels for f in eval_features],
                                       dtype=torch.long)
        all_flaw_ids = torch.tensor([f.flaw_ids for f in eval_features],
                                    dtype=torch.long)
        all_label_id = torch.tensor([f.label_id for f in eval_features],
                                    dtype=torch.long)
        all_chunks = torch.tensor([f.chunks for f in eval_features],
                                  dtype=torch.long)
        #print("flaw ids in eval_features: ", all_flaw_ids)

        eval_data = TensorDataset(all_token_ids, all_input_ids, all_input_mask,
                                  all_flaw_ids, all_flaw_labels, all_label_id,
                                  all_chunks)

        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        # Load a trained model and config that you have fine-tuned
        if args.single:
            eval_range = trange(int(args.num_eval_epochs),
                                int(args.num_eval_epochs + 1),
                                desc="Epoch")
        else:
            eval_range = trange(int(args.num_eval_epochs), desc="Epoch")

        attack_type = 'rand'
        for epoch in eval_range:

            output_file = os.path.join(
                args.data_dir, "epoch" + str(epoch) + "disc_eval_outputs_" +
                attack_type + ".tsv")
            with open(output_file, "w") as csv_file:
                writer = csv.writer(csv_file, delimiter='\t')
                writer.writerow(["sentence", "label", "ids"])

            #output_model_file = os.path.join(args.output_dir, "epoch"+str(epoch)+WEIGHTS_NAME)
            output_model_file = os.path.join(args.output_dir,
                                             "disc_trained_" + WEIGHTS_NAME)
            output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
            #print("output_model_file: ", output_model_file)
            config = BertConfig(output_config_file)
            model = BertForDiscriminator(config, num_labels=num_labels)
            model.load_state_dict(torch.load(output_model_file))

            model.to(device)
            model.eval()
            predictions, truths = [], []
            eval_loss, nb_eval_steps, nb_eval_examples = 0, 0, 0
            eval_accuracy = 0

            for token_ids, input_ids, input_mask, flaw_ids, flaw_labels, label_id, chunks in tqdm(
                    eval_dataloader, desc="Evaluating"):

                token_ids = token_ids.to(device)
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                flaw_labels = flaw_labels.to(device)
                flaw_ids = flaw_ids.to(device)

                #print("flaw ids in eval_dataloader: ", flaw_ids)

                with torch.no_grad():
                    tmp_eval_loss, s = model(input_ids, input_mask,
                                             flaw_labels)

                    #                     print("tmp_eval_loss: ",tmp_eval_loss)
                    #                     print("s: ",s)

                    logits = model(input_ids, input_mask)

                    print("len of logits: ", len(logits))
                    print("shape of logits: ", logits.size())
                    print("type of logits: ", type(logits))
                    print("type of logits: ", logits)

                    flaw_logits = torch.argmax(logits, dim=2)

                    print("Type of flaw_logits: ", type(flaw_logits))
                    print("shape of flaw_logits: ", flaw_logits.size())
                    print("Length of flaw_logits: ", len(flaw_logits))
                    print("flaw_logits: ", flaw_logits)

                logits = logits.detach().cpu().numpy()
                flaw_logits = flaw_logits.detach().cpu().numpy()
                flaw_ids = flaw_ids.to('cpu').numpy()
                label_id = label_id.to('cpu').numpy()
                chunks = chunks.to('cpu').numpy()
                token_ids = token_ids.to('cpu').numpy()

                flaw_logits = logit_converter(
                    flaw_logits, chunks)  # each word only has one '1'

                print("Type of flaw_logits logit_converter: ",
                      type(flaw_logits))
                #print("shape of flaw_logits logit_converter : ",flaw_logits.size())
                print("Length of flaw_logits logit_converter : ",
                      len(flaw_logits))
                print("flaw_logits logit_converter : ", flaw_logits)

                true_logits = []

                #print("length of flaw_ids: ",len(flaw_ids))

                for i in range(len(flaw_ids)):
                    tmp = [0] * len(flaw_logits[i])

                    #print("tmp: ",tmp) # ne line
                    #print("printing i:",i)
                    #print("len of tmp: ",len(tmp))
                    #print("length of flaw_ids of i : ",len(flaw_ids[i]))
                    #print("flaw_ids[i]: ",flaw_ids[i])

                    for j in range(len(flaw_ids[0])):
                        #print("flaw_ids[i][j] : ",flaw_ids[i][j])
                        #print("tmp value: ", tmp)
                        #print("tmp len: ", len(tmp))
                        if flaw_ids[i][j] == 0: break
                        if flaw_ids[i][j] >= len(tmp): continue
                        tmp[flaw_ids[i][j]] = 1

                    true_logits.append(tmp)
                    #print('true_logits: ', true_logits)

                tmp_eval_accuracy = accuracy_2d(flaw_logits, true_logits)
                eval_accuracy += tmp_eval_accuracy

                predictions += true_logits  # Original
                truths += flaw_logits  # Original
                #predictions += flaw_logits # for trouble-shooting
                #truths += true_logits # for trouble-shooting
                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1

                with open(output_file, "a") as csv_file:
                    for i in range(len(label_id)):
                        #print("i in write output file:",i)
                        token = ' '.join(
                            [i2w[x] for x in token_ids[i] if x != 0])
                        flaw_logit = flaw_logits[i]
                        #print("flaw_logit in write output file: ",flaw_logit)
                        label = str(label_id[i])
                        logit = ','.join([
                            str(i) for i, x in enumerate(flaw_logit) if x == 1
                        ])  # for trouble-shooting
                        logit = '-1' if logit == '' else logit  # for trouble-shooting
                        writer = csv.writer(csv_file, delimiter='\t')
                        writer.writerow([token, label, logit])

                # Renaming and moving the file for Embedding Estimator

            eval_loss = eval_loss / nb_eval_steps
            eval_accuracy = eval_accuracy / nb_eval_steps
            eval_f1_score, eval_recall_score, eval_precision_score = f1_2d(
                truths, predictions)
            loss = tr_loss / nb_tr_steps if args.do_train else None
            result = {
                'eval_loss': eval_loss,
                'eval_f1': eval_f1_score,
                'eval_recall': eval_recall_score,
                'eval_precision': eval_precision_score,
                'eval_acc': eval_accuracy
            }

            output_eval_file = os.path.join(
                args.output_dir,
                "disc_eval_results_" + attack_type + "_attacks.txt")
            with open(output_eval_file, "a") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

            #attack_type='drop'
            new_path = os.path.join(
                args.data_dir, "disc_eval_outputs_" + attack_type + ".tsv")
            current_path = os.path.join(
                args.data_dir, "epoch" + str(epoch) + "disc_eval_outputs_" +
                attack_type + ".tsv")
            os.rename(current_path, new_path)
Пример #8
0
class MIL:
    """Multiple-instance-learning (MIL) driver for action classification.

    Wires together the raw Flickr dataset, an ``ActionClassifier`` model and
    the ``Mixture_loss`` criterion from ``args``, and exposes
    ``train`` / ``evaluate`` / ``evaluate2`` / ``save_model`` / ``load_model``.
    """

    def __init__(self, args):
        self.args = args
        self.raw_data = Raw_dataset(args.flickerfile, args.label_list)
        self.transformer = get_transformer(network=args.network)
        self.evaluator = evaluator(self.raw_data.label_list)
        # Shuffle only when training; keep evaluation order deterministic.
        shuffle = args.mode != 'test'

        self.model = ActionClassifier(self.raw_data.get_num_of_labels(),
                                      head=args.head,
                                      pos_dim=4,
                                      hidden_size=512,
                                      is_tsv_feat=False,
                                      in_channels=args.num_boxes)
        self.epoch = args.epochs
        self.save_epoch = args.save_epoch
        self.lr = args.learning_rate

        if args.mode == 'train' or args.mode == 'small_data':
            self.data_set = MIL_dataset(self.raw_data,
                                        self.transformer,
                                        img_path=args.img_root_dir,
                                        tsv_path=args.tsv_path,
                                        mode=args.mode,
                                        use_tsv=False,
                                        num_boxes=args.num_boxes)
            self.data_loader = DataLoader(self.data_set,
                                          batch_size=args.batch_size,
                                          shuffle=shuffle,
                                          num_workers=args.num_workers)

        if args.mode == 'dev':
            self.eval_set = MIL_dataset(self.raw_data,
                                        self.transformer,
                                        img_path=args.img_root_dir,
                                        tsv_path=args.tsv_path,
                                        mode=args.mode,
                                        use_tsv=False,
                                        num_boxes=args.num_boxes)
            self.eval_loader = DataLoader(self.eval_set,
                                          batch_size=args.batch_size,
                                          shuffle=shuffle,
                                          num_workers=args.num_workers)

        # Resume from a checkpoint if one was requested; ``st_epoch`` offsets
        # the epoch counter used in log messages and checkpoint names.
        self.st_epoch = 0
        if args.load is not None:
            fname = os.path.join(args.model_dir, args.load)
            self.st_epoch = self.load_model(fname)

        self.model.cuda()
        if args.multiGPU:
            self.model = nn.DataParallel(self.model)

        self.criterion = Mixture_loss(args.head)

        if args.mode == 'train':
            batch_per_epoch = len(self.data_loader)
            print('batch per epoch ={}'.format(batch_per_epoch))
            t_total = int(batch_per_epoch * args.epochs)
            print('total iterations== {} ; warmup start = {}'.format(
                t_total, t_total * args.wstep))
            # BertAdam applies the warmup schedule internally.
            self.optimizer = BertAdam(list(self.model.parameters()),
                                      lr=args.learning_rate,
                                      warmup=args.wstep,
                                      t_total=t_total)

    def train(self):
        """Run the training loop, checkpointing every ``save_epoch`` epochs
        and always after the final epoch."""
        print('training started')
        self.model.train()
        for epoch in range(self.epoch):
            tr_loss = 0
            nb_tr_steps = 0
            em_loss_t = 0
            cls_loss_t = 0
            for imgs, subimgs, boxes, interaction_pattern, label_hot_vec in tqdm(
                    self.data_loader):
                self.optimizer.zero_grad()
                imgs = imgs.cuda()
                subimgs = subimgs.cuda()
                boxes = boxes.cuda()
                interaction_pattern = interaction_pattern.cuda()
                label_hot_vec = label_hot_vec.cuda()
                g_x, p_yi_x = self.model(imgs, subimgs, boxes,
                                         interaction_pattern, label_hot_vec)

                loss, em_loss, class_loss = self.criterion(
                    g_x, p_yi_x, label_hot_vec)

                if self.args.multiGPU:
                    # DataParallel returns one loss per GPU; average them.
                    loss = loss.mean()

                tr_loss += loss.item()
                em_loss_t += em_loss
                cls_loss_t += class_loss

                nb_tr_steps += 1
                loss.backward()
                # Clip gradient norm to 1.0 to stabilise training.
                nn.utils.clip_grad_norm_(self.model.parameters(), 1.)
                self.optimizer.step()

            print(
                "Train loss@epoch {}: total:{} emloss:{}, cls_loss:{}".format(
                    self.st_epoch + epoch + 1, tr_loss / nb_tr_steps,
                    em_loss_t / nb_tr_steps, cls_loss_t / nb_tr_steps))
            if epoch == self.epoch - 1 or (epoch + 1) % self.save_epoch == 0:
                filename = 'pascal_voc' + str(self.st_epoch + epoch +
                                              1) + '.model'
                # Bug fix: the original referenced a bare `args`, which is
                # not in scope inside this method; use self.args instead.
                filename = os.path.join(self.args.model_dir, filename)
                self.save_model(filename, self.st_epoch + epoch + 1)

    def evaluate(self, thresold=0.5):
        """Evaluate on ``eval_loader`` by sampling label presence.

        NOTE(review): ``torch.bernoulli`` makes the predictions stochastic,
        so repeated runs can give different scores — confirm this is intended.
        """
        print('evaluation started')
        self.model.eval()
        res = []
        gold = []
        for imgs, subimgs, boxes, interaction_pattern, label_hot_vec in tqdm(
                self.eval_loader):
            imgs = imgs.cuda()
            subimgs = subimgs.cuda()
            boxes = boxes.cuda()
            interaction_pattern = interaction_pattern.cuda()
            g_x, p_yi_x = self.model(imgs, subimgs, boxes, interaction_pattern)
            g_x = g_x.unsqueeze(dim=-1)
            class_prob = torch.bernoulli(p_yi_x) * g_x
            # Shift by the threshold so positives end up strictly > 0.
            class_prob = torch.sum(class_prob, dim=1) - thresold
            class_prob = class_prob.cpu()
            target = [
                torch.nonzero(t).squeeze(-1).numpy() for t in label_hot_vec
            ]
            pred = [(t > 0).nonzero().squeeze(-1).numpy() for t in class_prob]

            gold.extend(target)
            res.extend(pred)

        self.evaluator.evaluate(res, gold, self.args.dump)

    def evaluate2(self, thresold=0.5):
        """Alternative evaluation: a label is predicted only when every
        distribution head assigns it probability >= ``thresold``."""
        print('evaluation started')
        self.model.eval()
        res = []
        gold = []
        for imgs, subimgs, boxes, interaction_pattern, label_hot_vec in tqdm(
                self.eval_loader):
            imgs = imgs.cuda()
            subimgs = subimgs.cuda()
            boxes = boxes.cuda()
            interaction_pattern = interaction_pattern.cuda()
            g_x, p_yi_x = self.model(imgs, subimgs, boxes, interaction_pattern)
            g_x = g_x.unsqueeze(dim=-1)
            target = [
                torch.nonzero(t).squeeze(-1).numpy() for t in label_hot_vec
            ]
            # Threshold each head, then require unanimity across dim 1.
            pred = (p_yi_x >= thresold).float() * 1
            pred = torch.prod(pred, dim=1)

            pred1 = [(t > 0).nonzero().squeeze(-1).numpy() for t in pred]
            gold.extend(target)
            res.extend(pred1)

        self.evaluator.evaluate(res, gold, self.args.dump)

    def save_model(self, name, epoch):
        """Serialise model weights plus training metadata to ``name``.

        The optimizer state is intentionally not persisted (stored as None).
        """
        check_point = {
            'model': self.model.state_dict(),
            'epoch': epoch,
            'lr': self.lr,
            'optimizer': None,
        }
        torch.save(check_point, name)
        print('model saved at {}'.format(name))

    def load_model(self, path):
        """Load a checkpoint produced by ``save_model``; return its epoch."""
        print("Load model from %s" % path)
        check_point = torch.load(path)
        self.model.load_state_dict(check_point['model'], strict=False)
        # Bug fix: the original did `self.optimizer = optimizer`, a NameError.
        # Restore the optimizer state only when the checkpoint carries one
        # and this instance already has an optimizer to load it into.
        opt_state = check_point.get('optimizer')
        if opt_state is not None and hasattr(self, 'optimizer'):
            self.optimizer.load_state_dict(opt_state)

        return check_point['epoch']
Пример #9
0
def train(args):
    """Pre-train ABSABert on pre-generated unlabeled shards.

    One shard of data per epoch is read from ``args.data_dir`` via
    ``PregeneratedDataset``; a checkpoint is written to
    ``args.output_dir/epoch<e>/model.pt`` after every epoch.
    """
    processor = data_utils.ABSAProcessor()
    label_list = processor.get_labels('absa')
    model = ABSABert.from_pretrained(
        modelconfig.MODEL_ARCHIVE_MAP[args.bert_model],
        num_labels=len(label_list))
    tokenizer = ABSATokenizer.from_pretrained(
        modelconfig.MODEL_ARCHIVE_MAP[args.bert_model])
    model.cuda()

    epoch_dataset = PregeneratedDataset(epoch=0,
                                        training_path=args.data_dir,
                                        tokenizer=tokenizer,
                                        num_data_epochs=1)
    unlabel_train_sampler = RandomSampler(epoch_dataset)
    unlabel_train_dataloader = DataLoader(epoch_dataset,
                                          sampler=unlabel_train_sampler,
                                          batch_size=args.train_batch_size)
    unlabel_iter = iter(unlabel_train_dataloader)
    train_steps = len(unlabel_train_dataloader)

    # Exclude the pooler and apply no weight decay to biases / LayerNorm,
    # following the standard BERT fine-tuning recipe.
    param_optimizer = [(k, v) for k, v in model.named_parameters()
                       if v.requires_grad]
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    t_total = train_steps * args.num_train_epochs
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)

    global_step = 0
    model.train()
    total_loss = 0
    for e_ in range(args.num_train_epochs):
        if e_ > 0:
            # Every epoch after the first loads its own pre-generated shard.
            epoch_dataset = PregeneratedDataset(epoch=e_,
                                                training_path=args.data_dir,
                                                tokenizer=tokenizer,
                                                num_data_epochs=1)
            unlabel_train_sampler = RandomSampler(epoch_dataset)
            unlabel_train_dataloader = DataLoader(
                epoch_dataset,
                sampler=unlabel_train_sampler,
                batch_size=args.train_batch_size)
            unlabel_iter = iter(unlabel_train_dataloader)
            train_steps = len(unlabel_train_dataloader)
            logger.info('unlabel data number:{} steps:{}'.format(
                len(epoch_dataset), train_steps))

        for step in range(train_steps):
            # Bug fix: iterator.next() is Python 2 only; use next() builtin.
            batch = next(unlabel_iter)
            batch = tuple(t.cuda() for t in batch)
            input_ids, tag_ids, head_tokens_index, rel_label_ids, input_mask, lm_label_ids, tag_label_ids, _ = batch
            loss = model(input_ids,
                         input_tags=tag_ids,
                         head_tokens_index=head_tokens_index,
                         dep_relation_label=rel_label_ids,
                         masked_tag_labels=tag_label_ids,
                         attention_mask=input_mask)

            loss.backward()
            total_loss += loss.item()
            optimizer.step()
            optimizer.zero_grad()

            global_step += 1
            if global_step % 100 == 0:
                # Report the running mean loss over all steps so far.
                logger.info('in step {} loss is: {}'.format(
                    global_step, total_loss / global_step))
        # Checkpoint at the end of each epoch.
        os.makedirs(args.output_dir + '/epoch' + str(e_), exist_ok=True)
        torch.save(
            model.state_dict(),
            os.path.join(args.output_dir + '/epoch' + str(e_), "model.pt"))
Пример #10
0
def train(args):
    """Fine-tune BertForTokenClassification on ABSA sequence labeling.

    Trains for ``args.num_train_epochs``; when ``args.do_valid`` is set,
    validates after each epoch, keeps the best model at
    ``args.output_dir/model.pt`` and dumps the loss curve to ``valid.json``.
    Otherwise the final model is saved once at the end.
    """
    processor = data_utils.ABSAProcessor()
    label_list = processor.get_labels(args.task_type)
    model = BertForTokenClassification.from_pretrained(
        modelconfig.MODEL_ARCHIVE_MAP[args.bert_model],
        num_labels=len(label_list))

    tokenizer = ABSATokenizer.from_pretrained(
        modelconfig.MODEL_ARCHIVE_MAP[args.bert_model])
    train_examples = processor.get_train_examples(args.data_dir,
                                                  args.task_type)
    num_train_steps = int(
        math.ceil(len(train_examples) /
                  args.train_batch_size)) * args.num_train_epochs

    train_features = data_utils.convert_examples_to_features(
        train_examples, label_list, args.max_seq_length, tokenizer)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in train_features],
                                 dtype=torch.long)

    train_data = TensorDataset(all_input_ids, all_segment_ids, all_input_mask,
                               all_label_ids)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.do_valid:
        valid_examples = processor.get_dev_examples(args.data_dir,
                                                    args.task_type)
        valid_features = data_utils.convert_examples_to_features(
            valid_examples, label_list, args.max_seq_length, tokenizer)
        valid_all_input_ids = torch.tensor(
            [f.input_ids for f in valid_features], dtype=torch.long)
        valid_all_segment_ids = torch.tensor(
            [f.segment_ids for f in valid_features], dtype=torch.long)
        valid_all_input_mask = torch.tensor(
            [f.input_mask for f in valid_features], dtype=torch.long)
        valid_all_label_ids = torch.tensor(
            [f.label_id for f in valid_features], dtype=torch.long)
        valid_data = TensorDataset(valid_all_input_ids, valid_all_segment_ids,
                                   valid_all_input_mask, valid_all_label_ids)

        logger.info("***** Running validations *****")
        logger.info("  Num orig examples = %d", len(valid_examples))
        logger.info("  Num split examples = %d", len(valid_features))
        logger.info("  Batch size = %d", args.train_batch_size)

        valid_sampler = SequentialSampler(valid_data)
        valid_dataloader = DataLoader(valid_data,
                                      sampler=valid_sampler,
                                      batch_size=args.train_batch_size)

        best_valid_loss = float('inf')
        valid_losses = []

    model.cuda()

    # Exclude the pooler and apply no weight decay to biases / LayerNorm,
    # following the standard BERT fine-tuning recipe.
    param_optimizer = [(k, v) for k, v in model.named_parameters()
                       if v.requires_grad]
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    t_total = num_train_steps
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)

    global_step = 0
    model.train()

    for e_ in range(args.num_train_epochs):
        # Bug fix: the original used `train_iter.next()`, which is Python 2
        # only; iterating the dataloader directly is the Python 3 idiom.
        for batch in train_dataloader:
            batch = tuple(t.cuda() for t in batch)
            input_ids, segment_ids, input_mask, label_ids = batch
            loss = model(input_ids,
                         token_type_ids=segment_ids,
                         attention_mask=input_mask,
                         labels=label_ids)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

        if args.do_valid:
            # Per-epoch validation: compute the example-weighted mean loss.
            model.eval()
            with torch.no_grad():
                losses = []
                valid_size = 0
                for step, batch in enumerate(valid_dataloader):
                    batch = tuple(t.cuda() for t in batch)
                    input_ids, segment_ids, input_mask, label_ids = batch
                    loss = model(input_ids,
                                 token_type_ids=segment_ids,
                                 attention_mask=input_mask,
                                 labels=label_ids)
                    losses.append(loss.data.item() * input_ids.size(0))
                    valid_size += input_ids.size(0)
                valid_loss = sum(losses) / valid_size
                logger.info("validation loss: %f", valid_loss)
                valid_losses.append(valid_loss)

            # Keep only the best-so-far model on disk.
            if valid_loss < best_valid_loss:
                torch.save(model, os.path.join(args.output_dir, "model.pt"))
                best_valid_loss = valid_loss
            model.train()

    if args.do_valid:
        with open(os.path.join(args.output_dir, "valid.json"), "w") as fw:
            json.dump({"valid_losses": valid_losses}, fw)
    else:
        torch.save(model, os.path.join(args.output_dir, "model.pt"))
Пример #11
0
def train(args):
    """Fine-tune BERT for aspect sentiment classification (ASC).

    Loads train (and, when ``args.do_valid`` is set, dev) examples through
    ``data_utils.AscProcessor``, trains for ``args.num_train_epochs`` with
    ``BertAdam``, validates at the end of every epoch, and writes the model
    checkpoint plus a ``valid.json`` loss history under ``args.output_dir``.
    """
    processor = data_utils.AscProcessor()
    label_list = processor.get_labels()
    tokenizer = ABSATokenizer.from_pretrained(
        modelconfig.MODEL_ARCHIVE_MAP[args.bert_model])
    train_examples = processor.get_train_examples(args.data_dir)
    # Ceiling division: the DataLoader emits a partial last batch, so a
    # truncating division undercounts steps and lets global_step / t_total
    # exceed 1.0 in the manual warmup_linear schedule below.
    num_train_steps = ((len(train_examples) + args.train_batch_size - 1) //
                       args.train_batch_size) * args.num_train_epochs

    train_features = data_utils.convert_examples_to_features(
        train_examples, label_list, args.max_seq_length, tokenizer, "asc")
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in train_features],
                                 dtype=torch.long)

    train_data = TensorDataset(all_input_ids, all_segment_ids, all_input_mask,
                               all_label_ids)

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    #>>>>> validation
    if args.do_valid:
        valid_examples = processor.get_dev_examples(args.data_dir)
        valid_features = data_utils.convert_examples_to_features(
            valid_examples, label_list, args.max_seq_length, tokenizer, "asc")
        valid_all_input_ids = torch.tensor(
            [f.input_ids for f in valid_features], dtype=torch.long)
        valid_all_segment_ids = torch.tensor(
            [f.segment_ids for f in valid_features], dtype=torch.long)
        valid_all_input_mask = torch.tensor(
            [f.input_mask for f in valid_features], dtype=torch.long)
        valid_all_label_ids = torch.tensor(
            [f.label_id for f in valid_features], dtype=torch.long)
        valid_data = TensorDataset(valid_all_input_ids, valid_all_segment_ids,
                                   valid_all_input_mask, valid_all_label_ids)

        logger.info("***** Running validations *****")
        logger.info("  Num orig examples = %d", len(valid_examples))
        logger.info("  Num split examples = %d", len(valid_features))
        logger.info("  Batch size = %d", args.train_batch_size)

        valid_sampler = SequentialSampler(valid_data)
        valid_dataloader = DataLoader(valid_data,
                                      sampler=valid_sampler,
                                      batch_size=args.train_batch_size)

        best_valid_loss = float('inf')
        valid_losses = []
    #<<<<< end of validation declaration

    model = BertForABSA.from_pretrained(
        modelconfig.MODEL_ARCHIVE_MAP[args.bert_model],
        num_labels=len(label_list))
    model.cuda()
    # Prepare optimizer: skip frozen params and the unused pooler; no weight
    # decay on biases and LayerNorm parameters (standard BERT fine-tuning).
    param_optimizer = [(k, v) for k, v in model.named_parameters()
                       if v.requires_grad]
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    t_total = num_train_steps
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)

    global_step = 0
    model.train()

    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.cuda() for t in batch)
            input_ids, segment_ids, input_mask, label_ids = batch
            optimizer.zero_grad()
            loss = model(input_ids, segment_ids, input_mask, label_ids)
            loss.backward()

            # NOTE(review): this manually overrides the learning rate on top
            # of BertAdam's built-in warmup; kept as-is to preserve the
            # original training dynamics.
            lr_this_step = args.learning_rate * warmup_linear(
                global_step / t_total, args.warmup_proportion)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
            optimizer.step()
            global_step += 1
        print("training loss: ", loss.item(), epoch + 1)
        #>>>> perform validation at the end of each epoch.
        new_dirs = os.path.join(args.output_dir, str(epoch + 1))
        # exist_ok=True so re-running into an existing output directory does
        # not crash the way a bare os.mkdir would.
        os.makedirs(new_dirs, exist_ok=True)
        if args.do_valid:
            model.eval()
            with torch.no_grad():
                losses = []
                valid_size = 0
                for step, batch in enumerate(valid_dataloader):
                    batch = tuple(
                        t.cuda()
                        for t in batch)  # multi-gpu does scattering it-self
                    input_ids, segment_ids, input_mask, label_ids = batch
                    loss = model(input_ids, segment_ids, input_mask, label_ids)
                    # Weight each batch loss by its size so the epoch average
                    # stays exact even with a partial final batch.
                    losses.append(loss.data.item() * input_ids.size(0))
                    valid_size += input_ids.size(0)
                valid_loss = sum(losses) / valid_size
                logger.info("validation loss: %f, epoch: %d", valid_loss,
                            epoch + 1)
                valid_losses.append(valid_loss)
                # Persist a per-epoch snapshot only long enough to evaluate it.
                torch.save(model, os.path.join(new_dirs, "model.pt"))
                test(args, new_dirs, dev_as_test=True)
                if epoch == args.num_train_epochs - 1:
                    torch.save(model, os.path.join(args.output_dir,
                                                   "model.pt"))
                    test(args, args.output_dir, dev_as_test=False)
                os.remove(os.path.join(new_dirs, "model.pt"))
            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
            model.train()
    if args.do_valid:
        with open(os.path.join(args.output_dir, "valid.json"), "w") as fw:
            json.dump({"valid_losses": valid_losses}, fw)
    else:
        torch.save(model, os.path.join(args.output_dir, "model.pt"))
Пример #12
0
def train(args):
    """Fine-tune BERT for ABSA with an auxiliary domain-classification task.

    Alternates between a supervised ABSA step (optimizing the shared encoder
    and the task classifier) and a domain-classification step on a
    pre-generated dataset (optimizing the domain head while clearing any
    gradient it pushed into the shared encoder). Optionally validates at the
    end of each epoch, saving the best model and a ``valid.json`` loss log
    under ``args.output_dir``.
    """
    processor = data_utils.ABSAProcessor()
    label_list = processor.get_labels(args.task_type)

    tokenizer = ABSATokenizer.from_pretrained(
        modelconfig.MODEL_ARCHIVE_MAP[args.bert_model])
    train_examples = processor.get_train_examples(args.data_dir,
                                                  args.task_type)
    num_train_steps = int(
        math.ceil(len(train_examples) /
                  args.train_batch_size)) * args.num_train_epochs

    train_features = data_utils.convert_examples_to_features(
        train_examples, label_list, args.max_seq_length, tokenizer)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in train_features],
                                 dtype=torch.long)

    domain_dataset = PregeneratedDataset(epoch=0,
                                         training_path=args.domain_dataset,
                                         tokenizer=tokenizer,
                                         num_data_epochs=1)

    domain_train_sampler = RandomSampler(domain_dataset)
    domain_train_dataloader = DataLoader(domain_dataset,
                                         sampler=domain_train_sampler,
                                         batch_size=16)

    train_data = TensorDataset(all_input_ids, all_segment_ids, all_input_mask,
                               all_label_ids)

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    # >>>>> validation
    if args.do_valid:
        valid_examples = processor.get_dev_examples(args.data_dir,
                                                    args.task_type)
        valid_features = data_utils.convert_examples_to_features(
            valid_examples, label_list, args.max_seq_length, tokenizer)
        valid_all_input_ids = torch.tensor(
            [f.input_ids for f in valid_features], dtype=torch.long)
        valid_all_segment_ids = torch.tensor(
            [f.segment_ids for f in valid_features], dtype=torch.long)
        valid_all_input_mask = torch.tensor(
            [f.input_mask for f in valid_features], dtype=torch.long)
        valid_all_label_ids = torch.tensor(
            [f.label_id for f in valid_features], dtype=torch.long)
        valid_data = TensorDataset(valid_all_input_ids, valid_all_segment_ids,
                                   valid_all_input_mask, valid_all_label_ids)

        logger.info("***** Running validations *****")
        logger.info("  Num orig examples = %d", len(valid_examples))
        logger.info("  Num split examples = %d", len(valid_features))
        logger.info("  Batch size = %d", args.train_batch_size)

        valid_sampler = SequentialSampler(valid_data)
        valid_dataloader = DataLoader(valid_data,
                                      sampler=valid_sampler,
                                      batch_size=args.train_batch_size)

        best_valid_loss = float('inf')
        valid_losses = []

    # <<<<< end of validation declaration
    model = ABSABert.from_pretrained(
        modelconfig.MODEL_ARCHIVE_MAP[args.bert_model],
        num_labels=len(label_list))

    if args.features_model != 'none':
        # Warm-start the encoder from a fine-tuned checkpoint, dropping the
        # old classifier head (its label space may differ).
        state_dict = torch.load(args.features_model)
        del state_dict['classifier.weight']
        del state_dict['classifier.bias']
        model.load_state_dict(state_dict, strict=False)
        logger.info('load fine-tuned model from : {}'.format(
            args.features_model))

    model.cuda()

    # flag selects between three separate optimizers (encoder / task head /
    # domain head) and a single joint optimizer; currently always True.
    flag = True
    if flag:
        # bert-base
        shared_param_optimizer = [(k, v)
                                  for k, v in model.bert.named_parameters()
                                  if v.requires_grad]
        shared_param_optimizer = [
            n for n in shared_param_optimizer if 'pooler' not in n[0]
        ]
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        shared_optimizer_grouped_parameters = [{
            'params': [
                p for n, p in shared_param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params': [
                p for n, p in shared_param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]

        t_total = num_train_steps  # num_train_steps
        supervised_param_optimizer = model.classifier.parameters()

        domain_classifier_param_optimizer = model.domain_cls.parameters()

        shared_optimizer = BertAdam(shared_optimizer_grouped_parameters,
                                    lr=args.learning_rate,
                                    warmup=args.warmup_proportion,
                                    t_total=t_total)

        supervised_optimizer = BertAdam(supervised_param_optimizer,
                                        lr=args.learning_rate,
                                        warmup=args.warmup_proportion,
                                        t_total=t_total)

        # t_total=-1 disables the warmup/decay schedule for the domain head.
        domain_optimizer = BertAdam(domain_classifier_param_optimizer,
                                    lr=3e-5,
                                    warmup=args.warmup_proportion,
                                    t_total=-1)
    else:
        param_optimizer = [(k, v) for k, v in model.named_parameters()
                           if v.requires_grad]
        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        t_total = num_train_steps  # num_train_steps
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    global_step = 0
    model.train()

    train_steps = len(train_dataloader)
    total_domain_loss = 0
    for e_ in range(args.num_train_epochs):
        train_iter = iter(train_dataloader)
        domain_iter = iter(domain_train_dataloader)
        for step in range(train_steps):
            # next(iter) — the .next() method does not exist on Python 3
            # iterators and would raise AttributeError.
            batch = next(train_iter)
            batch = tuple(t.cuda() for t in batch)
            input_ids, segment_ids, input_mask, label_ids = batch  # all_input_ids, all_segment_ids, all_input_mask, all_label_ids, all_tag_ids
            loss, _ = model(input_ids,
                            token_type_ids=segment_ids,
                            attention_mask=input_mask,
                            labels=label_ids)

            loss.backward()
            if flag:
                shared_optimizer.step()
                shared_optimizer.zero_grad()
                supervised_optimizer.step()
                supervised_optimizer.zero_grad()
            else:
                optimizer.step()
                optimizer.zero_grad()

            dirt_n = 1  # 1 or 2
            for _ in range(dirt_n):
                try:
                    batch = next(domain_iter)
                except StopIteration:
                    # Domain data is smaller than the task data; restart it.
                    domain_iter = iter(domain_train_dataloader)
                    batch = next(domain_iter)
                batch = tuple(t.cuda() for t in batch)
                # assumes PregeneratedDataset batches carry input_ids at [0],
                # an attention mask at [4] and the domain label last — TODO
                # confirm against PregeneratedDataset.__getitem__.
                input_ids, input_mask, domain_labels = batch[0], batch[
                    4], batch[-1]
                d_loss = model(input_ids,
                               attention_mask=input_mask,
                               domain_label=domain_labels)
                d_loss.backward()
                total_domain_loss += d_loss.item()

                domain_optimizer.step()
                domain_optimizer.zero_grad()
                shared_optimizer.zero_grad(
                )  # make sure to clear the gradients of encoder.

            if step % 50 == 0:
                logger.info('in step {} domain loss: {}'.format(
                    dirt_n * (e_ * train_steps + step + 1), total_domain_loss /
                    (dirt_n * (e_ * train_steps + step + 1))))

            global_step += 1
            # >>>> perform validation at the end of each epoch .

        if args.do_valid:
            model.eval()
            with torch.no_grad():
                losses = []
                valid_size = 0
                for step, batch in enumerate(valid_dataloader):
                    batch = tuple(
                        t.cuda()
                        for t in batch)  # multi-gpu does scattering it-self
                    input_ids, segment_ids, input_mask, label_ids = batch
                    loss, _ = model(input_ids,
                                    token_type_ids=segment_ids,
                                    attention_mask=input_mask,
                                    labels=label_ids)
                    loss = torch.mean(loss)
                    losses.append(loss.data.item() * input_ids.size(0))
                    valid_size += input_ids.size(0)
                valid_loss = sum(losses) / valid_size
                logger.info("validation loss: %f", valid_loss)
                valid_losses.append(valid_loss)

            if valid_loss < best_valid_loss:
                torch.save(model.state_dict(),
                           os.path.join(args.output_dir, "model.pt"))
                best_valid_loss = valid_loss
            model.train()

    if args.do_valid:
        with open(os.path.join(args.output_dir, "valid.json"), "w") as fw:
            json.dump({"valid_losses": valid_losses}, fw)
    else:
        torch.save(model.state_dict(), os.path.join(args.output_dir,
                                                    "model.pt"))