Example #1
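These examples are shown without their import preambles. A plausible preamble for Example #1 follows: the project-local names (boolean_string, NerProcessor, get_Dataset, get_Pred_Dataset, evaluate, BERT_BiLSTM_CRF) are assumed to live in sibling modules, so the commented import paths are placeholders, and boolean_string is sketched from its usage.

import argparse
import logging
import os
import pickle

import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm, trange
from tensorboardX import SummaryWriter  # matches the `logdir=` keyword used below
from transformers import AdamW, BertConfig, BertTokenizer, get_linear_schedule_with_warmup

# placeholders; adjust to the actual project layout:
# from processor import NerProcessor, get_Dataset, get_Pred_Dataset
# from model import BERT_BiLSTM_CRF
# from metrics import evaluate

logger = logging.getLogger(__name__)


def boolean_string(s):
    # argparse helper reconstructed from usage: accepts the literal strings "True"/"False"
    if s not in {"True", "False"}:
        raise ValueError("Not a valid boolean string: " + s)
    return s == "True"
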
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_file", default=None, type=str)
    parser.add_argument("--eval_file", default=None, type=str)
    parser.add_argument("--test_file", default=None, type=str)
    parser.add_argument("--inference_file", default=None, type=str)
    parser.add_argument("--model_name_or_path", default=None, type=str)
    parser.add_argument("--output_dir", default=None, type=str)

    ## other parameters
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    
    parser.add_argument("--max_seq_length", default=256, type=int)
    parser.add_argument("--do_train", default=False, type=boolean_string)
    parser.add_argument("--do_eval", default=False, type=boolean_string)
    parser.add_argument("--do_test", default=False, type=boolean_string)
    parser.add_argument("--resume", default=False, type=boolean_string)
    parser.add_argument("--do_inference", default=False, type=boolean_string)
    parser.add_argument("--train_batch_size", default=8, type=int)
    parser.add_argument("--eval_batch_size", default=8, type=int)
    parser.add_argument("--learning_rate", default=3e-5, type=float)
    parser.add_argument("--num_train_epochs", default=10, type=float)
    parser.add_argument("--warmup_proprotion", default=0.1, type=float)
    parser.add_argument("--use_weight", default=1, type=int)
    parser.add_argument("--local_rank", type=int, default=-1)
    parser.add_argument("--seed", type=int, default=2019)
    parser.add_argument("--fp16", default=False)
    parser.add_argument("--loss_scale", type=float, default=0)
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1)
    parser.add_argument("--warmup_steps", default=0, type=int)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)
    parser.add_argument("--max_steps", default=-1, type=int)
    parser.add_argument("--do_lower_case", action='store_true')
    parser.add_argument("--logging_steps", default=500, type=int)
    parser.add_argument("--clean", default=False, type=boolean_string, help="clean the output dir")

    parser.add_argument("--need_birnn", default=False, type=boolean_string)
    parser.add_argument("--rnn_dim", default=128, type=int)

    args = parser.parse_args()

    device = torch.device("cuda")
    # os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_
    args.device = device
    n_gpu = torch.cuda.device_count()

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO)

    logger.info(f"device: {device} n_gpu: {n_gpu}")

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))

    # now_time = datetime.datetime.now().strftime('%Y-%m-%d_%H')
    # tmp_dir = args.output_dir + '/' +str(now_time) + '_ernie'
    # if not os.path.exists(tmp_dir):
    #     os.makedirs(tmp_dir)
    # args.output_dir = tmp_dir
    if args.clean and args.do_train and not args.resume:
        # logger.info("清理")
        if os.path.exists(args.output_dir):
            def del_file(path):
                ls = os.listdir(path)
                for i in ls:
                    c_path = os.path.join(path, i)
                    print(c_path)
                    if os.path.isdir(c_path):
                        del_file(c_path)
                        os.rmdir(c_path)
                    else:
                        os.remove(c_path)
            try:
                del_file(args.output_dir)
            except Exception as e:
                print(e)
                print('please remove the files in the output dir and data.conf')
                exit(-1)
    
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.resume:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    if not os.path.exists(os.path.join(args.output_dir, "eval")):
        os.makedirs(os.path.join(args.output_dir, "eval"))
    
    writer = SummaryWriter(logdir=os.path.join(args.output_dir, "eval"), comment="Linear")

    processor = NerProcessor()
    label_list = processor.get_labels(args)
    num_labels = len(label_list)
    args.label_list = label_list

    if os.path.exists(os.path.join(args.output_dir, "label2id.pkl")):
        with open(os.path.join(args.output_dir, "label2id.pkl"), "rb") as f:
            label2id = pickle.load(f)
    else:
        label2id = {l:i for i,l in enumerate(label_list)}
        with open(os.path.join(args.output_dir, "label2id.pkl"), "wb") as f:
            pickle.dump(label2id, f)      
    
    id2label = {value: key for key, value in label2id.items()}

    # Prepare optimizer and schedule (linear warmup and decay)

    if args.do_train:
        if args.resume:
            tokenizer = BertTokenizer.from_pretrained(args.output_dir, 
                        do_lower_case=args.do_lower_case)
            config = BertConfig.from_pretrained(args.output_dir, 
                    num_labels=num_labels)
            model = BERT_BiLSTM_CRF.from_pretrained(args.output_dir, config=config, 
                    need_birnn=args.need_birnn, rnn_dim=args.rnn_dim)            
        else:
            tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, 
                        do_lower_case=args.do_lower_case)
            config = BertConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, 
                    num_labels=num_labels)
            model = BERT_BiLSTM_CRF.from_pretrained(args.model_name_or_path, config=config,
                    cache_dir=args.cache_dir if args.cache_dir else None,
                    need_birnn=args.need_birnn, rnn_dim=args.rnn_dim)

        model.to(device)
        
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)

        train_examples, train_features, train_data = get_Dataset(args, processor, tokenizer, mode="train")
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        if args.do_eval:
            eval_examples, eval_features, eval_data = get_Dataset(args, processor, tokenizer, mode="eval")
      
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
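            # e.g. 1,000 train batches, accumulation 1, 10 epochs -> t_total = 10,000 updates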

        # standard BERT fine-tuning practice: no weight decay on biases or LayerNorm weights
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
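        # the schedule warms the lr up linearly over warmup_steps, then decays it linearly to 0 at t_total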

        # Train!
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_data))
        logger.info("  Num Epochs = %d", args.num_train_epochs)
        logger.info("  Total optimization steps = %d", t_total)

        model.train()
        global_step = 0
        tr_loss, logging_loss = 0.0, 0.0
        best_f1 = 0.0
        for ep in trange(int(args.num_train_epochs), desc="Epoch"):
            model.train()
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                outputs = model(input_ids, label_ids, segment_ids, input_mask)
                loss = outputs  # the model's forward returns the (CRF) loss directly

                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
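                    # scaling keeps the accumulated gradient equal to that of one full batch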

                loss.backward()
                tr_loss += loss.item()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1
                    
                    if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                        tr_loss_avg = (tr_loss - logging_loss) / args.logging_steps  # average loss over the last logging interval
                        writer.add_scalar("Train/loss", tr_loss_avg, global_step)
                        logging_loss = tr_loss
            
            if args.do_eval:
                all_ori_tokens_eval = [f.ori_tokens for f in eval_features]
                overall, by_type = evaluate(args, eval_data, model, id2label, all_ori_tokens_eval)
                
                # add eval result to tensorboard
                f1_score = overall.fscore
                writer.add_scalar("Eval/precision", overall.prec, ep)
                writer.add_scalar("Eval/recall", overall.rec, ep)
                writer.add_scalar("Eval/f1_score", overall.fscore, ep)
                
                # save the best performs model
                if f1_score >= best_f1:
                    logger.info(f"----------the best f1 is {f1_score}---------")
                    best_f1 = f1_score
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(args.output_dir)
                    tokenizer.save_pretrained(args.output_dir)

                    # Good practice: save your training arguments together with the trained model
                    torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

            # logger.info(f'epoch {ep}, train loss: {tr_loss}')
        # writer.add_graph(model)
        writer.close()

        # model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        # model_to_save.save_pretrained(args.output_dir)
        # tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        # torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))


    if args.do_test:
        # model = BertForTokenClassification.from_pretrained(args.output_dir)
        # model.to(device)
        label_map = {i : label for i, label in enumerate(label_list)}

        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        #args = torch.load(os.path.join(args.output_dir, 'training_args.bin'))
        model = BERT_BiLSTM_CRF.from_pretrained(args.output_dir, need_birnn=args.need_birnn, rnn_dim=args.rnn_dim)
        model.to(device)

        test_examples, test_features, test_data = get_Pred_Dataset(args, processor, tokenizer, mode="test")

        logger.info("***** Running test *****")
        logger.info(f" Num examples = {len(test_examples)}")
        logger.info(f" Batch size = {args.eval_batch_size}")

        all_ori_tokens = [f.ori_tokens for f in test_features]
        all_ori_labels = [e.label.split(" ") for e in test_examples]
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size)
        model.eval()

        pred_labels = []
        
        for b_i, (input_ids, input_mask, segment_ids, label_ids) in enumerate(tqdm(test_dataloader, desc="Predicting")):
            
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model.predict(input_ids, segment_ids, input_mask)
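            # despite the name, `logits` appears to hold CRF-decoded label-id sequences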
            # logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
            # logits = logits.detach().cpu().numpy()

            for l in logits:
                pred_labels.append([id2label[idx] for idx in l])

        assert len(pred_labels) == len(all_ori_tokens) == len(all_ori_labels)
        print(len(pred_labels))
        with open(os.path.join(args.output_dir, "token_labels_.txt"), "w", encoding="utf-8") as f:
            for ori_tokens, ori_labels,prel in zip(all_ori_tokens, all_ori_labels, pred_labels):
                for ot,ol,pl in zip(ori_tokens, ori_labels, prel):
                    if ot in ["[CLS]", "[SEP]"]:
                        continue
                    else:
                        f.write(f"{ot} {ol} {pl}\n")
                f.write("\n")
    if args.do_inference:
        label_map = {i : label for i, label in enumerate(label_list)}

        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        #args = torch.load(os.path.join(args.output_dir, 'training_args.bin'))
        model = BERT_BiLSTM_CRF.from_pretrained(args.output_dir, need_birnn=args.need_birnn, rnn_dim=args.rnn_dim)
        model.to(device)  
        test_examples, test_features, test_data = get_Pred_Dataset(args, processor, tokenizer, mode="inference")

        logger.info("***** Running test *****")
        logger.info(f" Num examples = {len(test_examples)}")
        logger.info(f" Batch size = {args.eval_batch_size}")

        all_ori_tokens = [f.ori_tokens for f in test_features]
        all_ori_labels = [e.label.split(" ") for e in test_examples]
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size)
        model.eval()

        pred_labels = []
        
        for b_i, (input_ids, input_mask, segment_ids, label_ids) in enumerate(tqdm(test_dataloader, desc="Predicting")):
            
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model.predict(input_ids, segment_ids, input_mask)
            # logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
            # logits = logits.detach().cpu().numpy()

            for l in logits:
                pred_labels.append([id2label[idx] for idx in l])

        assert len(pred_labels) == len(all_ori_tokens) == len(all_ori_labels)
        print(len(pred_labels))
        # write to a separate file so the test output above is not overwritten
        with open(os.path.join(args.output_dir, "token_labels_inference.txt"), "w", encoding="utf-8") as f:
            for ori_tokens, ori_labels, prel in zip(all_ori_tokens, all_ori_labels, pred_labels):
                for ot, ol, pl in zip(ori_tokens, ori_labels, prel):
                    if ot in ["[CLS]", "[SEP]"]:
                        continue
                    f.write(f"{ot} {ol} {pl}\n")
                f.write("\n")
Example #2
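A plausible preamble for Example #2, under the same caveats as Example #1; set_seed, ATSAProcessor, get_Dataset, evaluate, predict, and RobertaForSequenceClassification_MERGE are project-local, and boolean_string is as sketched above.

import argparse
import os

import pandas as pd
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, AutoConfig, AutoTokenizer, get_linear_schedule_with_warmup

# placeholders for the project-local pieces:
# from utils import boolean_string, set_seed, predict, evaluate
# from processor import ATSAProcessor, get_Dataset
# from model import RobertaForSequenceClassification_MERGE
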
def main():
    '''
    Parameters
    '''
    parser = argparse.ArgumentParser()

    parser.add_argument("--train_file", default="./data/train.pkl", type=str)
    parser.add_argument("--eval_file", default="./data/dev.pkl", type=str)
    parser.add_argument("--test_file", default="./data/test.pkl", type=str)

    parser.add_argument("--model_type", default="roberta-large", type=str)
    parser.add_argument("--model_name_or_path",
                        default="../language_model/roberta-large/",
                        type=str)
    parser.add_argument("--do_lower_case", default=True, type=boolean_string)
    parser.add_argument("--output_dir",
                        default="./state_models/roberta-large/",
                        type=str)

    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3")

    parser.add_argument("--do_train", default=False, type=boolean_string)
    parser.add_argument("--do_eval", default=False, type=boolean_string)
    parser.add_argument("--do_test", default=True, type=boolean_string)

    parser.add_argument("--seed", type=int, default=2020)
    parser.add_argument("--max_seq_length", default=64, type=int)
    parser.add_argument("--train_batch_size", default=8, type=int)
    parser.add_argument("--eval_batch_size", default=8, type=int)
    parser.add_argument("--num_train_epochs", default=5, type=float)
    parser.add_argument("--no_cuda",
                        default=False,
                        type=boolean_string,
                        help="Whether not to use CUDA when available")

    parser.add_argument("--save_check_point", default=4600, type=int)
    parser.add_argument("--eval_steps", default=100, type=int)
    parser.add_argument("--skip_eval_rate", default=0.30, type=float)
    parser.add_argument("--logging_steps", default=200, type=int)
    parser.add_argument("--warmup_steps", default=0, type=int)
    parser.add_argument("--warmup_proprotion", default=0.1, type=float)
    parser.add_argument("--max_steps", default=-1, type=int)
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1)

    parser.add_argument("--learning_rate", default=1e-5, type=float)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)
    parser.add_argument("--overwrite_output_dir",
                        default=True,
                        type=boolean_string)
    args = parser.parse_args()

    # Setup CUDA, GPU
    if args.no_cuda:
        device = torch.device("cpu")
        args.n_gpu = 0
    else:
        device = torch.device("cuda")
        args.n_gpu = torch.cuda.device_count()
    args.device = device
    print("device: {0}, n_gpu: {1}".format(device, args.n_gpu))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # Setup path to save model
    if args.overwrite_output_dir and args.do_train:
        if os.path.exists(args.output_dir):

            def del_file(path):
                ls = os.listdir(path)
                for i in ls:
                    c_path = os.path.join(path, i)
                    print("Clean the output path: {}".format(c_path))
                    if os.path.isdir(c_path):
                        del_file(c_path)
                        os.rmdir(c_path)
                    else:
                        os.remove(c_path)

            try:
                del_file(args.output_dir)
            except Exception as e:
                print(e)
                print('please remove the files in the output dir and data.conf')
                exit(-1)

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Setup seed
    set_seed(args)

    print("Training/evaluation parameters %s", args)
    '''
    Data
    '''
    processor = ATSAProcessor()
    label_list = processor.get_labels()
    args.num_labels = len(label_list)
    args.label_list = label_list
    '''
    Train
    '''
    if args.do_train:
        # --------------------  loading model --------------------

        config = AutoConfig.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            num_labels=args.num_labels,
            output_hidden_states=True)
        tokenizer = AutoTokenizer.from_pretrained(
            args.tokenizer_name
            if args.tokenizer_name else args.model_name_or_path,
            do_lower_case=args.do_lower_case)
        model = RobertaForSequenceClassification_MERGE.from_pretrained(
            args.model_name_or_path, config=config)
        model.to(device)

        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # -------------------- loading data --------------------

        train_examples, train_features, train_data = get_Dataset(args,
                                                                 processor,
                                                                 tokenizer,
                                                                 mode="train")
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        if args.do_eval:
            eval_examples, eval_features, eval_data = get_Dataset(args,
                                                                  processor,
                                                                  tokenizer,
                                                                  mode="eval")

        # t_total: total number of optimizer updates (one update per effective batch)
        # len(train_dataloader): number of mini-batches in the training set
        # len(train_dataloader) // args.gradient_accumulation_steps: updates per epoch
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // (
                len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(
                train_dataloader
            ) // args.gradient_accumulation_steps * args.num_train_epochs

        # -------------------- optimizer & schedule (linear warmup and decay) --------------------

        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_steps,
            num_training_steps=t_total)

        # -------------------- Train --------------------

        print("\n******************** Running Train ********************")
        print("  Num examples = {}".format(len(train_data)))
        print("  Num Epochs = {}".format(args.num_train_epochs))
        print("  Total optimization steps = {}".format(t_total))

        global_step = 0
        best_f1 = 0.0
        tr_loss, logging_loss, tr_loss_avg = 0.0, 0.0, 0.0  # tr_loss_avg is also passed to evaluate below
        model.train()
        model.zero_grad()
        for ep in range(int(args.num_train_epochs)):
            for step, batch in enumerate(train_dataloader):
                model.train()
                batch = tuple(t.to(device) for t in batch)
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'token_type_ids': batch[2],
                    'labels': batch[3]
                }
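                # RoBERTa has no token type (segment) embeddings, so drop the ids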
                if "roberta" in args.model_type:
                    inputs["token_type_ids"] = None
                outputs = model(**inputs)
                # model outputs are always a tuple in transformers (see doc)
                loss = outputs[0]

                if args.n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu parallel training
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()
                tr_loss += loss.item()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    # log only on real optimizer steps
                    if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                        # average loss over the last logging interval
                        tr_loss_avg = (tr_loss - logging_loss) / args.logging_steps
                        logging_loss = tr_loss
                        print("  epoch: {:d}, global_step: {:d}, train loss: {:.4f}".format(
                            ep, global_step, tr_loss_avg))

                # save model trained by train data & eval data
                if args.save_check_point == global_step:
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    model_to_save.save_pretrained(args.output_dir)
                    tokenizer.save_pretrained(args.output_dir)

                    # Good practice: save your training arguments together with the trained model
                    torch.save(
                        args, os.path.join(args.output_dir,
                                           'training_args.bin'))
                '''
                Eval
                '''
                if args.do_eval and global_step > args.skip_eval_rate * t_total and global_step % args.eval_steps == 0:
                    eval_acc, eval_f1 = evaluate(args, model, eval_data,
                                                 global_step, tr_loss_avg)

                    # save the best performs model
                    if eval_f1 > best_f1:
                        print(
                            "**************** the best f1 is {:.4f} ****************\n"
                            .format(eval_f1))
                        best_f1 = eval_f1
                        model_to_save = model.module if hasattr(
                            model, 'module') else model
                        model_to_save.save_pretrained(args.output_dir)
                        tokenizer.save_pretrained(args.output_dir)

                        # Good practice: save your training arguments together with the trained model
                        torch.save(
                            args,
                            os.path.join(args.output_dir, 'training_args.bin'))
        '''
        Eval at the end of the train
        '''
        if args.do_eval and global_step > args.skip_eval_rate * t_total:
            eval_acc, eval_f1 = evaluate(args, model, eval_data, global_step, tr_loss_avg)

            # save the best performs model
            if eval_f1 > best_f1:
                print(
                    "**************** the best f1 is {:.4f} ****************\n"
                    .format(eval_f1))
                best_f1 = eval_f1
                model_to_save = model.module if hasattr(model,
                                                        'module') else model
                model_to_save.save_pretrained(args.output_dir)
                tokenizer.save_pretrained(args.output_dir)

                # Good practice: save your training arguments together with the trained model
                torch.save(args,
                           os.path.join(args.output_dir, 'training_args.bin'))
    '''
    Test
    '''
    if args.do_test:
        args = torch.load(os.path.join(args.output_dir, 'training_args.bin'))
        config = AutoConfig.from_pretrained(args.output_dir,
                                            num_labels=args.num_labels,
                                            output_hidden_states=True)
        tokenizer = AutoTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        model = RobertaForSequenceClassification_MERGE.from_pretrained(
            args.output_dir, config=config)
        model.to(device)

        test_examples, test_features, test_data = get_Dataset(args,
                                                              processor,
                                                              tokenizer,
                                                              mode="test")
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=args.eval_batch_size)

        print("\n******************** Running test ********************")
        print("  Num examples = {:d}".format(len(test_examples)))
        print("  Batch size = {:d}".format(args.eval_batch_size))

        logits_res = None  # collect logits for ensembling / voting
        pred_res = []
        model.eval()
        for _, batch in enumerate(test_dataloader):
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2]
            }
            if "roberta" in args.model_type:
                inputs["token_type_ids"] = None

            with torch.no_grad():
                outputs = model(**inputs)
                logits = outputs[0]

                # collect logits output
                if logits_res is None:
                    logits_res = logits
                else:
                    logits_res = torch.cat((logits_res, logits), dim=0)

                # collect label output
                pred_label = predict(logits, args.label_list)  # at test time the logits are outputs[0]
                pred_res.extend(pred_label.tolist())

        # pred_res = np.array(pred_res)
        # ground_truth = np.array(pd.read_pickle("./data/dev.pkl")["label"].tolist())
        # ans = f1_score(y_true=ground_truth, y_pred=pred_res, labels=[0, 1, 2], average="macro")
        # print(ans)

        logits_res = logits_res.detach().cpu().numpy()
        label_0 = logits_res[:, 0].tolist()
        label_1 = logits_res[:, 1].tolist()
        label_2 = logits_res[:, 2].tolist()
        logits_df = pd.DataFrame({
            "label_0": label_0,
            "label_1": label_1,
            "label_2": label_2
        })
        logits_df.to_csv(args.model_type + "_logits_test.csv")
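
Dumping the raw per-class logits to <model_type>_logits_test.csv keeps enough information to ensemble several fine-tuned variants afterwards by averaging or voting over their logits, which is why they are collected alongside the predicted labels.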
Example #3
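A plausible preamble for Example #3, again with project-local names as placeholders; BertAdam here is the optimizer from the older pytorch_pretrained_bert package, which reproduces the original TensorFlow BERT optimizer.

import argparse
import logging
import os
import random

import torch
from torch.utils.data import DataLoader, RandomSampler
from transformers import BertConfig, BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam

# placeholders for the project-local pieces:
# from utils import boolean_string, set_seed
# from processor import NerProcessor, get_Dataset
# from model import BERT_BiLSTM_CRF
# from metrics import evaluate

logger = logging.getLogger(__name__)
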
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--dir_paths", default=None, type=str, nargs="+")
    parser.add_argument("--model_name_or_path", default="biobert_v1.1_pubmed", type=str)

    parser.add_argument("--max_seq_length", default=128, type=int)
    parser.add_argument("--do_train", default=True, type=boolean_string)
    parser.add_argument("--do_eval", default=True, type=boolean_string)
    parser.add_argument("--from_tf", default=False, type=boolean_string)

    parser.add_argument("--train_batch_size", default=32, type=int)
    parser.add_argument("--eval_batch_size", default=32, type=int)

    parser.add_argument("--bert_learning_rate", default=5e-5, type=float)
    parser.add_argument("--not_bert_learning_rate", default=5e-4, type=float)

    parser.add_argument("--num_train_epochs", default=10, type=float)
    parser.add_argument("--warmup_proprotion", default=0.2, type=float)

    parser.add_argument("--seed", type=int, default=2020)
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)
    parser.add_argument("--max_steps", default=-1, type=int)
    parser.add_argument("--do_lower_case", action='store_true')

    # BiLSTM stacked on top of BERT
    parser.add_argument("--need_birnn", default=False, type=boolean_string)
    parser.add_argument("--rnn_dim", default=128, type=int)

    # add a character-level CNN
    parser.add_argument("--need_charcnn", default=False, type=boolean_string)
    parser.add_argument("--share_cnn", default=True, type=boolean_string)
    parser.add_argument("--char_embed", default=50, type=int)
    parser.add_argument("--char_out_dim", default=300, type=int)

    # add a CNN
    parser.add_argument("--need_cnn", default=False, type=boolean_string)
    parser.add_argument("--cnn_out_dim", default=300, type=int)

    # add SAC
    parser.add_argument("--need_sac", action="store_true")
    parser.add_argument("--tag_num", default=2, type=int)
    parser.add_argument("--sac_factor", default=100)

    # debug mode
    parser.add_argument("--debug", action='store_true')

    args = parser.parse_args()

    logger.info("********%s*********", "参数设置")
    logger.info("args info:")
    logger.info(args.__dict__)

    device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
    # os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_
    args.device = device
    args.n_gpu = torch.cuda.device_count() if torch.cuda.is_available() else 0
    logger.info(f"device: {device} n_gpu: {args.n_gpu}")
    set_seed(args.seed)
    logger.info(f"seed: {args.seed}")

    logger.info("********%s*********", "读取数据")
    data_paths = args.dir_paths
    task_names = list(map(lambda x: x.split(os.path.sep)[-1], data_paths))
    logger.info("task names: %s", str(task_names))
    logger.info("task num: %d", len(task_names))

    processor = NerProcessor(args.debug)
    tasks = [{} for i in range(len(task_names))]
    for i in range(len(task_names)):
        tasks[i]["task_id"] = i
        tasks[i]["task_name"] = task_names[i]
        tasks[i]["train_file"] = os.path.join(data_paths[i], "train_devel.tsv")
        tasks[i]["eval_file"] = os.path.join(data_paths[i], "test.tsv")
        tasks[i]["label_list"] = processor.get_labels(data_paths[i])
        tasks[i]["label2id"] = {l: i for i, l in enumerate(tasks[i]["label_list"])}
        tasks[i]["id2label"] = {value: key for key, value in tasks[i]["label2id"].items()}

    for i in range(len(tasks)):
        logger.info("tasks info %s", str(tasks[i]))

    logger.info("********%s*********", "模型加载")
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case)
    if args.need_charcnn:
        char2id = {'#': 0, 'I': 1, 'm': 2, 'u': 3, 'n': 4, 'o': 5, 'h': 6, 'i': 7, 's': 8, 't': 9, 'c': 10, 'e': 11,
                   'a': 12, 'l': 13, 'g': 14, 'w': 15, 'p': 16, 'v': 17, 'f': 18, 'r': 19, 'S': 20, '-': 21, '1': 22,
                   '0': 23, '9': 24, 'd': 25, ',': 26, 'H': 27, 'M': 28, 'B': 29, '4': 30, '5': 31, '(': 32, '%': 33,
                   ')': 34, 'y': 35, 'k': 36, 'x': 37, 'b': 38, '.': 39, 'C': 40, 'E': 41, '8': 42, '6': 43, 'V': 44,
                   'j': 45, '2': 46, 'R': 47, 'N': 48, 'A': 49, 'D': 50, 'z': 51, 'O': 52, '<': 53, 'q': 54, 'X': 55,
                   'F': 56, '3': 57, 'G': 58, 'P': 59, ':': 60, '?': 61, 'K': 62, 'W': 63, 'T': 64, "'": 65, 'J': 66,
                   'L': 67, 'U': 68, '+': 69, ';': 70, '7': 71, '/': 72, 'Z': 73, '=': 74, 'Y': 75, 'Q': 76, '[': 77,
                   '"': 78, '>': 79, '*': 80, ']': 81, '&': 82, '$': 83, '_': 84}
    else:
        char2id = None

    config = BertConfig.from_pretrained(args.model_name_or_path)
    model = BERT_BiLSTM_CRF.from_pretrained(args.model_name_or_path, config=config,
                                            char_vocab_size=len(char2id) if char2id is not None else 0,
                                            tag_num=args.tag_num,
                                            char_embedding_dim=args.char_embed,
                                            char_out_dim=args.char_out_dim,
                                            task_infos=tasks,
                                            need_cnn=args.need_cnn,
                                            cnn_out_dim=args.cnn_out_dim,
                                            need_sac=args.need_sac,
                                            sac_factor=args.sac_factor,
                                            need_birnn=args.need_birnn,
                                            need_charcnn=args.need_charcnn,
                                            share_cnn=args.share_cnn,
                                            rnn_dim=args.rnn_dim,
                                            from_tf=args.from_tf,
                                            device=device)

    if args.do_train:
        model.to(device)

        logger.info("********%s*********", "开始读取训练集数据")
        for i in range(len(tasks)):
            tasks[i]["train_examples"], tasks[i]["train_features"], tasks[i]["train_data"] = \
                get_Dataset(args, tasks[i], processor, tokenizer, char2id, mode="train")
            train_sampler = RandomSampler(tasks[i]["train_data"])
            tasks[i]["train_dataloader"] = DataLoader(tasks[i]["train_data"], sampler=train_sampler,
                                                      batch_size=args.train_batch_size)
            tasks[i]["train_ori_words"] = [f.ori_words for f in tasks[i]["train_features"]]
            # print(tasks[i]["train_ori_words"])
        if args.do_eval:
            logger.info("********%s*********", "开始读取验证集数据")
            for i in range(len(tasks)):
                tasks[i]["eval_examples"], tasks[i]["eval_features"], tasks[i]["eval_data"] = get_Dataset(args,
                                                                                                          tasks[i],
                                                                                                          processor,
                                                                                                          tokenizer,
                                                                                                          char2id,
                                                                                                          mode="eval")
                tasks[i]["eval_ori_words"] = [f.ori_words for f in tasks[i]["eval_features"]]

        # t_total = num_train_epochs * len(train_dataloader) / gradient_accumulation_steps
        # t_total is the total number of optimizer updates
        batch_num = sum(list(map(lambda task: len(task["train_dataloader"]), tasks)))
        t_total = args.num_train_epochs * batch_num // args.gradient_accumulation_steps

        no_decay = ['bias', 'LayerNorm.weight']
        # the original setting:
        # optimizer_grouped_parameters = [
        #     {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        #     {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        # ]

        # use different learning rates for the BERT and non-BERT parameters
        optimizer_grouped_parameters = [
            # in bert
            {'params': [p for n, p in model.named_parameters() if "bert" in n and not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01, "lr": args.bert_learning_rate},
            {'params': [p for n, p in model.named_parameters() if "bert" in n and any(nd in n for nd in no_decay)],
             'weight_decay': 0.0, "lr": args.bert_learning_rate},
            {'params': [p for n, p in model.named_parameters() if "bert" not in n], 'weight_decay': 0.0,
             "lr": args.not_bert_learning_rate}
        ]

        # switch to BertAdam, which mimics the optimizer of the original TensorFlow BERT as closely as possible
        optimizer = BertAdam(optimizer_grouped_parameters, warmup=args.warmup_proprotion, t_total=t_total)

        # start training
        logger.info("********%s*********", "starting training")
        logger.info("# of tasks: %d", len(tasks))
        logger.info(" Num Epochs = %d", args.num_train_epochs)
        logger.info(" Total Optimization Steps = %d", t_total)
        for task in tasks:
            logger.info(" Task name: %s Num Examples %d", task["task_name"], len(task["train_dataloader"]))

        step = 0
        total_loss = 0
        update_step = 0

        for ep in range(1, int(args.num_train_epochs) + 1):
            model.train()
            task_indexs = list(range(len(tasks)))
            iter_train_dataloaders = list(map(lambda x: iter(x["train_dataloader"]), tasks))
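            # multi-task schedule: each step samples a task uniformly at random and takes its
            # next batch; a task leaves the pool once its dataloader is exhausted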
            while task_indexs:
                task_id = random.choice(task_indexs)
                # keep task_id a plain int for indexing; pass the model a tensor copy
                task_id_tensor = torch.tensor(task_id, dtype=torch.long).to(device)
                batch = next(iter_train_dataloaders[task_id], None)
                if batch is None:
                    task_indexs.remove(task_id)
                    continue
                batch = tuple(t.to(device) for t in batch)

                if args.need_charcnn:
                    input_word_ids, input_mask, label_ids, label_mask, char_ids = batch
                else:
                    input_word_ids, input_mask, label_ids, label_mask = batch
                    char_ids = None

                if args.need_sac:
                    # mask out "O" and padding positions so the SAC term only sees entity tags
                    O_label = label_ids == tasks[task_id]["label2id"]["O"]
                    pad_label = label_ids == 0
                    tag_mask = 1 - (O_label + pad_label).long().to(device)
                else:
                    tag_mask = None

                loss = model(task_id_tensor, input_word_ids, input_mask, label_ids, char_ids, sac_mask=tag_mask)

                loss.backward()
                total_loss += loss.item()

                step += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    model.zero_grad()
                    update_step = update_step + 1
                    if update_step % 100 == 0:
                        logger.info("in ep %d, choose task: %d, loss %f", ep, task_id, loss)
            if args.do_eval:
                for task in tasks:
                    logger.info("Evalating task %s, Train set", task["task_name"])
                    train_filename, test_filename = None, None
                    if ep == args.num_train_epochs:
                        train_filename = task["task_name"] + ".train.output.txt"
                        test_filename = task["task_name"] + ".test.output.txt"
                    # evaluate(args, task["task_id"], task["train_data"], model, task["id2label"], task["train_ori_words"], file_name=train_filename)
                    logger.info("Evalating task %s, Eval set", task["task_name"])
                    evaluate(args, task["task_id"], task["eval_data"], model, task["id2label"], task["eval_ori_words"], file_name=test_filename)