示例#1
0
class Model(nn.Module):
    """BERT classifier built on a learned mix of per-layer [CLS] vectors.

    The backbone is a ``BertForSequenceClassification``; its own
    classification head is never used (only its hidden states are read).
    The vector at ``CLS_POSITION`` is taken from every encoder layer, the
    layers are combined with softmax-normalised learnable weights, and the
    result is fed to ``self.head`` (optionally through a
    ``MultiDropoutHead``).
    """

    # Sequence position of the token whose hidden state is pooled.
    CLS_POSITION = 0

    def __init__(self, cfg=None, mdo_prob=0., mdo_num=1, num_classes=1, path=None):
        """Build the backbone, the linear head and the layer-mixing weights.

        Args:
            cfg: backbone configuration; required when ``path`` is None.
            mdo_prob: dropout probability handed to ``MultiDropoutHead``;
                the multi-dropout head is only created when this is > 0.
            mdo_num: second argument handed to ``MultiDropoutHead``.
            num_classes: output size of the linear head.
            path: optional location passed to ``from_pretrained``.

        Raises:
            AssertionError: if neither ``path`` nor ``cfg`` is given.
        """
        # NOTE: the backbone ships with its own classification head, which
        # is unnecessary here -- ``self.head`` below is the one that is used.
        super().__init__()
        if path is not None:
            self.backbone = BertForSequenceClassification.from_pretrained(path)
        else:
            assert cfg is not None, 'Config should be provided if no pretrained path was specified.'
            self.backbone = BertForSequenceClassification(cfg)
        # ``forward`` always unpacks the per-layer hidden states, so they
        # must be requested no matter how the backbone was constructed
        # (previously this was only done for the ``path`` branch).
        self.backbone.config.output_hidden_states = True
        self.head = nn.Linear(self.backbone.config.hidden_size, num_classes)

        # One weight per encoder layer; zeros give a uniform mix after softmax.
        weights_init = torch.zeros(self.backbone.config.num_hidden_layers).float()
        self.cls_weights = torch.nn.Parameter(weights_init, requires_grad=True)

        self.mdo = None
        if mdo_prob > 0.:
            self.mdo = MultiDropoutHead(mdo_prob, mdo_num)

    def forward(self, x, attention_mask):
        """Return the head output for token ids ``x``.

        Assumes the backbone returns a ``(logits, hidden_states)`` pair and
        that each hidden state is indexed as (batch, sequence, hidden) --
        TODO confirm against the installed transformers version.
        """
        # Keyword form so the mask cannot be bound to another positional slot.
        _, hidden_states = self.backbone(x, attention_mask=attention_mask)
        # Skip the first entry and keep the CLS_POSITION vector of each
        # remaining layer; stacking puts the layer axis first.
        cls_per_layer = torch.stack(
            [states[:, self.CLS_POSITION] for states in hidden_states[1:]])
        layer_mix = torch.softmax(self.cls_weights, dim=-1)
        # Weighted sum over the layer axis.
        x = torch.einsum('ijk,i->jk', cls_per_layer, layer_mix)

        if self.mdo is not None:
            return self.mdo(x, self.head)

        return self.head(x)

    def load_weights(self, path):
        """Copy matching backbone tensors from a checkpoint file.

        The checkpoint is expected to hold its tensors under the ``'model'``
        key. A tensor is copied when its name exists in the backbone state
        dict, does not contain ``'cls'``, and has the same shape.

        Args:
            path: checkpoint file readable by ``torch.load``.

        Returns:
            List of the parameter names that were copied.
        """
        # Load onto the CPU so GPU-saved checkpoints also work on CPU-only
        # hosts; ``copy_`` below handles any device transfer.
        with open(path, 'rb') as f:
            weights = torch.load(f, map_location='cpu')
        pretrained = weights['model']

        # Build the state dict once: its tensors share storage with the live
        # parameters, so in-place copies update the model.
        own_state = self.backbone.state_dict()

        found = []
        for name, param in pretrained.items():
            if name not in own_state or 'cls' in name:
                continue
            if param.shape != own_state[name].shape:
                continue
            own_state[name].copy_(param)
            logger.info(f'\t Preloading {name}')
            found.append(name)

        logger.info('\n\t Didnt find layers:')
        for name in own_state:
            if name not in pretrained:
                logger.info(f'\t {name}')

        return found
示例#2
0
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
    """Convert a TensorFlow BERT checkpoint into a PyTorch state dict file.

    Args:
        tf_checkpoint_path: TensorFlow checkpoint handed to
            ``load_tf_weights_in_bert``.
        bert_config_file: JSON file describing the BERT configuration.
        pytorch_dump_path: destination file for the saved state dict.
    """
    # Build an untrained PyTorch model from the JSON configuration.
    bert_cfg = BertConfig.from_json_file(bert_config_file)
    build_msg = "Building PyTorch model from configuration: {}".format(str(bert_cfg))
    print(build_msg)
    torch_model = BertForSequenceClassification(bert_cfg)

    # Fill the model in place with the TensorFlow weights.
    load_tf_weights_in_bert(torch_model, bert_cfg, tf_checkpoint_path)

    # Persist only the state dict.
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    converted_state = torch_model.state_dict()
    torch.save(converted_state, pytorch_dump_path)
def convert_tf2_checkpoint_to_pytorch(tf_checkpoint_path, config_path,
                                      output_folder):
    """Convert a TF2 BERT checkpoint into a PyTorch model directory.

    Writes ``pytorch_model.bin`` (the state dict) and ``config.json`` into
    ``output_folder``, creating the folder when necessary.

    Args:
        tf_checkpoint_path: checkpoint handed to ``load_tf2_weights_in_bert``.
        config_path: JSON file describing the BERT configuration.
        output_folder: directory that receives the converted files.
    """
    # Instantiate model
    logger.info(f'Loading model based on config from {config_path}...')
    config = BertConfig.from_json_file(config_path)
    model = BertForSequenceClassification(config)

    # Load weights from checkpoint
    logger.info(f'Loading weights from checkpoint {tf_checkpoint_path}...')
    load_tf2_weights_in_bert(model, tf_checkpoint_path, config)

    # Create dirs. ``exist_ok=True`` avoids the check-then-create race of a
    # separate ``isdir`` test when the folder appears concurrently.
    os.makedirs(output_folder, exist_ok=True)

    # Save pytorch-model
    f_out_model = os.path.join(output_folder, 'pytorch_model.bin')
    logger.info(f'Saving PyTorch model to {f_out_model}...')
    torch.save(model.state_dict(), f_out_model)

    # Save config to output
    f_out_config = os.path.join(output_folder, 'config.json')
    logger.info(f'Saving config to {f_out_config}...')
    config.to_json_file(f_out_config)
def main():
    """Train a randomly initialised BERT sequence classifier on the dialogue data.

    Parses the command line, builds the train/validation datasets and a
    ``BertForSequenceClassification`` model from the named configuration
    (the model is constructed from the config only, not from pretrained
    weights), then trains with gradient accumulation. Every
    ``--save_checkpoints_steps`` batches (including batch 0 of each epoch)
    the validation set is scored, the logits are written to
    ``logits_dev.txt``, metrics are appended to ``eval_results_dev.txt``
    and the model is saved to ``model.pt`` whenever ``R10@1 + R10@2``
    improves.

    Raises:
        ValueError: on an invalid ``gradient_accumulation_steps``, when
            neither ``do_train`` nor ``do_test`` is set, when
            ``max_seq_length`` exceeds the model's position embeddings, or
            when training into a non-empty output directory.

    NOTE(review): ``--do_test`` is parsed but no test-set pass is visible in
    this function, and ``--warmup_steps`` is described as a proportion while
    being passed to the scheduler as ``warmup_steps`` -- confirm both.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default='/hdd/lujunyu/dataset/multi_turn_corpus/ubuntu/',
                        type=str,
                        required=False,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--task_name",
                        default='ubuntu',
                        type=str,
                        required=False,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default='/hdd/lujunyu/model/chatbert/ubuntu_without_pretraining/',
                        type=str,
                        required=False,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--init_model_name",
                        default='bert-base-uncased',
                        type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument("--do_lower_case",
                        default=True,
                        action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--data_augmentation",
                        default=False,
                        action='store_true',
                        help="Whether to use augmentation")
    parser.add_argument("--max_seq_length",
                        default=256,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        default=True,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_test",
                        default=True,
                        action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--train_batch_size",
                        default=500,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=500,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=3e-3,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=10.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_steps",
                        default=0.0,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--weight_decay",
                        default=1e-3,
                        type=float,
                        help="weight_decay")
    parser.add_argument("--save_checkpoints_steps",
                        default=8000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=20,
                        help="Number of updates steps to accumualte before performing a backward/update pass.")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        use_cuda = torch.cuda.is_available() and not args.no_cuda
        device = torch.device("cuda" if use_cuda else "cpu")
        # Report zero GPUs when running on the CPU; otherwise a CPU model
        # would still be wrapped in DataParallel on a multi-GPU host.
        n_gpu = torch.cuda.device_count() if use_cuda else 0
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))

    # The CLI value is the effective batch size; each forward pass sees a
    # slice of it and gradients are accumulated across slices.
    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # The parser defines ``--do_test`` (there is no ``do_eval`` attribute),
    # so that is the flag checked here.
    if not args.do_train and not args.do_test:
        raise ValueError("At least one of `do_train` or `do_test` must be True.")

    bert_config = BertConfig.from_pretrained(args.init_model_name, num_labels=2)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format(
            args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        if args.do_train:
            raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    else:
        os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.init_model_name, do_lower_case=args.do_lower_case)
    if args.data_augmentation:
        train_dataset = UbuntuDatasetForSP(
            file_path=os.path.join(args.data_dir, "train_augment_3.txt"),
            max_seq_length=args.max_seq_length,
            tokenizer=tokenizer
        )
    else:
        train_dataset = UbuntuDatasetForSP(
            file_path=os.path.join(args.data_dir, "train.txt"),
            max_seq_length=args.max_seq_length,
            tokenizer=tokenizer
        )
    eval_dataset = UbuntuDatasetForSP(
        file_path=os.path.join(args.data_dir, "valid.txt"),
        max_seq_length=args.max_seq_length,
        tokenizer=tokenizer
    )

    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=args.train_batch_size,
                                                sampler=RandomSampler(train_dataset), num_workers=4)
    eval_dataloader = torch.utils.data.DataLoader(eval_dataset, batch_size=args.eval_batch_size,
                                                sampler=SequentialSampler(eval_dataset), num_workers=4)

    model = BertForSequenceClassification(config=bert_config)
    model.to(device)

    num_train_steps = None
    if args.do_train:
        num_train_steps = int(
            len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
        # Prepare optimizer: biases and LayerNorm parameters get no weight decay.
        param_optimizer = list(model.named_parameters())

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': args.weight_decay}, {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0}]

        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_steps)
    else:
        optimizer = None
        scheduler = None

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    best_metric = 0.0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        logits_path = os.path.join(args.output_dir, 'logits_dev.txt')
        output_eval_file = os.path.join(args.output_dir, "eval_results_dev.txt")
        best_model_path = os.path.join(args.output_dir, "model.pt")

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()    # We have accumulated enough gradients
                    scheduler.step()
                    model.zero_grad()
                    global_step += 1

                if step % args.save_checkpoints_steps == 0:
                    model.eval()
                    eval_loss = 0
                    nb_eval_steps = 0
                    # Collected for the optional bad-case visualisation below.
                    logits_all = []
                    # ``with`` guarantees the file is closed even if a batch fails.
                    with open(logits_path, 'w') as f:
                        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                            input_ids = input_ids.to(device)
                            input_mask = input_mask.to(device)
                            segment_ids = segment_ids.to(device)
                            label_ids = label_ids.to(device)

                            with torch.no_grad():
                                tmp_eval_loss, logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)

                            logits = logits.detach().cpu().numpy()
                            logits_all.append(logits)
                            label_ids = label_ids.cpu().numpy()

                            # One line per example: "_\t<logit0>,<logit1>\t<label>".
                            for logit, label in zip(logits, label_ids):
                                logit = '{},{}'.format(logit[0], logit[1])
                                f.write('_\t{}\t{}\n'.format(logit, label))

                            eval_loss += tmp_eval_loss.mean().item()
                            nb_eval_steps += 1

                    logits_all = np.concatenate(logits_all,axis=0)
                    eval_loss = eval_loss / nb_eval_steps

                    result = evaluate(logits_path)
                    result.update({'eval_loss': eval_loss})

                    with open(output_eval_file, "a") as writer:
                        logger.info("***** Eval results *****")
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))

                    ### Save the best checkpoint
                    if best_metric < result['R10@1'] + result['R10@2']:
                        try:  ### Remove 'module' prefix when using DataParallel
                            state_dict = model.module.state_dict()
                        except AttributeError:
                            state_dict = model.state_dict()
                        torch.save(state_dict, best_model_path)
                        best_metric = result['R10@1'] + result['R10@2']
                        logger.info('Saving the best model in {}'.format(best_model_path))

                        ### visualize bad cases of the best model
                        # logger.info('Saving Bad cases...')
                        # visualize_bad_cases(
                        #     logits=logits_all,
                        #     input_file_path=os.path.join(args.data_dir, 'valid.txt'),
                        #     output_file_path=os.path.join(args.output_dir, 'valid_bad_cases.txt')
                        # )

                    model.train()
示例#5
0
def train_process(config, train_load, valid_load, test_load, k, train_sampler):
    """Fine-tune a BERT sequence classifier for one fold in a distributed run.

    Builds a ``BertForSequenceClassification`` whose vocabulary size is the
    number of rows in ``../user_data/vocab``, initialises it from a saved
    checkpoint, trains for ``config.num_train_epochs`` epochs (optionally
    with FGM adversarial steps), evaluates on ``valid_load`` after each
    epoch and saves checkpoints from the process whose
    ``config.local_rank`` is 0 or -1.

    Args:
        config: run settings; the attributes read here are ``model_name``,
            ``device``, ``weight_decay``, ``learning_rate``, ``local_rank``,
            ``fgm``, ``num_train_epochs`` and ``nprocs``.
        train_load: iterable of ``(input_ids, token_type_ids,
            attention_mask, label)`` batches.
        valid_load: validation loader handed to ``model_evaluate``.
        test_load: not used in this function.
        k: fold number, only used in the progress message.
        train_sampler: sampler whose ``set_epoch`` is called each epoch.

    NOTE(review): the function ends with ``return data``, but ``data`` is
    not defined inside it -- it only resolves if a module-level ``data``
    exists at call time. Confirm the intended return value.
    """
    # load source bert weights
    # model_config = BertConfig.from_pretrained(pretrained_model_name_or_path="../user_data/bert_source/{}/config.json".format(config.model_name))
    model_config = BertConfig()
    model_config.vocab_size = len(
        pd.read_csv('../user_data/vocab', names=["score"]))
    model = BertForSequenceClassification(config=model_config)

    best_dev_auc = 0
    # NOTE(review): the file tested below ends in ``_v1111`` while the file
    # loaded ends in ``_v1``. This may be a deliberate way of disabling the
    # branch; left unchanged -- confirm the intended filename.
    if os.path.isfile('save_model/{}_best_model_v1111.pth.tar'.format(
            config.model_name)):
        checkpoint = torch.load('save_model/{}_best_model_v1.pth.tar'.format(
            config.model_name),
                                map_location=torch.device('cpu'))
        model.load_state_dict(checkpoint['status'], strict=False)
        print('***********load best model weight*************')
    else:
        checkpoint = torch.load(
            '../user_data/save_bert/{}_checkpoint.pth.tar'.format(
                config.model_name),
            map_location=torch.device('cpu'))
        model.load_state_dict(checkpoint['status'], strict=False)
        print('***********load pretrained mlm model weight*************')

    for param in model.parameters():
        param.requires_grad = True

    # Step 4: move the model to its device before wrapping it.
    model = model.to(config.device)

    # Parameters whose names contain these fragments get no weight decay.
    no_decay = ["bias", "LayerNorm.weight"]

    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            config.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate)

    #     t_total = len(train_load) * config.num_train_epochs
    #     scheduler = get_linear_schedule_with_warmup(
    #         optimizer, num_warmup_steps=t_total * config.warmup_proportion, num_training_steps=t_total
    #     )

    cudnn.benchmark = True

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # Step 5: wrap the model for distributed data-parallel training.
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[config.local_rank])

    if config.fgm:
        fgm = FGM(model)

    for epoch in range(config.num_train_epochs):
        # Re-enter training mode every epoch: the evaluation call at the end
        # of the previous epoch may have left the model in eval mode
        # (``model_evaluate`` is not visible here), which would silently
        # disable dropout for all later epochs.
        model.train()
        train_sampler.set_epoch(epoch)
        is_best = False
        torch.cuda.empty_cache()

        for input_ids, token_type_ids, attention_mask, label in train_load:
            input_ids = input_ids.cuda(config.local_rank, non_blocking=True)
            attention_mask = attention_mask.cuda(config.local_rank,
                                                 non_blocking=True)
            token_type_ids = token_type_ids.cuda(config.local_rank,
                                                 non_blocking=True)
            label = label.cuda(config.local_rank, non_blocking=True)

            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            labels=label)

            loss = outputs.loss
            model.zero_grad()
            loss.backward()
            #             torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)

            if config.fgm:
                # Original author's notes on the FGM helper (not visible here):
                # add an adversarial perturbation to the embeddings ...
                fgm.attack()
                loss_adv = model(input_ids=input_ids,
                                 attention_mask=attention_mask,
                                 token_type_ids=token_type_ids,
                                 labels=label).loss
                # ... back-propagate so the adversarial gradients accumulate
                # on top of the regular ones ...
                loss_adv.backward()
                # ... then restore the embedding parameters.
                fgm.restore()

            optimizer.step()
        #             scheduler.step()

        dev_auc = model_evaluate(config, model, valid_load)

        # Synchronise the processes, then reduce the metric across them.
        torch.distributed.barrier()
        reduce_dev_auc = reduce_auc(dev_auc, config.nprocs).item()

        if reduce_dev_auc > best_dev_auc:
            best_dev_auc = reduce_dev_auc
            is_best = True

        now = strftime("%Y-%m-%d %H:%M:%S", localtime())
        msg = 'number {} fold,time:{},epoch:{}/{},reduce_dev_auc:{},best_dev_auc:{}'

        # Only one process reports and writes checkpoints.
        if config.local_rank in [0, -1]:
            print(
                msg.format(k, now, epoch + 1, config.num_train_epochs,
                           reduce_dev_auc, best_dev_auc))
            # NOTE(review): when the model is wrapped in
            # DistributedDataParallel these keys presumably carry a
            # ``module.`` prefix -- verify that the consumers of these
            # checkpoints expect that.
            checkpoint = {
                "status": model.state_dict(),
                "epoch": epoch + 1,
                'reduce_dev_auc': reduce_dev_auc
            }
            if is_best:
                torch.save(
                    checkpoint, '../user_data/save_model' + os.sep +
                    '{}_best_model.pth.tar'.format(config.model_name))
            torch.save(
                checkpoint, '../user_data/save_model' + os.sep +
                '{}_checkpoint.pth.tar'.format(config.model_name))
            del checkpoint

    torch.distributed.barrier()
    return data


if __name__ == '__main__':
    # Script entry point: build the tokenizer and the classifier, initialise
    # the classifier from a pretrained state dict, then load the data.
    args = create_args()

    # load tokenizer
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case,
                                           piece=args.piece,
                                           piece_model=args.piece_model)

    # load bert model
    config = BertConfig.from_json_file(args.config_file)
    model = BertForSequenceClassification(config)
    model_state_dict = model.state_dict()
    print('Model parameter: {}'.format(
        sum(p.numel() for p in model_state_dict.values())))
    # Load onto the CPU so a checkpoint saved on a GPU can still be read on
    # a CPU-only host; ``model.cuda()`` below moves the weights if requested.
    pre_state_dict = torch.load(args.pretrained_file, map_location='cpu')
    # Keep only the pretrained tensors whose names exist in this model; all
    # other entries keep their freshly initialised values.
    pre_state_dict = {
        k: v
        for k, v in pre_state_dict.items() if k in model_state_dict
    }
    model_state_dict.update(pre_state_dict)
    model.load_state_dict(model_state_dict)
    if args.cuda:
        model.cuda()

    # load data
    data = BERTCLDCDataReader(args, tokenizer)
示例#7
0
            label_list = list(map(json.loads, label))

            text_tensor = torch.tensor(text_list).to(device)
            label_tensor = torch.tensor(label_list).to(device)

            outputs = model(text_tensor, labels=label_tensor)
            loss, logits = outputs[:2]
            optimizer.zero_grad()
            loss.backward()
            scheduler.step()
            optimizer.step()

            acc = batch_accuracy(logits, label_tensor)
            print('epoch:{} | acc:{} | loss:{}'.format(epoch, acc, loss))

    torch.save(model.state_dict(), 'bert_cla.ckpt')
    print('保存训练完成的model...')

    # 测试

    print('开始加载训练完成的model...')
    model.load_state_dict(torch.load('bert_cla.ckpt'))

    print('开始测试...')
    model.eval()
    test_result = []
    for item in test_dataset:

        text_list = list(json.loads(item[1]))
        text_tensor = torch.tensor(text_list).unsqueeze(0).to(device)