def do_infer(args):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    config = BertConfig.from_pretrained(args.name)
    tokenizer = BertTokenizer.from_pretrained(args.name)
    cpe = ChineseAndPunctuationExtractor()
    label_map, id2label = json.load(
        open("../origin_data/labels2idx.json", encoding="utf8"))
    config.num_labels = len(label_map)
    features = from_file("../origin_data/duie_test1.json", tokenizer,
                         label_map, cpe, args.max_length)
    # model_2.pt holds a state_dict saved by do_train, so load the weights explicitly
    bert = BertForTokenClassification(config).to(device)
    bert.load_state_dict(torch.load("./model_2.pt", map_location=device))
    bert.eval()
    infer_generator = batch_generator(features, args.batch_size, False)
    predict_logits = []
    for step, batch in enumerate(infer_generator):
        batch_input_ids = batch[0].to(device=device)
        batch_input_mask = batch[1].to(device=device)
        batch_type_ids = batch[2].to(device=device)
        with torch.no_grad():
            outputs = bert(batch_input_ids, batch_input_mask, batch_type_ids)
        batch_predict = torch.sigmoid(outputs.logits).cpu().numpy()
        predict_logits.append(batch_predict)

    predict_logits = np.concatenate(predict_logits, axis=0)
    assert predict_logits.shape[0] == len(features)
    # binarize the sigmoid scores with a 0.3 threshold
    predict_logits[predict_logits >= 0.3] = 1
    predict_logits[predict_logits < 0.3] = 0
    predict_logits = predict_logits.astype(int).tolist()  # np.int was removed in NumPy >= 1.24
    result = decoding(features, predict_logits, id2label)
    with open("result.json", "w", encoding="utf8") as outp:
        for line in result:
            outp.write(json.dumps(line, ensure_ascii=False) + "\n")
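Note: `from_file`, `batch_generator`, and `decoding` are project helpers that are not part of this snippet. A minimal sketch of a compatible `batch_generator`, assuming each feature is a pre-padded tuple of `(input_ids, input_mask, segment_ids[, label_ids])` lists:

import random
import torch

def batch_generator(features, batch_size, shuffle=True):
    # yield one tuple of stacked tensors per mini-batch
    order = list(range(len(features)))
    if shuffle:
        random.shuffle(order)
    for start in range(0, len(order), batch_size):
        chunk = [features[i] for i in order[start:start + batch_size]]
        fields = list(zip(*chunk))
        ids, mask, types = (torch.tensor(f, dtype=torch.long) for f in fields[:3])
        if len(fields) > 3:
            labels = torch.tensor(fields[3], dtype=torch.float)  # multi-label targets for the BCE loss
            yield ids, mask, types, labels
        else:
            yield ids, mask, types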
Example #2
        # check which device is currently in use
        print('current device:', torch.cuda.current_device())
        n_gpu = 1
        params.n_gpu = n_gpu

    # Set the random seed for reproducible experiments
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    params.seed = args.seed
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Set the logger
    utils.set_logger(save=True, log_path=os.path.join(params.params_path, 'train.log'))
    logging.info("Model type: ")
    logging.info("device: {}".format(params.device))

    logging.info('Init pre-train model...')
    bert_config = BertConfig.from_json_file(os.path.join(params.bert_model_dir, 'bert_config.json'))
    model = BertForTokenClassification(config=bert_config, params=params)
    nezha_utils.torch_init_model(model, os.path.join(params.bert_model_dir, 'pytorch_model.bin'))
    # save the bert config
    model.to(params.device)
    if params.n_gpu > 1 and args.multi_gpu:
        model = torch.nn.DataParallel(model)
    logging.info('-done')

    # Train and evaluate the model
    logging.info("Starting training for {} epoch(s)".format(args.epoch_num))
    train_and_evaluate(model, params, args.restore_file)
Example #3
        # check which device is currently in use
        print('current device:', torch.cuda.current_device())
        n_gpu = 1
        params.n_gpu = n_gpu

    # Set the random seed for reproducible experiments
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    params.seed = args.seed
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Set the logger
    utils.set_logger(save=True,
                     log_path=os.path.join(params.params_path, 'train.log'))
    logging.info("Model type: ")
    logging.info("device: {}".format(params.device))

    logging.info('Init pre-train model...')
    model = BertForTokenClassification.from_pretrained(params.bert_model_dir,
                                                       params=params)
    # save the bert config
    model.to(params.device)
    if params.n_gpu > 1 and args.multi_gpu:
        model = torch.nn.DataParallel(model)
    logging.info('-done')

    # Train and evaluate the model
    logging.info("Starting training for {} epoch(s)".format(args.epoch_num))
    train_and_evaluate(model, params, args.restore_file)
Example #4
    params.seed = args.seed
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Set the logger
    utils.set_logger(save=True,
                     log_path=os.path.join(params.params_path, 'train.log'))
    logging.info(
        f"Model type: {params.pre_model_type}_{params.ds_encoder_type}_CRF")
    logging.info("device: {}".format(params.device))

    logging.info('Init pre-train model...')
    if params.pre_model_type == 'NEZHA':
        bert_config = NEZHAConfig.from_json_file(
            os.path.join(params.bert_model_dir, 'bert_config.json'))
        model = BertForTokenClassification(config=bert_config, params=params)
        # NEZHA init
        torch_init_model(
            model, os.path.join(params.bert_model_dir, 'pytorch_model.bin'))
    elif params.pre_model_type == 'RoBERTa':
        bert_config = BertConfig.from_json_file(
            os.path.join(params.bert_model_dir, 'bert_config.json'))
        model = BertForTokenClassification.from_pretrained(
            config=bert_config,
            pretrained_model_name_or_path=params.bert_model_dir,
            params=params)
    else:
        raise ValueError(
            'Pre-train Model type must be NEZHA or RoBERTa!')
    logging.info('-done')
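The `params` object above is created elsewhere in the repository and is not shown in this fragment. A minimal stand-in with just the fields this code reads; every value here is an illustrative assumption:

from types import SimpleNamespace
import torch

params = SimpleNamespace(
    params_path='./experiments/base',     # directory where train.log is written
    bert_model_dir='./pretrain/nezha',    # holds bert_config.json and pytorch_model.bin
    pre_model_type='NEZHA',               # or 'RoBERTa'
    ds_encoder_type='LSTM',               # only used in the log message above
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    seed=2020,
)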
def do_train(args):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = BertTokenizer.from_pretrained(args.name)
    cpe = ChineseAndPunctuationExtractor()
    label_map, _ = json.load(
        open("../origin_data/labels2idx.json", encoding="utf8"))
    train_features = from_file("../origin_data/duie_train.json", tokenizer,
                               label_map, cpe, args.max_length)
    dev_features = from_file("../origin_data/duie_dev.json", tokenizer,
                             label_map, cpe, args.max_length)
    counts = len(train_features)
    logger.info(
        f"Train dataset size: {counts}, Dev dataset size: {len(dev_features)}")
    if len(train_features) % args.batch_size == 0:
        one_epoch_steps = len(train_features) // args.batch_size
    else:
        one_epoch_steps = len(train_features) // args.batch_size + 1
    total_steps = one_epoch_steps * args.epochs
    logger.info(f"Training step: {total_steps}")

    bert = BertForTokenClassification.from_pretrained(
        args.name, num_labels=len(label_map)).to(device)
    optimizer = AdamW(params=bert.parameters(), lr=args.lr)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=int(
                                                    total_steps * 0.1),
                                                num_training_steps=total_steps)

    best_eval_f1 = 0
    min_eval_loss = float("inf")
    for epoch in range(args.epochs):
        train_generator = batch_generator(train_features, args.batch_size)
        valid_generator = batch_generator(dev_features, args.batch_size, False)
        logger.info(f"======== Epoch {epoch + 1:} / {args.epochs:} ========")
        logger.info("Training...")
        bert.train()
        start_train = time.time()
        total_train_loss = 0
        for step, batch in enumerate(train_generator):
            batch_input_ids = batch[0].to(device=device)
            batch_input_mask = batch[1].to(device=device)
            batch_type_ids = batch[2].to(device=device)
            batch_labels = batch[3].to(device=device)
            outputs = bert(batch_input_ids,
                           batch_input_mask,
                           batch_type_ids,
                           labels=batch_labels)
            bert.zero_grad()
            outputs.loss.backward()
            torch.nn.utils.clip_grad_norm_(bert.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            total_train_loss += outputs.loss.item()

            if step % 100 == 0:
                logger.info(
                    f"  Step: {step+1:>5}/{one_epoch_steps:>1}, current loss: {outputs.loss.item():.6f}"
                )

        average_train_loss = total_train_loss / (step + 1)
        trainingtime = time.time() - start_train
        logger.info(
            f"  Average training BCELoss: {average_train_loss:.6f}; Take time: {trainingtime:.3f}"
        )

        logger.info("Running Validation...")
        bert.eval()
        start_eval = time.time()
        total_eval_loss = 0
        total_eval_f1 = 0
        for step, batch in enumerate(valid_generator):
            batch_input_ids = batch[0].to(device=device)
            batch_input_mask = batch[1].to(device=device)
            batch_type_ids = batch[2].to(device=device)
            batch_labels = batch[3].to(device=device)
            with torch.no_grad():
                outputs = bert(batch_input_ids,
                               batch_input_mask,
                               batch_type_ids,
                               labels=batch_labels)
                total_eval_loss += outputs.loss.item()
                # total_eval_f1+=metric(outputs.logits, batch_labels)
        average_eval_loss = total_eval_loss / (step + 1)
        # average_eval_f1=total_eval_f1/(step+1)
        validation_time = time.time() - start_eval
        logger.info(
            f"  Average eval BCELoss: {average_eval_loss:.6f}; Take time: {validation_time:.3f}"
        )

        # if average_eval_f1>best_eval_f1:
        #     best_eval_f1=average_eval_f1
        #     logger.info("   Save model...")
        #     torch.save(bert.state_dict(),f"model_{epoch}.pt")
        if average_eval_loss < min_eval_loss:
            min_eval_loss = average_eval_loss
            logger.info("  Save model...")
            torch.save(bert.state_dict(), f"model_{epoch}.pt")
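do_train and do_infer read a handful of attributes from `args`; a minimal argparse setup covering exactly those fields (the defaults are illustrative assumptions, not values from the original project):

import argparse

def build_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--name', default='bert-base-chinese',
                        help='pretrained model name or path passed to from_pretrained')
    parser.add_argument('--max_length', type=int, default=128)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--epochs', type=int, default=3)
    parser.add_argument('--lr', type=float, default=2e-5)
    return parser.parse_args()

if __name__ == '__main__':
    do_train(build_args())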
Example #6
def train(train_iter, test_iter, config):
    """"""
    # Prepare model
    # Prepare model
    # reload weights from restore_file if specified  如果指定就加载已经训练的权重
    if config.pretrainning_model == 'nezha':  #哪吒模型
        Bert_config = BertConfig.from_json_file(config.bert_config_file)
        model = BertForTokenClassification(config=Bert_config, params=config)
        nezha_utils.torch_init_model(model, config.bert_file)
    elif config.pretrainning_model == 'albert':
        Bert_config = AlbertConfig.from_pretrained(config.model_path)
        model = BertForTokenClassification.from_pretrained(config.model_path,
                                                           config=Bert_config)
    else:
        Bert_config = RobertaConfig.from_pretrained(config.bert_config_file,
                                                    output_hidden_states=True)
        model = BertForTokenClassification.from_pretrained(
            config=Bert_config,
            params=config,
            pretrained_model_name_or_path=config.model_path)

    Bert_config.output_hidden_states = True  # expose the hidden states of every layer

    model.to(device)
    """多卡训练"""
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Prepare optimizer for fine-tuning
    # collect the model weights
    param_optimizer = list(model.named_parameters())
    # pretrained model params (NEZHA weights are also named 'bert')
    param_pre = [(n, p) for n, p in param_optimizer
                 if 'bert' in n or 'electra' in n]
    # task-specific (middle) model params
    param_middle = [
        (n, p) for n, p in param_optimizer
        if not any(s in n for s in ('bert', 'crf', 'electra', 'albert'))
        or 'dym_weight' in n
    ]
    # crf param
    # weights that are excluded from weight decay
    no_decay = ['bias', 'LayerNorm', 'dym_weight', 'layer_norm']
    # group the weights
    optimizer_grouped_parameters = [
        # pretrained model params, with weight decay
        {
            'params': [p for n, p in param_pre
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': config.decay_rate,
            'lr': config.embed_learning_rate
        },
        # pretrained model params, without weight decay
        {
            'params': [p for n, p in param_pre
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
            'lr': config.embed_learning_rate
        },
        # task-specific (middle) params, with weight decay
        {
            'params': [p for n, p in param_middle
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': config.decay_rate,
            'lr': config.learning_rate
        },
        # task-specific (middle) params, without weight decay
        {
            'params': [p for n, p in param_middle
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
            'lr': config.learning_rate
        },
    ]
    num_train_optimization_steps = train_iter.num_records // config.gradient_accumulation_steps * config.train_epoch
    optimizer = BertAdam(optimizer_grouped_parameters,
                         warmup=config.warmup_proportion,
                         schedule="warmup_cosine",
                         t_total=num_train_optimization_steps)
    logger.info("***** Running training *****")
    logger.info("  Batch size = %d", config.batch_size)
    logger.info("  Num epochs = %d", config.train_epoch)
    logger.info("  Learning rate = %f", config.learning_rate)

    cum_step = 0
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(
        os.path.join(config.save_model, "runs_" + str(gpu_id), timestamp))
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    print("Writing to {}\n".format(out_dir))

    draw_step_list = []
    draw_loss_list = []
    for i in range(config.train_epoch):
        model.train()
        for input_ids_list, input_mask_list, segment_ids_list, label_ids_list, tokens_list in tqdm(
                train_iter):
            # convert the lists to tensors on the target device
            loss = model(input_ids=list2ts2device(input_ids_list),
                         token_type_ids=list2ts2device(segment_ids_list),
                         attention_mask=list2ts2device(input_mask_list),
                         labels=list2ts2device(label_ids_list))
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            # gradient accumulation
            if config.gradient_accumulation_steps > 1:
                loss = loss / config.gradient_accumulation_steps

            if cum_step % 10 == 0:
                draw_step_list.append(cum_step)
                # store the scalar value, not the tensor, so the history can be pickled later
                draw_loss_list.append(loss.item())
                if cum_step % 100 == 0:
                    format_str = 'step {}, loss {:.4f} lr {:.5f}'
                    print(
                        format_str.format(cum_step, loss.item(),
                                          config.learning_rate))

            loss.backward()  # backward pass, accumulating gradients
            if (cum_step + 1) % config.gradient_accumulation_steps == 0:
                # performs updates using calculated gradients
                optimizer.step()
                model.zero_grad()
            cum_step += 1
        p, r, f1 = set_test(model, test_iter)
        # lr_scheduler learning-rate decay step

        print('dev set : step_{},precision_{}, recall_{}, F1_{}'.format(
            cum_step, p, r, f1))

        # save the model
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(
            out_dir, 'model_{:.4f}_{:.4f}_{:.4f}_{}.bin'.format(
                p, r, f1, str(cum_step)))
        torch.save(model_to_save, output_model_file)

    with open(Config().processed_data + 'step_loss_data.pickle', 'wb') as mf:
        draw_dict = {'step': draw_step_list, 'loss': draw_loss_list}
        pickle.dump(draw_dict, mf)
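`list2ts2device` is another project helper that is not shown in the example; a plausible minimal version, assuming a global `device` like the one used elsewhere in these snippets:

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def list2ts2device(batch_list):
    # convert a pre-padded nested Python list into a LongTensor on the target device
    return torch.tensor(batch_list, dtype=torch.long, device=device)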
Example #7
    else:
        # set which GPU the model will use
        torch.cuda.set_device(args.device_id)
        # check which device is currently in use
        print('current device:', torch.cuda.current_device())
        n_gpu = 1
        params.n_gpu = n_gpu

    # Set the random seed for reproducible experiments
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    params.seed = args.seed
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Set the logger
    utils.set_logger(save=True, log_path=os.path.join(params.params_path, 'train.log'))
    logging.info("Model type: ")
    logging.info("device: {}".format(params.device))

    logging.info('Init pre-train model...')
    bert_config = NEZHAConfig.from_json_file(os.path.join(params.bert_model_dir, 'bert_config.json'))
    model = BertForTokenClassification(config=bert_config, params=params)
    # NEZHA init
    torch_init_model(model, os.path.join(params.bert_model_dir, 'pytorch_model.bin'))
    logging.info('-done')

    # Train and evaluate the model
    logging.info("Starting training for {} epoch(s)".format(args.epoch_num))
    train_and_evaluate(model, params, args.restore_file)
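`torch_init_model` comes from the NEZHA utilities bundled with the project and is not shown here. Conceptually it copies matching weights from a pretrained checkpoint into the freshly built model; a rough, simplified stand-in (the real helper may also handle key prefixes and report shape mismatches):

import torch

def torch_init_model_sketch(model, checkpoint_path):
    # load the checkpoint on CPU and copy every tensor whose name and shape match
    state_dict = torch.load(checkpoint_path, map_location='cpu')
    result = model.load_state_dict(state_dict, strict=False)
    print('missing keys:', len(result.missing_keys),
          'unexpected keys:', len(result.unexpected_keys))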