Example #1
def predict(predict_model_name_or_path, pre_data, pre_dataloader):

    print('Running prediction')
    pro = processer()
    labellist = pro.get_labels()

    #***** Load the model *****
    print('Loading model')
    config = BertConfig.from_pretrained(predict_model_name_or_path,
                                        num_labels=len(labellist))
    model = BertForSequenceClassification.from_pretrained(
        predict_model_name_or_path, config=config)

    print('Moving model to GPU or CPU')
    # Use the GPU if one is available, otherwise fall back to the CPU
    if torch.cuda.is_available():
        # Single-GPU computation
        torch.cuda.set_device(0)
        device = torch.device('cuda', 0)  # select the GPU device index
    else:
        device = torch.device('cpu')
    model.to(device)

    print('******** Running prediction ********')
    print("  Num examples = %d" % len(pre_data))

    preds = None
    pbar = ProgressBar(n_total=len(pre_dataloader), desc="Predicting")

    #*** Run prediction ***
    for step, batch in enumerate(pre_dataloader):
        model.eval()
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            # batch layout: (input_ids, attention_mask, token_type_ids, labels)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels': batch[3]
            }
            outputs = model(**inputs)
            # outputs = (loss, logits) because labels were supplied
            _, logits = outputs[:2]

        #*** Accumulate the predictions from each batch ***
        if preds is None:
            preds = logits.softmax(-1).detach().cpu().numpy()
        else:
            preds = np.append(preds,
                              logits.softmax(-1).detach().cpu().numpy(),
                              axis=0)
        pbar(step)

    predict_label = np.argmax(preds, axis=1)
    print(preds)

    print(predict_label)
    return preds, predict_label
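
For intuition, the accumulation loop above just stacks each batch's softmax probabilities and takes a row-wise argmax at the end. A self-contained toy sketch of that aggregation (batch shapes and class count are illustrative, not from the source):

import numpy as np
import torch

# Two fake batches of logits for a 3-class problem (illustrative only)
batches = [torch.randn(4, 3), torch.randn(4, 3)]
preds = None
for logits in batches:
    probs = logits.softmax(-1).numpy()  # per-example class probabilities
    preds = probs if preds is None else np.append(preds, probs, axis=0)
print(preds.shape)               # (8, 3): all examples stacked row-wise
print(np.argmax(preds, axis=1))  # predicted class index per example
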
def load_dataset(args, model_name_or_path, type):

    # Placeholder initializations; reassigned below according to the split type
    input_file_name_or_path = ''
    max_seq_len = 0
    batch_size = 0

    tokenizer = BertTokenizer.from_pretrained(model_name_or_path)

    pro = processer()
    labellist = pro.get_labels()

    if type == 'train':
        input_file_name_or_path = os.path.join(args.train_file_path,
                                               'train.txt')
        max_seq_len = args.train_max_seq_len
        batch_size = args.train_batch_size

    elif type == 'valid':
        input_file_name_or_path = os.path.join(args.valid_file_path,
                                               'valid.txt')
        max_seq_len = args.valid_max_seq_len
        batch_size = args.valid_batch_size

    elif type == 'test':
        input_file_name_or_path = os.path.join(args.predict_file_path,
                                               'predict.txt')
        max_seq_len = args.predict_max_seq_len
        batch_size = args.predict_batch_size

    data = pro.read_txt(filename=input_file_name_or_path)
    examples = pro.create_examples(data=data, type=type)
    features = pro.convert_examples_to_features(examples=examples,
                                                tokenizer=tokenizer,
                                                max_length=max_seq_len,
                                                label_list=labellist,
                                                output_mode='classification')
    dataset = pro.create_dataset(features=features)

    sampler = SequentialSampler(dataset)  # sequential sampling (no shuffling)
    dataloader = DataLoader(dataset=dataset,
                            sampler=sampler,
                            batch_size=batch_size,
                            collate_fn=collate_fn)

    return data, dataloader
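
A hedged sketch of how load_dataset and predict might be wired together; the model name and checkpoint path below are placeholder assumptions, not values from the source:

# Hypothetical driver: args must provide predict_file_path,
# predict_max_seq_len and predict_batch_size
pre_data, pre_dataloader = load_dataset(args, 'bert-base-chinese', type='test')
probs, labels = predict('./outputs/model_checkpoint_epoch_0', pre_data,
                        pre_dataloader)
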
Example #3
def split_data(args, file_name_or_path):

    pro = processer()
    labellist = pro.get_labels()

    file_name_or_path = os.path.join(file_name_or_path, 'data.txt')
    # the source file must already exist before it can be read
    if not os.path.exists(file_name_or_path):
        raise FileNotFoundError(f'{file_name_or_path} does not exist')
    with open(file_name_or_path, 'r') as rf:
        lines = rf.readlines()

    train_samples = []
    valid_samples = []
    for label in labellist:
        #*** Collect the samples for each label ***
        samples = [
            line for line in lines
            if line.split('\t')[3].replace('\n', '') == label
        ]
        np.random.shuffle(samples)

        #*** Split each label's samples into train and validation sets ***
        valid_size = int(len(samples) * args.valid_size)  # may be fractional, so cast to int
        train_sample = samples[valid_size:]
        valid_sample = samples[:valid_size]

        train_samples.extend(train_sample)
        valid_samples.extend(valid_sample)

    #*** Shuffle once the data for every label has been added ***
    np.random.shuffle(train_samples)
    np.random.shuffle(valid_samples)

    #*** Write the splits to disk ***
    with open(f'{args.train_file_path}/train.txt', 'w') as wf:
        for line in train_samples:
            wf.write(line)

    with open(f'{args.valid_file_path}/valid.txt', 'w') as wf:
        for line in valid_samples:
            wf.write(line)
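
The loop above is a hand-rolled stratified split: each label is divided separately, so both splits keep the label distribution. A toy sketch of the same idea on in-memory lines (the data and split ratio here are illustrative):

import numpy as np

# Four tab-separated lines, label in column 3 (illustrative data)
lines = ['a\tx\ty\tpos\n', 'b\tx\ty\tpos\n',
         'c\tx\ty\tneg\n', 'd\tx\ty\tneg\n']
train_samples, valid_samples = [], []
for label in ['pos', 'neg']:
    samples = [l for l in lines if l.split('\t')[3].strip() == label]
    np.random.shuffle(samples)
    cut = int(len(samples) * 0.5)        # 50% validation, per label
    train_samples.extend(samples[cut:])
    valid_samples.extend(samples[:cut])
print(len(train_samples), len(valid_samples))  # 2 2: both labels in both splits
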
def train(args, model_name_or_path, train_data, train_dataloader, valid_data,
          valid_dataloader):

    pro = processer()
    labellist = pro.get_labels()
    trainloss = TrainLoss()

    #***** Load the model *****
    config = BertConfig.from_pretrained(model_name_or_path,
                                        num_labels=len(labellist))
    model = BertForSequenceClassification.from_pretrained(model_name_or_path,
                                                          config=config)

    # ***** Move the model to the device *****
    if torch.cuda.is_available():
        # Single-GPU computation
        torch.cuda.set_device(0)
        device = torch.device('cuda', 0)  # select the GPU device index
    else:
        device = torch.device('cpu')
    model.to(device)

    #***** Optimizer and learning-rate schedule *****
    t_total = (len(train_dataloader) // args.gradient_accumulation_steps *
               args.num_train_epochs)
    warmup_steps = int(t_total * args.warmup_proportion)
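    # Worked example (illustrative numbers): 1000 batches, accumulation=2,
    # 3 epochs -> t_total = 1000 // 2 * 3 = 1500 optimizer steps;
    # warmup_proportion=0.1 -> linear warmup over the first 150 steps,
    # then linear decay over the remaining steps.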

    # Exclude biases and LayerNorm weights from weight decay
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': args.weight_decay
        },
        {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=warmup_steps,
                                     t_total=t_total)

    #***** Training run information *****
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_data))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.train_batch_size)
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    #***** Start training *****
    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    seed_everything(args.seed)

    for num in range(args.num_train_epochs):
        train_all_steps = 0
        train_steps = []
        train_losses = []

        global_step = 0
        logger.info(f'****************Train epoch-{num}****************')
        pbar = ProgressBar(n_total=len(train_dataloader), desc='Train')
        for step, batch in enumerate(train_dataloader):
            #*** Record the step index for the loss curve ***
            train_all_steps += 1
            train_steps.append(train_all_steps)

            model.train()

            #*** Forward pass ***
            batch = tuple(t.to(device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels': batch[3]
            }
            # When labels are supplied, the model applies the loss function
            # internally, so the returned outputs already contain the loss value
            outputs = model(**inputs)

            #*** Backpropagate the loss ***
            loss = outputs[0]
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)  # gradient clipping

            #*** Record the loss for the loss curve ***
            train_losses.append(loss.detach().cpu().numpy())

            #*** Optimizer step (respecting gradient accumulation) ***
            pbar(step, {'loss': loss.item()})
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()  # apply the accumulated gradients
                scheduler.step()  # advance the learning-rate schedule
                model.zero_grad()
                global_step += 1

        # Save a model checkpoint after each epoch
        output_dir = os.path.join(args.output_dir,
                                  f'model_checkpoint_epoch_{num}')
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        print('')  # newline so the progress-bar output does not stay on one line
        # logger.info(f'save model checkpoint-{global_step} to {output_dir} ')
        model.save_pretrained(output_dir)  # save the model

        #*** Plot a loss curve after each training epoch ***
        trainloss.train_loss(steps=train_steps,
                             losses=train_losses,
                             epoch=num,
                             args=args,
                             type='train',
                             max_steps=train_all_steps)

        #***** Validate after each training epoch *****
        print('')
        logger.info(f'****************Valid epoch-{num}****************')
        logger.info("  Num examples = %d", len(valid_data))
        logger.info("  Batch size = %d", args.valid_batch_size)
        valid_steps, valid_losses, valid_all_steps = valid(
            args=args,
            model=model,
            device=device,
            valid_data=valid_data,
            valid_dataloader=valid_dataloader)
        trainloss.train_loss(steps=valid_steps,
                             losses=valid_losses,
                             epoch=num,
                             args=args,
                             type='valid',
                             max_steps=valid_all_steps)

        # Free the CUDA cache after every epoch
        if 'cuda' in str(device):
            torch.cuda.empty_cache()
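
Finally, a minimal end-to-end sketch of how these functions might be driven; every path, model name, and setting here is an illustrative assumption rather than something the source specifies:

# Hypothetical driver (all values are placeholders)
split_data(args, './data')  # writes train.txt and valid.txt
train_data, train_dataloader = load_dataset(args, 'bert-base-chinese',
                                            type='train')
valid_data, valid_dataloader = load_dataset(args, 'bert-base-chinese',
                                            type='valid')
train(args, 'bert-base-chinese', train_data, train_dataloader, valid_data,
      valid_dataloader)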