def predict(predict_model_name_or_path, pre_data, pre_dataloader):
    print('Running prediction')
    pro = processer()
    labellist = pro.get_labels()

    # ***** Load the model *****
    print('Loading model')
    config = BertConfig.from_pretrained(predict_model_name_or_path, num_labels=len(labellist))
    model = BertForSequenceClassification.from_pretrained(predict_model_name_or_path, config=config)

    print('Moving model to GPU or CPU')
    # Use the GPU if one is available, otherwise fall back to the CPU
    if torch.cuda.is_available():
        # Single-GPU setup
        torch.cuda.set_device(0)
        device = torch.device('cuda', 0)  # select GPU device 0
    else:
        device = torch.device('cpu')
    model.to(device)

    print('******** Running prediction ********')
    print('  Num examples = %d' % len(pre_data))
    preds = None
    pbar = ProgressBar(n_total=len(pre_dataloader), desc='Predicting')

    # *** Run prediction ***
    model.eval()
    for step, batch in enumerate(pre_dataloader):
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels': batch[3]
            }
            outputs = model(**inputs)
            _, logits = outputs[:2]

        # *** Accumulate the predictions of each batch ***
        if preds is None:
            preds = logits.softmax(-1).detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.softmax(-1).detach().cpu().numpy(), axis=0)
        pbar(step)

    predict_label = np.argmax(preds, axis=1)
    print(preds)
    print(predict_label)
    return preds, predict_label
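
# Helper sketch (not part of the original project): map the argmax indices returned by
# predict() back to label strings. This assumes the label ids follow the order of
# processer().get_labels(), which is how convert_examples_to_features numbers them above.
def ids_to_labels(predict_label):
    labellist = processer().get_labels()
    return [labellist[int(i)] for i in predict_label]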
def load_dataset(args, model_name_or_path, type):
    # Placeholders; filled in below depending on the split
    input_file_name_or_path = ''
    max_seq_len = 0
    batch_size = 0

    tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
    pro = processer()
    labellist = pro.get_labels()

    if type == 'train':
        input_file_name_or_path = os.path.join(args.train_file_path, 'train.txt')
        max_seq_len = args.train_max_seq_len
        batch_size = args.train_batch_size
    elif type == 'valid':
        input_file_name_or_path = os.path.join(args.valid_file_path, 'valid.txt')
        max_seq_len = args.valid_max_seq_len
        batch_size = args.valid_batch_size
    elif type == 'test':
        input_file_name_or_path = os.path.join(args.predict_file_path, 'predict.txt')
        max_seq_len = args.predict_max_seq_len
        batch_size = args.predict_batch_size

    data = pro.read_txt(filename=input_file_name_or_path)
    examples = pro.create_examples(data=data, type=type)
    features = pro.convert_examples_to_features(examples=examples,
                                                tokenizer=tokenizer,
                                                max_length=max_seq_len,
                                                label_list=labellist,
                                                output_mode='classification')
    dataset = pro.create_dataset(features=features)

    sampler = SequentialSampler(dataset)  # iterate over the samples in order
    dataloader = DataLoader(dataset=dataset,
                            sampler=sampler,
                            batch_size=batch_size,
                            collate_fn=collate_fn)
    return data, dataloader
def split_data(args, file_name_or_path):
    pro = processer()
    labellist = pro.get_labels()

    file_name_or_path = os.path.join(file_name_or_path, 'data.txt')
    if not os.path.exists(file_name_or_path):
        # The file is read below, so creating it as a directory (as the original code did)
        # would fail; fail fast with a clear error instead.
        raise FileNotFoundError(f'{file_name_or_path} does not exist')

    with open(file_name_or_path, 'r') as rf:
        lines = rf.readlines()

    train_samples = []
    valid_samples = []
    for label in labellist:
        # *** Collect the samples of each label ***
        # (each line is tab-separated and carries its label in the fourth field)
        samples = [
            line for line in lines
            if line.split('\t')[3].replace('\n', '') == label
        ]
        np.random.shuffle(samples)

        # *** Split this label's samples into training and validation sets ***
        valid_size = int(len(samples) * args.valid_size)  # the product may be fractional, so truncate to int
        train_sample = samples[valid_size:]
        valid_sample = samples[:valid_size]
        train_samples.extend(train_sample)
        valid_samples.extend(valid_sample)

    # *** Shuffle again once every label's samples have been added ***
    np.random.shuffle(train_samples)
    np.random.shuffle(valid_samples)

    # *** Write the splits to file ***
    with open(f'{args.train_file_path}/train.txt', 'w') as wf:
        for line in train_samples:
            wf.write(line)
    # Write valid.txt under valid_file_path so load_dataset() can find it there
    with open(f'{args.valid_file_path}/valid.txt', 'w') as wf:
        for line in valid_samples:
            wf.write(line)
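
# Example of the line format split_data() assumes (hypothetical field contents; only the
# fourth, tab-separated field is actually read, as the label):
#
#   '12\tsome title\tsome content\tsports\n'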
def train(args, model_name_or_path, train_data, train_dataloader, valid_data, valid_dataloader):
    pro = processer()
    labellist = pro.get_labels()
    trainloss = TrainLoss()

    # ***** Load the model *****
    config = BertConfig.from_pretrained(model_name_or_path, num_labels=len(labellist))
    model = BertForSequenceClassification.from_pretrained(model_name_or_path, config=config)

    # ***** Move the model to the device *****
    if torch.cuda.is_available():
        # Single-GPU setup
        torch.cuda.set_device(0)
        device = torch.device('cuda', 0)  # select GPU device 0
    else:
        device = torch.device('cpu')
    model.to(device)

    # ***** Optimizer and learning-rate schedule *****
    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    warmup_steps = int(t_total * args.warmup_proportion)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': args.weight_decay
        },
        {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)

    # ***** Log the training setup *****
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_data))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.train_batch_size)
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    # ***** Training loop *****
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    seed_everything(args.seed)
    for num in range(args.num_train_epochs):
        train_all_steps = 0
        train_steps = []
        train_losses = []
        global_step = 0
        logger.info(f'****************Train epoch-{num}****************')
        pbar = ProgressBar(n_total=len(train_dataloader), desc='Train')

        for step, batch in enumerate(train_dataloader):
            # *** Record the step index for the loss curve ***
            train_all_steps += 1
            train_steps.append(train_all_steps)

            model.train()

            # *** Forward pass ***
            batch = tuple(t.to(device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels': batch[3]
            }
            # Because labels are passed in, the model applies the loss function internally
            # and returns the loss as the first element of outputs
            outputs = model(**inputs)

            # *** Back-propagate the loss ***
            loss = outputs[0]
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)  # gradient clipping

            # *** Record the loss for the loss curve ***
            train_losses.append(loss.detach().cpu().numpy())

            # *** Optimizer step (with gradient accumulation) ***
            pbar(step, {'loss': loss.item()})
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()   # update the parameters
                scheduler.step()   # update the learning-rate schedule
                model.zero_grad()
                global_step += 1

        # Save a checkpoint after each training epoch
        output_dir = os.path.join(args.output_dir, f'model_checkpoint_epoch_{num}')
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        print('')  # keep the progress bar and the log on separate lines
        # logger.info(f'save model checkpoint-{global_step} to {output_dir}')
        model.save_pretrained(output_dir)  # save the model

        # *** Plot a loss curve for this training epoch ***
        trainloss.train_loss(steps=train_steps,
                             losses=train_losses,
                             epoch=num,
                             args=args,
                             type='train',
                             max_step=train_all_steps)

        # ***** Validate after each training epoch *****
        print('')
        logger.info(f'****************Valid epoch-{num}****************')
        logger.info("  Num examples = %d", len(valid_data))
        logger.info("  Batch size = %d", args.valid_batch_size)
        valid_steps, valid_losses, valid_all_steps = valid(args=args,
                                                           model=model,
                                                           device=device,
                                                           valid_data=valid_data,
                                                           valid_dataloader=valid_dataloader)
        trainloss.train_loss(steps=valid_steps,
                             losses=valid_losses,
                             epoch=num,
                             args=args,
                             type='valid',
                             max_steps=valid_all_steps)

        # Clear the CUDA cache after each epoch
        if 'cuda' in str(device):
            torch.cuda.empty_cache()
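

# Minimal end-to-end driver, shown only as a sketch: the argument names mirror the fields
# the functions above read from `args`, but the real argparse setup lives elsewhere in the
# project, so the defaults, the --data_dir flag, and the 'bert-base-chinese' checkpoint
# below are placeholders rather than the project's actual configuration.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', default='./data', help='directory containing data.txt')
    parser.add_argument('--train_file_path', default='./data')
    parser.add_argument('--valid_file_path', default='./data')
    parser.add_argument('--predict_file_path', default='./data')
    parser.add_argument('--output_dir', default='./outputs')
    parser.add_argument('--valid_size', type=float, default=0.2)
    parser.add_argument('--train_max_seq_len', type=int, default=128)
    parser.add_argument('--valid_max_seq_len', type=int, default=128)
    parser.add_argument('--predict_max_seq_len', type=int, default=128)
    parser.add_argument('--train_batch_size', type=int, default=32)
    parser.add_argument('--valid_batch_size', type=int, default=32)
    parser.add_argument('--predict_batch_size', type=int, default=32)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1)
    parser.add_argument('--warmup_proportion', type=float, default=0.1)
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--learning_rate', type=float, default=2e-5)
    parser.add_argument('--adam_epsilon', type=float, default=1e-8)
    parser.add_argument('--max_grad_norm', type=float, default=1.0)
    parser.add_argument('--seed', type=int, default=42)
    args = parser.parse_args()

    model_name_or_path = 'bert-base-chinese'  # placeholder pretrained checkpoint

    # 1. Split data.txt into train.txt / valid.txt
    split_data(args, args.data_dir)
    # 2. Build a dataloader for each split
    train_data, train_dataloader = load_dataset(args, model_name_or_path, type='train')
    valid_data, valid_dataloader = load_dataset(args, model_name_or_path, type='valid')
    # 3. Fine-tune; one checkpoint per epoch lands in args.output_dir
    train(args, model_name_or_path, train_data, train_dataloader, valid_data, valid_dataloader)
    # 4. Predict with the last saved checkpoint
    pre_data, pre_dataloader = load_dataset(args, model_name_or_path, type='test')
    last_checkpoint = os.path.join(args.output_dir, f'model_checkpoint_epoch_{args.num_train_epochs - 1}')
    preds, predict_label = predict(last_checkpoint, pre_data, pre_dataloader)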