def __init__(self,
             config,
             model_path,
             label_path,
             bert_path='chinese-bert-wwm',
             max_seq_length=32):
    self.config = config
    self.model_path = model_path
    self.label_path = label_path
    self.bert_path = bert_path
    self.model = BERTForMultiLabelSequenceClassification(
        self.config, self.config.num_classes)
    self.model.load_state_dict(
        torch.load(self.model_path, map_location=torch.device('cpu')))
    # FP16 variant of the predictor below: half() cuts memory and latency
    # on GPU, but most ops do not support fp16 on CPU
    self.model.half()
    self.model.eval()
    self.model.to(self.config.device)
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
    self.max_seq_length = max_seq_length
    self.processor = TextProcessor()
    self.labels = self.processor.get_labels(self.label_path)
    self.label2id = {label: id_ for id_, label in enumerate(self.labels)}
    self.id2label = {id_: label for id_, label in enumerate(self.labels)}
class Predict:
    def __init__(self, model_path, bert_path, max_seq_length=32):
        self.processor = TextProcessor()
        self.sess_options = SessionOptions()
        # self.sess_options.intra_op_num_threads = 1
        self.sess_options.intra_op_num_threads = psutil.cpu_count(logical=True)
        # self.sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
        self.session = InferenceSession(model_path, self.sess_options)
        self.use_gpu = torch.cuda.is_available()
        self.device = torch.device("cuda:7" if self.use_gpu else "cpu")
        self.tokenizer = BertTokenizer.from_pretrained(bert_path)
        self.max_seq_length = max_seq_length

    def to_numpy(self, tensor):
        return tensor.detach().cpu().numpy(
        ) if tensor.requires_grad else tensor.cpu().numpy()

    def run(self, record):
        text_a, text_b = record[0], record[1]
        example = self.processor._create_single_example(text_a, text_b)
        feature = convert_single_example(example, self.max_seq_length,
                                         self.tokenizer)
        input_ids = torch.tensor(feature.input_ids,
                                 dtype=torch.long).unsqueeze(0)
        input_mask = torch.tensor(feature.input_mask,
                                  dtype=torch.long).unsqueeze(0)
        segment_ids = torch.tensor(feature.segment_ids,
                                   dtype=torch.long).unsqueeze(0)
        # onnxruntime expects numpy arrays, not torch tensors
        ort_inputs = {
            'input_ids': self.to_numpy(input_ids),
            'input_mask': self.to_numpy(input_mask),
            'segment_ids': self.to_numpy(segment_ids)
        }
        ort_outputs = self.session.run(None, ort_inputs)
        print(ort_outputs)
        print(type(ort_outputs))

    def infer(self, data_path):
        pass
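# A minimal usage sketch of the ONNX predictor above; the model and vocab
# paths here are hypothetical placeholders, not the repo's actual paths.
if __name__ == '__main__':
    predictor = Predict('data/match/model.onnx', 'chinese-bert-wwm')
    predictor.run(['今天天气怎么样', '今天天气如何'])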
response["predict"] = predict response["index"] = index_str response["ok"] = True except Exception as e: response["predict"] = 0 response["index"] = index_str response["ok"] = False response_batch["results"].append(response) return response_batch max_seq_len = 32 config = Config('data') tokenizer = BertTokenizer.from_pretrained('./vocab') processor = TextProcessor() def to_numpy(tensor): return tensor.detach().cpu().numpy( ) if tensor.requires_grad else tensor.cpu().numpy() # 需要根据模型类型重写 def infer(bert, bert1, nezha, query_A, query_B): text_a, text_b = query_A, query_B example = processor._create_single_example(text_a, text_b) feature = convert_single_example_dynamic(example, max_seq_len, tokenizer) # feature = convert_single_example(example, max_seq_len, tokenizer)
def train(config, model):
    fgm = FGM(model)
    processor = TextProcessor()
    label_list = processor.get_labels(config.class_path)

    # load the training data
    train_examples = processor.get_train_examples(config.train_path)
    train_features = convert_examples_to_features(train_examples, label_list,
                                                  config.max_seq_length,
                                                  config.tokenizer)
    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in train_features],
                                 dtype=torch.long)
    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                               all_label_ids)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=config.batch_size,
                                  drop_last=True)

    # load the dev data
    dev_examples = processor.get_dev_examples(config.dev_path)
    dev_features = convert_examples_to_features(dev_examples, label_list,
                                                config.max_seq_length,
                                                config.tokenizer)
    all_input_ids_dev = torch.tensor([f.input_ids for f in dev_features],
                                     dtype=torch.long)
    all_input_mask_dev = torch.tensor([f.input_mask for f in dev_features],
                                      dtype=torch.long)
    all_segment_ids_dev = torch.tensor([f.segment_ids for f in dev_features],
                                       dtype=torch.long)
    all_label_ids_dev = torch.tensor([f.label_ids for f in dev_features],
                                     dtype=torch.long)
    dev_data = TensorDataset(all_input_ids_dev, all_input_mask_dev,
                             all_segment_ids_dev, all_label_ids_dev)
    dev_sampler = SequentialSampler(dev_data)
    dev_dataloader = DataLoader(dev_data,
                                sampler=dev_sampler,
                                batch_size=config.batch_size)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = BertAdam(
        optimizer_grouped_parameters,
        lr=config.learning_rate,
        # schedule='warmup_linear',
        warmup=0.05,
        t_total=config.num_epochs * len(train_dataloader))
    # lookahead
    # from optimizer import Lookahead
    # optimizer = Lookahead(optimizer, k=5, alpha=0.5)

    logger.info(f"Training on GPU: {torch.cuda.current_device()}...")
    model.train()
    eval_steps = len(train_dataloader) // 2
    for i in range(config.num_epochs):
        total_batch = 0
        eval_best_loss = float('inf')
        eval_best_auc_score = float('-inf')
        eval_best_acc = float('-inf')
        last_improve = 0
        flag = False
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(config.device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss = model(input_ids, segment_ids, input_mask, label_ids)
            loss.backward()
            # adversarial training: perturb embeddings, backprop the
            # adversarial loss, then restore the original weights
            fgm.attack()
            loss_adv = model(input_ids, segment_ids, input_mask, label_ids)
            loss_adv.backward()
            fgm.restore()
            optimizer.step()
            model.zero_grad()
            logits = model(input_ids, segment_ids, input_mask)
            logger.info(f"Epoch: {i+1}, step: {step+1}, train_loss: {loss}")
        torch.save(model.state_dict(), config.save_path)
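# train() relies on an FGM helper defined elsewhere in this repo. Below is
# a standard Fast Gradient Method sketch consistent with the attack()/
# restore() call sites above; epsilon and the 'word_embeddings' name
# filter are assumptions, not necessarily the repo's actual values.
import torch


class FGM:
    def __init__(self, model, epsilon=1.0, emb_name='word_embeddings'):
        self.model = model
        self.epsilon = epsilon
        self.emb_name = emb_name
        self.backup = {}

    def attack(self):
        # add an L2-normalized gradient perturbation to the embedding weights
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    param.data.add_(self.epsilon * param.grad / norm)

    def restore(self):
        # put the original embedding weights back after the adversarial pass
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                param.data = self.backup[name]
        self.backup = {}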
class Predict:
    def __init__(self,
                 config,
                 model_path,
                 label_path,
                 bert_path='chinese-bert-wwm',
                 max_seq_length=32):
        self.config = config
        self.model_path = model_path
        self.label_path = label_path
        self.bert_path = bert_path
        self.model = BERTForMultiLabelSequenceClassification(
            self.config, self.config.num_classes)
        self.model.load_state_dict(
            torch.load(self.model_path, map_location=torch.device('cpu')))
        self.model.eval()
        self.model.to(self.config.device)
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
        self.max_seq_length = max_seq_length
        self.processor = TextProcessor()
        self.labels = self.processor.get_labels(self.label_path)
        self.label2id = {label: id_ for id_, label in enumerate(self.labels)}
        self.id2label = {id_: label for id_, label in enumerate(self.labels)}

    def run(self, record):
        '''Predict the fine-grained label for a single sentence pair.'''
        text_a, text_b = record[0], record[1]
        example = self.processor._create_single_example(text_a, text_b)
        feature = convert_single_example(example, self.max_seq_length,
                                         self.tokenizer)
        input_ids = torch.tensor(
            feature.input_ids, dtype=torch.long).unsqueeze(0).to(self.config.device)
        input_mask = torch.tensor(
            feature.input_mask, dtype=torch.long).unsqueeze(0).to(self.config.device)
        segment_ids = torch.tensor(
            feature.segment_ids, dtype=torch.long).unsqueeze(0).to(self.config.device)
        logits = self.model(input_ids, segment_ids, input_mask).detach()
        prob = logits.sigmoid()[:, 1].tolist()  # e.g. [0.123]
        # prob = torch.sigmoid(logits)
        # return prob[0].cpu().tolist()[0]
        return prob[0]

    def collect_badcase(self, data_path):
        badcase = []
        cnt = 0
        with open(data_path, 'r', encoding='utf-8') as reader:
            for record in reader:
                print(f'Record {cnt+1}...')
                cnt += 1
                text_a, text_b, label = record.strip().split('\t')
                pre = self.run([text_a, text_b])
                pre_label = '1' if pre > 0.5 else '0'
                if pre_label != label:
                    badcase.append('\t'.join(
                        [text_a, text_b, label, pre_label, str(pre)]))
        return badcase

    def evaluate(self, data_path):
        '''Evaluate the model on the whole dataset.'''
        labels = []
        pres = []
        cnt = 0
        with open(data_path, 'r', encoding='utf-8') as reader:
            for record in reader:
                print(f'Record {cnt+1}...')
                cnt += 1
                text_a, text_b, label = record.strip().split('\t')
                pre = self.run([text_a, text_b])
                labels.append(int(label))
                pres.append(pre)
        fpr, tpr, th = roc_curve(labels, pres, pos_label=1)
        auc_score = auc(fpr, tpr)
        return auc_score, pres

    def inference(self, data_path, to_path):
        pres = []
        cnt = 0
        with open(data_path, 'r', encoding='utf-8') as reader:
            for record in reader:
                print(f'Record {cnt+1}...')
                cnt += 1
                text_a, text_b = record.strip().split('\t')
                pre = self.run([text_a, text_b])
                pres.append(pre)
        with open(to_path, 'w', encoding='utf-8') as writer:
            for pre in pres:
                writer.write(str(pre) + '\n')
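# A minimal usage sketch of the PyTorch predictor (the checkpoint, label,
# and dev-file paths are hypothetical placeholders): score one sentence
# pair, then compute AUC on a tab-separated dev file.
if __name__ == '__main__':
    config = Config('data')
    predictor = Predict(config, 'data/match/bert.ckpt', 'data/match/class.txt')
    prob = predictor.run(['今天天气怎么样', '今天天气如何'])
    auc_score, pres = predictor.evaluate('data/match/dev.txt')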
def train(config, model):
    fgm = FGM(model)
    processor = TextProcessor()
    label_list = processor.get_labels(config.class_path)

    # load the training data
    train_examples = processor.get_train_examples(config.train_path)
    train_features = convert_examples_to_features(train_examples, label_list,
                                                  config.max_seq_length,
                                                  config.tokenizer)
    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in train_features],
                                 dtype=torch.long)
    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                               all_label_ids)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=config.batch_size,
                                  drop_last=True)

    # load the dev data
    dev_examples = processor.get_dev_examples(config.dev_path)
    dev_features = convert_examples_to_features(dev_examples, label_list,
                                                config.max_seq_length,
                                                config.tokenizer)
    all_input_ids_dev = torch.tensor([f.input_ids for f in dev_features],
                                     dtype=torch.long)
    all_input_mask_dev = torch.tensor([f.input_mask for f in dev_features],
                                      dtype=torch.long)
    all_segment_ids_dev = torch.tensor([f.segment_ids for f in dev_features],
                                       dtype=torch.long)
    all_label_ids_dev = torch.tensor([f.label_ids for f in dev_features],
                                     dtype=torch.long)
    dev_data = TensorDataset(all_input_ids_dev, all_input_mask_dev,
                             all_segment_ids_dev, all_label_ids_dev)
    dev_sampler = SequentialSampler(dev_data)
    dev_dataloader = DataLoader(dev_data,
                                sampler=dev_sampler,
                                batch_size=config.batch_size)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=config.num_epochs * len(train_dataloader))

    logger.info(f"Training on GPU: {torch.cuda.current_device()}...")
    model.train()
    eval_steps = len(train_dataloader) // 2
    for i in range(config.num_epochs):
        total_batch = 0
        eval_best_loss = float('inf')
        eval_best_auc_score = float('-inf')
        eval_best_acc = float('-inf')
        last_improve = 0
        flag = False
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(config.device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss = model(input_ids, segment_ids, input_mask, label_ids)
            loss.backward()
            # adversarial training
            fgm.attack()
            loss_adv = model(input_ids, segment_ids, input_mask, label_ids)
            loss_adv.backward()
            fgm.restore()
            optimizer.step()
            model.zero_grad()
            logits = model(input_ids, segment_ids, input_mask)
            logger.info(f"Epoch: {i+1}, step: {step+1}, train_loss: {loss}")
            if (total_batch + 1) % eval_steps == 0:
                torch.save(model.state_dict(), config.save_path)
                eval_accuracy, eval_precision, eval_loss, auc_score = evaluate(
                    config, model, dev_dataloader)
                if auc_score > eval_best_auc_score:
                    eval_best_auc_score = auc_score
                    torch.save(model.state_dict(), config.save_path)
                # if eval_accuracy > eval_best_acc:
                #     eval_best_acc = eval_accuracy
                #     torch.save(model.state_dict(), config.save_path)
                print('*' * 80)
                logger.info(
                    f"Epoch: {i+1}, step: {step+1}, train_loss: {loss}, "
                    f"eval_loss: {eval_loss}, eval_accuracy: {eval_accuracy}, "
                    f"eval_precision: {eval_precision}, auc_score: {auc_score}")
                # save the log
                with open('data/match/logs.txt', 'a+',
                          encoding='utf-8') as log_writer:
                    log_writer.write(
                        f"Epoch: {i+1}, step: {step+1}, train_loss: {loss}, "
                        f"eval_loss: {eval_loss}, eval_accuracy: {eval_accuracy}, "
                        f"eval_precision: {eval_precision}, auc_score: {auc_score}\n")
                model.train()
            total_batch += 1
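# train() calls an evaluate() helper that is defined elsewhere. Below is a
# sketch consistent with its call site (four return values) and with the
# Predict class above; the 0.5 threshold, the sklearn metrics, and the
# label_ids[:, 1] indexing (assuming one-hot two-class labels) are
# assumptions, not the repo's actual implementation.
import torch
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score


def evaluate(config, model, dev_dataloader):
    model.eval()
    y_true, y_prob, losses = [], [], []
    with torch.no_grad():
        for batch in dev_dataloader:
            batch = tuple(t.to(config.device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss = model(input_ids, segment_ids, input_mask, label_ids)
            logits = model(input_ids, segment_ids, input_mask)
            losses.append(loss.item())
            y_prob.extend(logits.sigmoid()[:, 1].cpu().tolist())
            y_true.extend(label_ids[:, 1].long().cpu().tolist())
    y_pred = [int(p > 0.5) for p in y_prob]
    fpr, tpr, _ = roc_curve(y_true, y_prob, pos_label=1)
    return (accuracy_score(y_true, y_pred), precision_score(y_true, y_pred),
            sum(losses) / len(losses), auc(fpr, tpr))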
class Predict:
    def __init__(self, model_path, bert_path):
        self.processor = TextProcessor()
        # self.sess_options = SessionOptions()
        # self.sess_options.intra_op_num_threads = 1
        # self.sess_options.intra_op_num_threads = psutil.cpu_count(logical=True)
        # self.sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
        # self.session = InferenceSession(model_path, self.sess_options)
        self.session = InferenceSession(model_path)
        # self.session_1 = InferenceSession(model_path_1)  # ensemble of several models
        # print(self.session.get_inputs()[2].name)
        # print(len(self.session.get_inputs()))
        # self.use_gpu = torch.cuda.is_available()
        # self.device = torch.device("cuda:7" if self.use_gpu else "cpu")
        self.tokenizer = BertTokenizer.from_pretrained(bert_path)

    def to_numpy(self, tensor):
        return tensor.detach().cpu().numpy(
        ) if tensor.requires_grad else tensor.cpu().numpy()

    def run(self, record):
        text_a, text_b = record[0], record[1]
        example = self.processor._create_single_example(text_a, text_b)
        feature = convert_single_example(example, 32, self.tokenizer)
        input_ids = torch.tensor(feature.input_ids,
                                 dtype=torch.long).unsqueeze(0)
        segment_ids = torch.tensor(feature.segment_ids,
                                   dtype=torch.long).unsqueeze(0)
        input_mask = torch.tensor(feature.input_mask,
                                  dtype=torch.long).unsqueeze(0)
        ort_inputs = {
            'input_ids': self.to_numpy(input_ids),
            'segment_ids': self.to_numpy(segment_ids),
            'input_mask': self.to_numpy(input_mask)
        }
        ort_outputs = self.session.run(None, ort_inputs)
        ort_logits = torch.from_numpy(
            ort_outputs[0])  # e.g. tensor([[4.7433, -4.5335]])
        # ort_logits_1 = torch.from_numpy(ort_outputs_1[0])  # 2-d tensor
        prob = ort_logits.sigmoid()[:, 1].tolist()[0]  # e.g. 0.123
        return prob

    def infer(self, data_path, to_path):
        pres = []
        cnt = 0
        with open(data_path, 'r', encoding='utf-8') as reader:
            for record in reader:
                print(f'Record {cnt+1}...')
                cnt += 1
                text_a, text_b = record.strip().split('\t')
                pre = self.run([text_a, text_b])
                pres.append(pre)
        with open(to_path, 'w', encoding='utf-8') as writer:
            for pre in pres:
                writer.write(str(pre) + '\n')
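# The .onnx file loaded above has to be produced by an export step that is
# not shown here. A sketch using torch.onnx.export, with input/output
# names matching the ort_inputs keys used in run(); the dummy shapes,
# dynamic_axes, and opset version are assumptions.
import torch


def export_onnx(model, to_path, max_seq_length=32):
    model.eval()
    dummy = torch.ones(1, max_seq_length, dtype=torch.long)
    torch.onnx.export(
        model,
        (dummy, dummy, dummy),  # positional: input_ids, segment_ids, input_mask
        to_path,
        input_names=['input_ids', 'segment_ids', 'input_mask'],
        output_names=['logits'],
        dynamic_axes={
            'input_ids': {0: 'batch', 1: 'seq'},
            'segment_ids': {0: 'batch', 1: 'seq'},
            'input_mask': {0: 'batch', 1: 'seq'},
            'logits': {0: 'batch'}
        },
        opset_version=11)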