def start():
    # produce_data()
    # load_model returns a ready-to-use model (see load_model below),
    # so assign it directly rather than passing it to load_state_dict
    model = load_model(args.output_dir)
    # ------------------ Determine CUDA mode ----------------------
    device = torch.device(args.device if torch.cuda.is_available() and not args.no_cuda else "cpu")
    model.to(device)
    print('create_iter')
    eval_iter = create_batch_iter("valid")
    print('create_iter finished')

    # ----------------------- Validation ----------------------------
    model.eval()
    count = 0
    y_predicts, y_labels = [], []
    # eval_acc and eval_f1 are placeholders here and keep their initial values
    eval_loss, eval_acc, eval_f1 = 0, 0, 0
    with torch.no_grad():
        for step, batch in enumerate(eval_iter):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids, output_mask = batch
            bert_encode = model(input_ids, segment_ids, input_mask).cpu()
            eval_los = model.loss_fn(bert_encode=bert_encode, tags=label_ids, output_mask=output_mask)
            eval_loss += eval_los
            count += 1
            predicts = model.predict(bert_encode, output_mask)
            predict_tensor = predicts.cpu()
            label_tensor = label_ids.cpu()
            y_predicts.append(predicts)
            y_labels.append(label_ids)
            entity_precision, entity_recall, entity_f1 = score_predict(label_tensor, predict_tensor)
            print('\n step :%d - eval_loss: %4f - ent_p:%4f - ent_r:%4f - ent_f1:%4f\n'
                  % (step, eval_loss.item() / count, entity_precision, entity_recall, entity_f1))

            label_ids = label_ids.view(1, -1).squeeze()
            predicts = predicts.view(1, -1).squeeze()
            # -1 marks positions masked out by output_mask
            label_ids = label_ids[label_ids != -1]
            predicts = predicts[predicts != -1]
            assert len(label_ids) == len(predicts)

        eval_predicted = torch.cat(y_predicts, dim=0).cpu()
        eval_labeled = torch.cat(y_labels, dim=0).cpu()
        entity_precision, entity_recall, entity_f1 = score_predict(eval_labeled, eval_predicted)
        print('\n\n- eval_loss: %4f - eval_acc:%4f - eval_f1:%4f - ent_p:%4f - ent_r:%4f - ent_f1:%4f\n'
              % (eval_loss.item() / count, eval_acc, eval_f1, entity_precision, entity_recall, entity_f1))
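
# A minimal sketch of what an entity-level scorer like score_predict might do
# (hypothetical: the real score_predict lives elsewhere in this repo). It assumes
# a span decoder such as get_tags_BIESO that turns a tag-id sequence into
# (start, end) pairs, and that -1 marks positions masked out by output_mask.
def score_predict_sketch(label_tensor, predict_tensor, decode_spans):
    gold, pred = set(), set()
    for i, (labels, predicts) in enumerate(zip(label_tensor, predict_tensor)):
        # drop masked positions before decoding spans
        labels = labels[labels != -1].cpu().numpy().tolist()
        predicts = predicts[predicts != -1].cpu().numpy().tolist()
        gold.update((i, tuple(span)) for span in decode_spans(labels))
        pred.update((i, tuple(span)) for span in decode_spans(predicts))
    correct = len(gold & pred)
    precision = correct / len(pred) if pred else 0.0
    recall = correct / len(gold) if gold else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, f1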
def start():
    # Prefer cached data over rebuilding it
    if not os.path.exists(args.TRAIN) or not os.path.exists(args.VALID):
        produce_data(user_define=USER_DEFINE)
    if os.path.exists(args.TRAIN_CACHE):
        train_iter, num_train_steps = torch.load(args.TRAIN_CACHE)
    else:
        train_iter, num_train_steps = create_batch_iter("train")
    if os.path.exists(args.VALID_CACHE):
        eval_iter = torch.load(args.VALID_CACHE)
    else:
        eval_iter = create_batch_iter("dev")
    epoch_size = num_train_steps * args.train_batch_size * args.gradient_accumulation_steps / args.num_train_epochs
    pbar = ProgressBar(epoch_size=epoch_size, batch_size=args.train_batch_size)
    model = Bert_CRF.from_pretrained(args.bert_model, num_tag=len(args.labels))
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)
    fit(model=model,
        training_iter=train_iter,
        eval_iter=eval_iter,
        num_epoch=args.num_train_epochs,
        pbar=pbar,
        num_train_steps=num_train_steps,
        verbose=1)
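
# The branches above only read the caches; this is a hedged sketch of how they
# could be written after the first build. build_caches is a hypothetical helper,
# not part of this repo; it assumes args.TRAIN_CACHE and args.VALID_CACHE are
# writable paths, and mirrors the shapes that torch.load expects above.
def build_caches():
    train_iter, num_train_steps = create_batch_iter("train")
    torch.save((train_iter, num_train_steps), args.TRAIN_CACHE)
    eval_iter = create_batch_iter("dev")
    torch.save(eval_iter, args.VALID_CACHE)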
def load_model(output_dir):
    # Load a trained model that you have fine-tuned
    output_model_file = os.path.join(output_dir, "pytorch_model.bin")
    model_state_dict = torch.load(output_model_file)
    model = Bert_CRF.from_pretrained(args.bert_model, state_dict=model_state_dict)
    return model
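
# Counterpart to load_model: a minimal sketch of how the checkpoint it expects
# could be written (standard PyTorch state_dict saving; save_model is a
# hypothetical helper name, not necessarily the one this repo uses).
def save_model(model, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    output_model_file = os.path.join(output_dir, "pytorch_model.bin")
    torch.save(model.state_dict(), output_model_file)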
def start():
    parser = argparse.ArgumentParser()
    parser.add_argument("--do_not_train_ernie", default=False, action='store_true')
    parser.add_argument("--do_CRF", default=False, action='store_true')
    arg = parser.parse_args()
    args.do_not_train_ernie = arg.do_not_train_ernie
    args.do_CRF = arg.do_CRF

    produce_data()
    train_iter, num_train_steps = create_batch_iter("train")
    eval_iter = create_batch_iter("dev")
    epoch_size = num_train_steps * args.train_batch_size * args.gradient_accumulation_steps / args.num_train_epochs
    pbar = ProgressBar(epoch_size=epoch_size, batch_size=args.train_batch_size)
    if args.load_weight:
        model = load_model(args.output_dir)
    else:
        model = Bert_CRF.from_pretrained(args.bert_model, num_tag=len(args.labels))
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)
    fit(model=model,
        training_iter=train_iter,
        eval_iter=eval_iter,
        num_epoch=args.num_train_epochs,
        pbar=pbar,
        num_train_steps=num_train_steps,
        verbose=1)
def start():
    produce_data()
    train_iter, num_train_steps = create_batch_iter("train")
    eval_iter = create_batch_iter("dev")
    epoch_size = num_train_steps * args.train_batch_size * args.gradient_accumulation_steps / args.num_train_epochs
    pbar = ProgressBar(epoch_size=epoch_size, batch_size=args.train_batch_size)
    model = Bert_CRF.from_pretrained(args.bert_model, num_tag=len(args.labels))
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)
    fit(model=model,
        training_iter=train_iter,
        eval_iter=eval_iter,
        num_epoch=args.num_train_epochs,
        pbar=pbar,
        num_train_steps=num_train_steps,
        verbose=1)
def start():
    # produce_data()
    model = Bert_CRF()
    print('create_iter')
    train_iter, num_train_steps = create_batch_iter("train")
    eval_iter = create_batch_iter("valid")
    print('create_iter finished')
    epoch_size = num_train_steps * args.train_batch_size * args.gradient_accumulation_steps / args.num_train_epochs
    pbar = ProgressBar(epoch_size=epoch_size, batch_size=args.train_batch_size)
    # for name, param in model.named_parameters():
    #     if param.requires_grad:
    #         print(name)
    print('fit')
    fit(model=model,
        training_iter=train_iter,
        eval_iter=eval_iter,
        num_epoch=args.num_train_epochs,
        pbar=pbar,
        num_train_steps=num_train_steps,
        verbose=1)
class entity_extractor:
    def __init__(self):
        print('[INFO] Loading tokenizer')
        self.processor, self.bertTokenizer = init_params()
        label_list = self.processor.get_labels()
        self.label_map = {label: i for i, label in enumerate(label_list)}
        self.tokenizer = BasicTokenizer()
        print('[INFO] Tokenizer loaded')
        print('[INFO] Loading model')
        # load_model returns a fully constructed model (see load_model above),
        # so assign it directly rather than passing it to load_state_dict
        self.model = load_model(args.output_dir)
        self.device = torch.device(args.device if torch.cuda.is_available() and not args.no_cuda else "cpu")
        self.model.to(self.device)
        self.model.eval()
        print('[INFO] Model loaded')

    def extract(self, text):
        text = list(text)
        # truncate to leave room for [CLS] and [SEP]
        if len(text) > args.max_seq_length - 2:
            text = text[:(args.max_seq_length - 2)]
        tokens = ["[CLS]"] + text + ["[SEP]"]
        segment_ids = [0] * len(tokens)
        input_ids = convert_text_to_ids(tokens, self.bertTokenizer)
        input_mask = [1] * len(input_ids)
        padding = [0] * (args.max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding
        assert len(input_ids) == args.max_seq_length
        assert len(input_mask) == args.max_seq_length
        assert len(segment_ids) == args.max_seq_length
        # output_mask filters sub-word pieces out of the BERT output, keeping only
        # the first piece of each word (as recommended by Jacob Devlin in the BERT
        # paper); it also makes the output compatible with the CRF layer
        output_mask = [1 for t in text]
        output_mask = [0] + output_mask + [0]
        output_mask += padding
        text = ''.join(text)

        all_input_ids = torch.tensor([input_ids], dtype=torch.long)
        all_input_mask = torch.tensor([input_mask], dtype=torch.long)
        all_segment_ids = torch.tensor([segment_ids], dtype=torch.long)
        all_output_mask = torch.tensor([output_mask], dtype=torch.long)
        data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_output_mask)
        sampler = SequentialSampler(data)
        iterator = DataLoader(data, sampler=sampler, batch_size=1)

        with torch.no_grad():
            for step, batch in enumerate(iterator):
                batch = tuple(t.to(self.device) for t in batch)
                input_ids, input_mask, segment_ids, output_mask = batch
                bert_encode = self.model(input_ids, segment_ids, input_mask).cpu()
                predicts = self.model.predict(bert_encode, output_mask)
                for ix, predict in enumerate(predicts):
                    predict = predict[predict != -1]
                    pre_tags = get_tags_BIESO(predict.cpu().numpy().tolist())
                    pre_entities = [text[tag[0]:tag[1] + 1] for tag in pre_tags]
                    # batch_size is 1, so return after the first (only) prediction
                    return set(pre_entities)
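
# Usage sketch (hypothetical input text; assumes a fine-tuned checkpoint exists
# under args.output_dir and that this module is run as a script).
if __name__ == '__main__':
    extractor = entity_extractor()
    entities = extractor.extract("Example sentence for entity extraction.")
    print(entities)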