def train_base(opt, train_feature,dev_feature=None,test_feature=None):
    """Prepare datasets and a model for ``opt.task_type`` and start training.

    Args:
        opt: Options object. This function reads ``mid_data_dir``,
            ``task_type``, ``bert_dir`` and ``dropout_prob``, plus
            ``use_type_embed`` / ``loss_type`` for the non-CRF models.
        train_feature: Training input handed to ``gen_mrc_data``.
        dev_feature: Dev input handed to ``gen_mrc_data`` (passed through
            even when left as ``None``).
        test_feature: Test input handed to ``gen_mrc_data`` (passed through
            even when left as ``None``).

    Returns:
        None. The outcome of training is whatever ``train`` produces as a
        side effect.
    """
    # Load the entity -> id mapping table.
    mapping_file = os.path.join(opt.mid_data_dir,
                                f'{opt.task_type}_ent2id.json')
    with open(mapping_file, encoding='utf-8') as handle:
        ent2id = json.load(handle)

    # Convert all three splits first, then wrap them, mirroring the order of
    # the calls in the original implementation.
    raw_splits = (
        (train_feature, 'train'),
        (dev_feature, 'dev'),
        (test_feature, 'test'),
    )
    converted = [gen_mrc_data(data, ent2id, split) for data, split in raw_splits]
    train_dataset, dev_dataset, test_dataset = [
        NERDataset(data, opt, ent2id) for data in converted
    ]

    # Pick the constructor arguments for the requested model family; anything
    # that is neither 'crf' nor 'mrc' is treated as a span model.
    task = opt.task_type
    if task == 'crf':
        positional = ('crf', opt.bert_dir)
        keywords = dict(num_tags=len(ent2id),
                        dropout_prob=opt.dropout_prob)
    elif task == 'mrc':
        positional = ('mrc', opt.bert_dir, opt)
        keywords = dict(dropout_prob=opt.dropout_prob,
                        use_type_embed=opt.use_type_embed,
                        loss_type=opt.loss_type)
    else:
        positional = ('span', opt.bert_dir, opt)
        # The span model gets one more tag than there are entity types.
        keywords = dict(num_tags=len(ent2id)+1,
                        dropout_prob=opt.dropout_prob,
                        loss_type=opt.loss_type)
    model = build_model(*positional, **keywords)

    train(opt, model, train_dataset,dev_dataset,test_dataset,ent2id)
def train_model(train_graph_list, y_train, test_graph_list, y_test, topic):
    """Vectorise the train/test graphs and hand them to ``trainer.train``.

    Behaviour is driven by module-level settings read inside this function:
    ``use_tfidf``, ``embed_retrain``, ``embed_update``, ``embed_type``,
    ``embed_model`` and ``model_type``.

    Args:
        train_graph_list: Graphs used to build ``X_train``.
        y_train: Training labels, forwarded unchanged to ``trainer.train``.
        test_graph_list: Graphs used to build ``X_test``.
        y_test: Test labels, forwarded unchanged to ``trainer.train``.
        topic: Currently unused. It was only referenced by a disabled code
            path that loaded topic-specific pre-trained embeddings.

    Returns:
        Whatever ``trainer.train(X_train, X_test, y_train, y_test,
        model_type)`` returns.
    """
    tfidf = train_tfidf(train_graph_list) if use_tfidf else None

    # Embedding model for the training graphs: either retrained on the
    # training node contents or the module-level pre-trained model.
    if embed_retrain:
        tokens_list = graph_utils.extract_node_content(train_graph_list)
        train_embed_model = train_w2v.direct_train_w2v(tokens_list)
    else:
        train_embed_model = embed_model

    X_train = np.vstack([
        get_graph_vector(graph, tfidf, embed_type, train_embed_model)
        for graph in train_graph_list
    ])

    # For the test vectors the TF-IDF model is refit on train + test graphs.
    # NOTE(review): this lets test documents influence the weighting; confirm
    # that this transductive setup is intended.
    if use_tfidf:
        tfidf = train_tfidf(train_graph_list + test_graph_list)

    # Embedding model for the test graphs. Previously the update step always
    # read the retrained model, which is unbound when ``embed_retrain`` is
    # off, so ``embed_update`` alone crashed with UnboundLocalError. The
    # update now starts from whichever model was used for training.
    # NOTE(review): confirm ``train_w2v.update_model`` does not mutate its
    # input in place, since that input may now be the shared ``embed_model``.
    if embed_update:
        test_embed_model = train_w2v.update_model(
            train_embed_model,
            graph_utils.extract_node_content(test_graph_list))
    else:
        test_embed_model = train_embed_model

    X_test = np.vstack([
        get_graph_vector(graph, tfidf, embed_type, test_embed_model)
        for graph in test_graph_list
    ])

    return trainer.train(X_train, X_test, y_train, y_test, model_type)
def train_base(opt, train_examples, dev_examples=None):
    """Train a model for ``opt.task_type`` and optionally pick the best checkpoint.

    The function builds the training dataset, constructs a 'crf', 'mrc' or
    span model, and calls ``train``. When ``dev_examples`` is given, every
    checkpoint returned by ``get_model_path_list(opt.output_dir)`` is
    evaluated on the dev set, the metrics are appended to
    ``<output_dir>/eval_metric.txt``, the best checkpoint path is appended to
    ``./best_ckpt_path.txt`` and the other checkpoint directories are removed.

    Args:
        opt: Options object. This function reads ``mid_data_dir``,
            ``task_type``, ``max_seq_len``, ``bert_dir``, ``use_type_embed``,
            ``dropout_prob``, ``loss_type``, ``eval_batch_size``,
            ``output_dir`` and ``gpu_ids``.
        train_examples: Training examples for
            ``convert_examples_to_features``.
        dev_examples: Optional dev examples; evaluation and checkpoint
            clean-up are skipped when this is ``None``.

    Returns:
        None.
    """
    import shutil

    with open(os.path.join(opt.mid_data_dir, f'{opt.task_type}_ent2id.json'),
              encoding='utf-8') as f:
        ent2id = json.load(f)

    # Only the features (first element of the result) are needed for training.
    train_features = convert_examples_to_features(
        opt.task_type, train_examples, opt.max_seq_len, opt.bert_dir,
        ent2id)[0]
    train_dataset = NERDataset(opt.task_type, train_features, 'train',
                               use_type_embed=opt.use_type_embed)

    if opt.task_type == 'crf':
        model = build_model('crf', opt.bert_dir,
                            num_tags=len(ent2id),
                            dropout_prob=opt.dropout_prob)
    elif opt.task_type == 'mrc':
        model = build_model('mrc', opt.bert_dir,
                            dropout_prob=opt.dropout_prob,
                            use_type_embed=opt.use_type_embed,
                            loss_type=opt.loss_type)
    else:
        # Any other task type is treated as a span model, which gets one more
        # tag than there are entity types.
        model = build_model('span', opt.bert_dir,
                            num_tags=len(ent2id) + 1,
                            dropout_prob=opt.dropout_prob,
                            loss_type=opt.loss_type)

    train(opt, model, train_dataset)

    if dev_examples is None:
        return

    dev_features, dev_callback_info = convert_examples_to_features(
        opt.task_type, dev_examples, opt.max_seq_len, opt.bert_dir, ent2id)
    dev_dataset = NERDataset(opt.task_type, dev_features, 'dev',
                             use_type_embed=opt.use_type_embed)
    dev_loader = DataLoader(dev_dataset, batch_size=opt.eval_batch_size,
                            shuffle=False, num_workers=0)
    dev_info = (dev_loader, dev_callback_info)

    model_path_list = get_model_path_list(opt.output_dir)

    metric_str = ''
    max_f1 = 0.
    max_f1_step = 0
    max_f1_path = ''

    for model_path in model_path_list:
        # The step is the suffix after the last '-' in the checkpoint's
        # parent directory name.
        tmp_step = model_path.split('/')[-2].split('-')[-1]

        model, device = load_model_and_parallel(model, opt.gpu_ids[0],
                                                ckpt_path=model_path)

        if opt.task_type == 'crf':
            tmp_metric_str, tmp_f1 = crf_evaluation(
                model, dev_info, device, ent2id)
        elif opt.task_type == 'mrc':
            tmp_metric_str, tmp_f1 = mrc_evaluation(
                model, dev_info, device)
        else:
            tmp_metric_str, tmp_f1 = span_evaluation(
                model, dev_info, device, ent2id)

        logger.info(f'In step {tmp_step}:\n {tmp_metric_str}')
        metric_str += f'In step {tmp_step}:\n {tmp_metric_str}' + '\n\n'

        if tmp_f1 > max_f1:
            max_f1 = tmp_f1
            max_f1_step = tmp_step
            max_f1_path = model_path

    max_metric_str = f'Max f1 is: {max_f1}, in step {max_f1_step}'
    logger.info(max_metric_str)
    metric_str += max_metric_str + '\n'

    eval_save_path = os.path.join(opt.output_dir, 'eval_metric.txt')
    with open(eval_save_path, 'a', encoding='utf-8') as f1:
        f1.write(metric_str)

    with open('./best_ckpt_path.txt', 'a', encoding='utf-8') as f2:
        f2.write(max_f1_path + '\n')

    # If no checkpoint scored above 0, max_f1_path is still '' and the filter
    # below would match every path, wiping out all checkpoints. Keep them.
    if not max_f1_path:
        logger.warning(
            'No checkpoint reached an F1 above 0; keeping all checkpoints.')
        return

    del_dir_list = [
        os.path.join(opt.output_dir, path.split('/')[-2])
        for path in model_path_list
        if path != max_f1_path
    ]
    for x in del_dir_list:
        shutil.rmtree(x)
        logger.info('{}已删除'.format(x))
def train_base(opt, train_examples, dev_examples=None):
    """Train a model for ``opt.task_type`` and optionally pick the best checkpoint.

    The function builds the training dataset, prints the entity mapping,
    constructs a "crf", "mrc" or span model, and calls ``train``. When
    ``dev_examples`` is given, every checkpoint returned by
    ``get_model_path_list(opt.output_dir)`` is loaded with ``strict=False``
    and evaluated on the dev set. The metrics are appended to
    ``<output_dir>/eval_metric.txt``, the best checkpoint path is appended to
    ``./best_ckpt_path.txt`` and the other checkpoint directories are removed.

    Args:
        opt: Options object. This function reads ``mid_data_dir``,
            ``task_type``, ``max_seq_len``, ``bert_dir``, ``use_type_embed``,
            ``dropout_prob``, ``loss_type``, ``eval_batch_size``,
            ``output_dir`` and ``gpu_ids``.
        train_examples: Training examples for
            ``convert_examples_to_features``.
        dev_examples: Optional dev examples; evaluation and checkpoint
            clean-up are skipped when this is ``None``.

    Returns:
        None.
    """
    import shutil

    with open(os.path.join(opt.mid_data_dir, f"{opt.task_type}_ent2id.json"),
              encoding="utf-8") as f:
        ent2id = json.load(f)

    # Only the features (first element of the result) are needed for training.
    train_features = convert_examples_to_features(
        opt.task_type, train_examples, opt.max_seq_len, opt.bert_dir,
        ent2id)[0]
    train_dataset = NERDataset(opt.task_type, train_features, "train",
                               use_type_embed=opt.use_type_embed)

    # Debug output of the entity mapping.
    print(f"len(ent2id): {len(ent2id)}")
    print(f"ent2id: {ent2id}")

    if opt.task_type == "crf":
        model = build_model("crf", opt.bert_dir,
                            num_tags=len(ent2id),
                            dropout_prob=opt.dropout_prob)
    elif opt.task_type == "mrc":
        model = build_model(
            "mrc",
            opt.bert_dir,
            dropout_prob=opt.dropout_prob,
            use_type_embed=opt.use_type_embed,
            loss_type=opt.loss_type,
        )
    else:
        # Any other task type is treated as a span model, which gets one more
        # tag than there are entity types.
        model = build_model(
            "span",
            opt.bert_dir,
            num_tags=len(ent2id) + 1,
            dropout_prob=opt.dropout_prob,
            loss_type=opt.loss_type,
        )

    train(opt, model, train_dataset)

    if dev_examples is None:
        return

    dev_features, dev_callback_info = convert_examples_to_features(
        opt.task_type, dev_examples, opt.max_seq_len, opt.bert_dir, ent2id)
    dev_dataset = NERDataset(opt.task_type, dev_features, "dev",
                             use_type_embed=opt.use_type_embed)
    dev_loader = DataLoader(dev_dataset, batch_size=opt.eval_batch_size,
                            shuffle=False, num_workers=0)
    dev_info = (dev_loader, dev_callback_info)

    model_path_list = get_model_path_list(opt.output_dir)

    metric_str = ""
    max_f1 = 0.0
    max_f1_step = 0
    max_f1_path = ""

    for model_path in model_path_list:
        # The step is the suffix after the last "-" in the checkpoint's
        # parent directory name.
        tmp_step = model_path.split("/")[-2].split("-")[-1]

        model, device = load_model_and_parallel(model, opt.gpu_ids[0],
                                                ckpt_path=model_path,
                                                strict=False)

        if opt.task_type == "crf":
            tmp_metric_str, tmp_f1 = crf_evaluation(
                model, dev_info, device, ent2id)
        elif opt.task_type == "mrc":
            tmp_metric_str, tmp_f1 = mrc_evaluation(
                model, dev_info, device)
        else:
            tmp_metric_str, tmp_f1 = span_evaluation(
                model, dev_info, device, ent2id)

        logger.info(f"In step {tmp_step}:\n {tmp_metric_str}")
        metric_str += f"In step {tmp_step}:\n {tmp_metric_str}" + "\n\n"

        if tmp_f1 > max_f1:
            max_f1 = tmp_f1
            max_f1_step = tmp_step
            max_f1_path = model_path

    max_metric_str = f"Max f1 is: {max_f1}, in step {max_f1_step}"
    logger.info(max_metric_str)
    metric_str += max_metric_str + "\n"

    eval_save_path = os.path.join(opt.output_dir, "eval_metric.txt")
    with open(eval_save_path, "a", encoding="utf-8") as f1:
        f1.write(metric_str)

    with open("./best_ckpt_path.txt", "a", encoding="utf-8") as f2:
        f2.write(max_f1_path + "\n")

    # If no checkpoint scored above 0, max_f1_path is still "" and the filter
    # below would match every path, wiping out all checkpoints. Keep them.
    if not max_f1_path:
        logger.warning(
            "No checkpoint reached an F1 above 0; keeping all checkpoints.")
        return

    del_dir_list = [
        os.path.join(opt.output_dir, path.split("/")[-2])
        for path in model_path_list
        if path != max_f1_path
    ]
    for x in del_dir_list:
        shutil.rmtree(x)
        logger.info("{}已删除".format(x))