def setup(self, model):
    bert_config_path = config.pretrained_model_path + config.MODEL_PATH_MAP[model] + "/config.json"
    bert_model_path = config.pretrained_model_path + config.MODEL_PATH_MAP[model] + "/model.bin"
    bert_vocab_path = config.pretrained_model_path + config.MODEL_PATH_MAP[model] + "/vocab.txt"

    # Build the pretrained language model, wrap it for multi-GPU use, and load its tokenizer.
    lm_config = config.MODEL_CLASSES[model][0].from_pretrained(bert_config_path)
    self.lm_model = nn.DataParallel(
        config.MODEL_CLASSES[model][1].from_pretrained(bert_model_path, config=lm_config)
    ).to(config.device)
    self.lm_tokenizer = config.MODEL_CLASSES[model][2](bert_vocab_path, do_lower_case=False)

    self.train_data = CustomDataset(self.train_data, self.lm_tokenizer, self.rel2id, self.num_rels)
    # Set up the data loader.
    self.train_batcher = DataLoader(self.train_data,
                                    config.batch_size,
                                    drop_last=True,
                                    shuffle=True,
                                    collate_fn=collate_wrapper)

    self.subject_model = nn.DataParallel(SubJectModel()).to(config.device)
    self.object_model = nn.DataParallel(ObjectModel(self.num_rels)).to(config.device)

    self.criterion = nn.BCELoss(reduction="none")
    self.models_params = (list(self.lm_model.parameters())
                          + list(self.subject_model.parameters())
                          + list(self.object_model.parameters()))
    self.optimizer = torch.optim.Adam(self.models_params, lr=config.lr)

    self.start_step = None
    if config.load_weight:
        print("start loading weight...")
        state = torch.load(config.model_file_path,
                           map_location=lambda storage, location: storage)
        self.lm_model.module.load_state_dict(state['lm_model'])
        self.object_model.module.load_state_dict(state['object_model'])
        self.subject_model.module.load_state_dict(state['subject_model'])
        self.start_step = state['step']
        self.optimizer.load_state_dict(state['optimizer'])
        if config.use_cuda:
            # Move the restored optimizer state tensors back onto the GPU.
            for opt_state in self.optimizer.state.values():
                for k, v in opt_state.items():
                    if torch.is_tensor(v):
                        opt_state[k] = v.cuda()
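# For reference, a minimal sketch of the checkpoint-saving counterpart to the loading
# code above. The dict keys are taken from the load code; the helper name itself is an
# assumption and not part of the original project.
def save_checkpoint(self, step, path):
    state = {
        'lm_model': self.lm_model.module.state_dict(),
        'subject_model': self.subject_model.module.state_dict(),
        'object_model': self.object_model.module.state_dict(),
        'optimizer': self.optimizer.state_dict(),
        'step': step,
    }
    torch.save(state, path)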
def __init__(self, labyrinth_ctrl, hero_ctrl, guard_ctrl, game_engine):
    super().__init__(game_engine, labyrinth_ctrl)
    self._hero_ctrl = hero_ctrl
    self._guard_ctrl = guard_ctrl
    self._model = ObjectModel(self,
                              ("pill", PILL_FILE),
                              ("diluent", DILUENT_FILE),
                              ("needle", NEEDLE_FILE))
    self._view = ObjectView(self, self._model, game_engine)
    self.setting_collisions()
    object_map = json.load(fp)
    object_class_num = len(object_map.keys())  # number of object classes

# Reads id2object_map.
# id2object_1.json uses a finer-grained version of the labels.
id2object_map_path = os.path.join(args.data_path, "id2object_1.json")
if not (os.path.exists(id2object_map_path) and os.path.isfile(id2object_map_path)):
    sys.exit("{} does not exist or is not a file.".format(id2object_map_path))
with open(id2object_map_path, 'r', encoding='utf8') as fp:
    id2object_map = json.load(fp)

roberta_base_name = "/pretrains/pt/chinese_RoBERTa-wwm-ext_pytorch"
roberta_large_name = "/pretrains/pt/clue-roberta-chinese-clue-large"

model_object = ObjectModel(roberta_base_name, 768, object_class_num)
if args.init_checkpoint is not None and os.path.exists(args.init_checkpoint):
    model_object.load_state_dict(t.load(args.init_checkpoint))
model_object = model_object.cuda()

criterion = nn.CrossEntropyLoss()  # cross-entropy loss
tokenizer = BertTokenizerFast.from_pretrained(
    "/home/lawson/pretrain/bert-base-chinese")
collator = TrainSubjectDataCollator()


def set_random_seed(seed):
    """Sets random seed."""
    random.seed(seed)
    np.random.seed(seed)
    # t.seed(seed)  # why does torch also need its seed set? (see the sketch below)
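# A minimal sketch of seeding torch as well, answering the question above. The helper
# name is illustrative and not part of the original code, but the calls are standard
# PyTorch APIs; seeding torch keeps weight init, dropout, and shuffling reproducible.
def set_full_random_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    t.manual_seed(seed)           # seeds the CPU RNG used for init and dropout
    t.cuda.manual_seed_all(seed)  # seeds the RNG on every visible GPU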
def do_predict(model_subject_path, model_object_path, model_relation_path):
    # Runs the full three-stage prediction pipeline: subject -> object -> relation.
    logger.info("\n====================start predicting====================")
    logger.info("\n===============parameters for this run:================")
    for k, v in vars(args).items():
        logger.info(f"{k,v}")

    bert_name_or_path = "/home/lawson/pretrain/bert-base-chinese"
    roberta_name_or_path = "/pretrains/pt/chinese_RoBERTa-wwm-ext_pytorch"

    model_subject = SubjectModel(bert_name_or_path, 768, out_fea=subject_class_num)
    # model_subject = SubjectModel(roberta_name_or_path, 768, out_fea=subject_class_num)
    model_subject.load_state_dict(t.load(model_subject_path))
    model_subject = model_subject.cuda()

    model_object = ObjectModel(bert_name_or_path, 768, object_class_num)
    model_object = model_object.cuda()
    model_object.load_state_dict(t.load(model_object_path))

    model_relation = RelationModel(roberta_name_or_path, relation_class_num)
    model_relation = model_relation.cuda()
    model_relation.load_state_dict(t.load(model_relation_path))

    tokenizer = BertTokenizerFast.from_pretrained("/home/lawson/pretrain/bert-base-chinese")

    # predict_file_path = os.path.join(args.data_path, 'train_data_2_predict.json')
    # subject_name = model_subject_path.
    dev_data_path = (args.dev_data_path).split("/")[-1].split(".")[0]
    # Tag derived from the last two components of the relation checkpoint path.
    relation_ckpt_tag = (args.model_relation_path).split("/")
    relation_ckpt_tag = "_".join(relation_ckpt_tag[-2::])
    relation_ckpt_tag = relation_ckpt_tag.split(".")[0]
    predict_file_path = os.path.join(args.data_path, dev_data_path) + f"_predict_{relation_ckpt_tag}.json"
    batch_file_path = f"/home/lawson/program/DuIE_py/data/{dev_data_path}_subject_object_relation_{relation_ckpt_tag}.txt"

    # Loads dataset.
    dev_dataset = PredictSubjectDataset.from_file(
        # os.path.join(args.data_path, 'train_data_2.json'),
        args.dev_data_path,
        tokenizer,
        args.max_seq_length,
        True
    )
    collator = PredictSubjectDataCollator()
    dev_data_loader = DataLoader(
        dataset=dev_dataset,
        batch_size=args.batch_size,
        collate_fn=collator,  # custom collator
    )

    model_subject.eval()
    model_object.eval()
    model_relation.eval()

    all_known_subjects = get_all_subjects(train_data_path="/home/lawson/program/DuIE_py/data/train_data.json")
    all_country = get_all_country(train_data_path="/home/lawson/program/DuIE_py/data/train_data.json")

    if os.path.exists(batch_file_path):
        logger.info("subject_object_relation.txt already exists, please handle it first")
        sys.exit(0)

    res = []  # final prediction results
    invalid_num = 0  # number of examples with no prediction
    with t.no_grad():
        for batch in tqdm(dev_data_loader):
            # batch_origin_info holds the original JSON records
            input_ids, token_type_ids, attention_mask, batch_origin_info, offset_mapping = batch
            # labels size = [batch_size, max_seq_length]
            logits_1 = model_subject(input_ids=input_ids,
                                     token_type_ids=token_type_ids,
                                     attention_mask=attention_mask)
            # logits size = [batch_size, max_seq_len, class_num]
            # decode the predicted subjects
            # temp = get_rid_of_number_in_str(origin_info[0]['text'])
            # origin_info[0]['text'] = temp
            batch_subjects, batch_subject_labels = decode_subject(logits_1,
                                                                  id2subject_map,
                                                                  input_ids,
                                                                  tokenizer,
                                                                  batch_origin_info,
                                                                  offset_mapping,
                                                                  all_known_subjects)

            object_invalid_num = 0
            # ====== build the subtask-2 (object) inputs from origin_info ==========
            # object_input_ids may hold more rows than args.batch_size here
            object_input_ids, object_token_type_ids, object_attention_mask, \
            object_labels, object_origin_info, object_offset_mapping = from_dict2object(
                batch_subjects=batch_subjects,
                batch_origin_dict=batch_origin_info,
                tokenizer=tokenizer,
                max_length=args.max_seq_length,
            )
            object_input_ids = t.tensor(object_input_ids).cuda()
            object_token_type_ids = t.tensor(object_token_type_ids).cuda()
            object_attention_mask = t.tensor(object_attention_mask).cuda()
            logits_2 = model_object(input_ids=object_input_ids,
                                    token_type_ids=object_token_type_ids,
                                    attention_mask=object_attention_mask)
            batch_objects, batch_object_labels = decode_object(
                logits_2,
                id2object_map,
                tokenizer,
                object_input_ids,
                object_origin_info,
                object_offset_mapping,
                logger
            )
            if len(batch_objects[0]) == 0:
                invalid_num += 1
                # print("----- no object predicted ----------")
                continue

            # ====== build the subtask-3 (relation) inputs from subject + object ==========
            relation_input_ids, relation_token_type_ids, \
            relation_attention_mask, relation_labels = from_dict2_relation(batch_subjects,
                                                                           batch_objects,
                                                                           batch_origin_info,
                                                                           tokenizer,
                                                                           args.max_seq_length)
            relation_input_ids = t.tensor(relation_input_ids).cuda()
            relation_token_type_ids = t.tensor(relation_token_type_ids).cuda()
            relation_attention_mask = t.tensor(relation_attention_mask).cuda()
            if relation_input_ids.size(0) == 0:
                continue

            # with labels=None the model only returns logits (with labels it would also return the loss)
            out = model_relation(input_ids=relation_input_ids,
                                 token_type_ids=relation_token_type_ids,
                                 attention_mask=relation_attention_mask,
                                 labels=None)
            logits = out.logits  # final classification scores, size [batch_size, relation_class_num]
            batch_relations = decode_relation_class(logits, id2relation_map)
            # batch_relations = add_relation_of_country(batch_subjects, batch_subject_labels,
            #     batch_objects, batch_object_labels, batch_relations, batch_origin_info)

            # assemble the final results
            cur_res = post_process_2(batch_subjects,
                                     batch_objects,
                                     batch_relations,
                                     batch_origin_info)
            res.extend(cur_res)

            # write out the intermediate results of the three stages
            with open(batch_file_path, 'a') as f:
                f.write(str(batch_subjects) + "\n")
                f.write(str(batch_objects) + "\n")
                f.write(str(batch_relations) + "\n")
                f.write("\n")

    # write out the final predictions
    with open(predict_file_path, "w", encoding="utf-8") as f:
        for line in res:
            json_str = json.dumps(line, ensure_ascii=False)
            # print(json_str)
            f.write(json_str)
            f.write('\n')
    logger.info(f"number of examples with no prediction: {invalid_num}")
    logger.info("=====predicting complete=====")
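# A minimal usage sketch for the three-stage pipeline above; the checkpoint paths are
# hypothetical placeholders, only the call signature comes from do_predict itself.
def run_pipeline_example():
    do_predict(model_subject_path="checkpoints/model_subject.pdparams",
               model_object_path="checkpoints/model_object.pdparams",
               model_relation_path="checkpoints/model_relation.pdparams")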
def predict_subject_object(model_subject_path, model_object_path):
    # Predicts subjects and objects, then evaluates them against the dev data.
    print("\n====================start predicting / evaluating ====================")
    # name_or_path = "/pretrains/pt/chinese_RoBERTa-wwm-ext_pytorch"
    subject_name_or_path = "/home/lawson/pretrain/bert-base-chinese"
    model_subject = SubjectModel(subject_name_or_path, 768, out_fea=subject_class_num)
    model_subject.load_state_dict(t.load(model_subject_path))
    model_subject = model_subject.cuda()

    object_name_or_path = "/home/lawson/pretrain/bert-base-chinese"
    model_object = ObjectModel(object_name_or_path, 768, object_class_num)
    model_object = model_object.cuda()
    model_object.load_state_dict(t.load(model_object_path))

    tokenizer = BertTokenizerFast.from_pretrained("/home/lawson/pretrain/bert-base-chinese")

    # Loads dataset.
    dev_dataset = PredictSubjectDataset.from_file(
        args.dev_data_path,
        tokenizer,
        args.max_seq_length,
        True
    )
    collator = PredictSubjectDataCollator()
    dev_data_loader = DataLoader(
        dataset=dev_dataset,
        batch_size=args.batch_size,
        collate_fn=collator,  # custom collator
    )

    model_subject.eval()
    model_object.eval()
    all_known_subjects = get_all_subjects("/home/lawson/program/DuIE_py/data/train_data.json")

    res = []  # final prediction results
    subject_invalid_num = 0  # number of examples with no prediction
    temp = (args.dev_data_path).split("/")[-1].split('.')[0]
    subject_object_predict_file = f"./{temp}_subject_object_predict.txt"
    if os.path.exists(subject_object_predict_file):
        os.remove(subject_object_predict_file)

    with t.no_grad():
        for batch in tqdm(dev_data_loader):
            # batch_origin_info holds the original JSON records
            input_ids, token_type_ids, attention_mask, batch_origin_info, offset_mapping = batch
            # labels size = [batch_size, max_seq_length]
            logits_1 = model_subject(input_ids=input_ids,
                                     token_type_ids=token_type_ids,
                                     attention_mask=attention_mask)
            # logits size = [batch_size, max_seq_len, class_num]
            # decode the predicted subjects
            # temp = get_rid_of_number_in_str(origin_info[0]['text'])
            # origin_info[0]['text'] = temp
            batch_subjects, batch_subject_labels = decode_subject(logits_1,
                                                                  id2subject_map,
                                                                  input_ids,
                                                                  tokenizer,
                                                                  batch_origin_info,
                                                                  offset_mapping,
                                                                  all_known_subjects)

            logger.info("\n====================start predicting object ====================")
            object_invalid_num = 0
            # ====== build the subtask-2 (object) inputs from origin_info ==========
            # object_input_ids may hold more rows than args.batch_size here
            object_input_ids, object_token_type_ids, object_attention_mask, \
            object_labels, object_origin_info, object_offset_mapping = from_dict2object(
                batch_subjects=batch_subjects,
                batch_origin_dict=batch_origin_info,
                tokenizer=tokenizer,
                max_length=args.max_seq_length,
            )
            object_input_ids = t.tensor(object_input_ids).cuda()
            object_token_type_ids = t.tensor(object_token_type_ids).cuda()
            object_attention_mask = t.tensor(object_attention_mask).cuda()
            logits_2 = model_object(input_ids=object_input_ids,
                                    token_type_ids=object_token_type_ids,
                                    attention_mask=object_attention_mask)
            batch_objects, batch_object_labels = decode_object(
                logits_2,
                id2object_map,
                tokenizer,
                object_input_ids,
                object_origin_info,
                object_offset_mapping,
                logger
            )
            # visualise the subject + object predictions
            visualize_subject_object(subject_object_predict_file, batch_subjects, batch_objects)

    # evaluate
    cal_subject_object_metric(subject_object_predict_file, args.dev_data_path)
def do_train_2(model_subject_path, model_object_path, model_relation_path):
    # Trains the relation model on top of frozen subject / object models.
    logger.info(
        "\n====================start predicting / evaluating ===================="
    )
    subject_name_or_path = "/home/lawson/pretrain/bert-base-chinese"
    model_subject = SubjectModel(subject_name_or_path, 768, out_fea=subject_class_num)
    model_subject.load_state_dict(t.load(model_subject_path))
    model_subject = model_subject.cuda()

    object_name_or_path = "/home/lawson/pretrain/bert-base-chinese"
    model_object = ObjectModel(object_name_or_path, 768, object_class_num)
    model_object = model_object.cuda()
    model_object.load_state_dict(t.load(model_object_path))

    relation_name_or_path = "/pretrains/pt/chinese_RoBERTa-wwm-ext_pytorch"
    model_relation = RelationModel(relation_name_or_path, relation_class_num)
    model_relation = model_relation.cuda()
    model_relation.load_state_dict(t.load(model_relation_path))

    tokenizer = BertTokenizerFast.from_pretrained(
        "/pretrains/pt/chinese_RoBERTa-wwm-ext_pytorch")

    # Loads dataset.
    # TrainSubjectDataset is used here because the raw records are needed to build the relation training data.
    logger.info(f"Preprocessing data, loaded from {args.train_data_path}")
    train_dataset = TrainSubjectDataset.from_file(args.train_data_path,
                                                  tokenizer,
                                                  args.max_seq_length,
                                                  True)
    # DistributedBatchSampler (paddle) was replaced with DistributedSampler (torch);
    # with DistributedSampler the data would be loaded by multiple processes.
    # train_batch_sampler = DistributedSampler(
    #     train_dataset,
    #     shuffle=True,
    #     drop_last=True
    # )
    collator = TrainSubjectDataCollator()
    train_data_loader = DataLoader(
        dataset=train_dataset,
        # batch_sampler=train_batch_sampler,
        batch_size=args.batch_size,
        collate_fn=collator,  # custom collator
    )

    model_subject.eval()
    viz = Visdom()
    win = "train_loss_negative_2"
    res = []  # final prediction results
    subject_invalid_num = 0  # number of examples with no predicted subject
    all_known_subjects = get_all_subjects(
        train_data_path="/home/lawson/program/DuIE_py/data/train_data.json")

    # Freeze the subject and object models; only the relation model is trained.
    for param in model_subject.parameters():
        param.requires_grad = False
    for param in model_object.parameters():
        param.requires_grad = False

    optimizer = t.optim.AdamW([
        {
            'params': model_relation.parameters(),
            'lr': 2e-5
        },
    ])

    # Starts training.
    global_step = 0
    logging_steps = 50
    save_steps = 5000
    step = 1
    logging_loss = 0
    for epoch in tqdm(range(args.num_train_epochs)):
        total_neg_cnt = 0
        total_pos_cnt = 0
        for batch in tqdm(train_data_loader):
            # batch_origin_info holds the original JSON records
            input_ids, token_type_ids, attention_mask, batch_origin_info, batch_labels, offset_mapping = batch
            # labels size = [batch_size, max_seq_length]
            logits_1 = model_subject(input_ids=input_ids,
                                     token_type_ids=token_type_ids,
                                     attention_mask=attention_mask)
            # logits size = [batch_size, max_seq_len, class_num]
            # decode the predicted subjects
            # temp = get_rid_of_number_in_str(origin_info[0]['text'])
            # origin_info[0]['text'] = temp
            batch_subjects, batch_subject_labels = decode_subject(
                logits_1, id2subject_map, input_ids, tokenizer,
                batch_origin_info, offset_mapping, all_known_subjects)

            # Deduplicate the subjects; batch_subjects may be empty and ideally
            # would be handled the same way as a normal subjects list.
            if len(batch_subjects[0]) == 0:
                # no subject predicted
                subject_invalid_num += 1
                continue

            object_invalid_num = 0
            # ====== build the subtask-2 (object) inputs from origin_info ==========
            # object_input_ids may hold more rows than args.batch_size here
            object_input_ids, object_token_type_ids, object_attention_mask, \
            object_labels, object_origin_info, object_offset_mapping = from_dict2object(
                batch_subjects=batch_subjects,
                batch_origin_dict=batch_origin_info,
                tokenizer=tokenizer,
                max_length=args.max_seq_length,
            )
            object_input_ids = t.tensor(object_input_ids).cuda()
            object_token_type_ids = t.tensor(object_token_type_ids).cuda()
            object_attention_mask = t.tensor(object_attention_mask).cuda()
            logits_2 = model_object(input_ids=object_input_ids,
                                    token_type_ids=object_token_type_ids,
                                    attention_mask=object_attention_mask)
            batch_objects, batch_object_labels = decode_object(
                logits_2, id2object_map, tokenizer, object_input_ids,
                object_origin_info, object_offset_mapping, logger)

            relation_input_ids, relation_token_type_ids, relation_attention_mask, \
            relation_labels, batch_neg_cnt, batch_pos_cnt = get_negative_relation_data(
                batch_subjects, batch_objects, batch_origin_info, tokenizer, max_length=128)
            relation_input_ids = t.tensor(relation_input_ids).cuda()
            relation_token_type_ids = t.tensor(relation_token_type_ids).cuda()
            relation_attention_mask = t.tensor(relation_attention_mask).cuda()
            relation_labels = t.tensor(relation_labels).cuda()
            if relation_input_ids.size(0) < 1:
                continue
            logger.info(f"relation_input_ids.size(0) = {relation_input_ids.size(0)}")

            # Cap the relation batch at 32 rows to avoid running out of GPU memory.
            if relation_input_ids.size(0) > 32:
                out = model_relation(
                    input_ids=relation_input_ids[0:32, :],
                    token_type_ids=relation_token_type_ids[0:32, :],
                    attention_mask=relation_attention_mask[0:32, :],
                    labels=relation_labels[0:32])
                logger.info(f"{batch_origin_info}")
            else:
                # with labels given, the model returns the loss directly
                out = model_relation(input_ids=relation_input_ids,
                                     token_type_ids=relation_token_type_ids,
                                     attention_mask=relation_attention_mask,
                                     labels=relation_labels)
            loss = out.loss
            loss.backward()
            optimizer.step()
            # lr_scheduler.step()
            optimizer.zero_grad()

            if relation_input_ids.size(0) > 32:
                avg_loss = loss.item() / 32
            else:
                avg_loss = loss.item() / relation_input_ids.size(0)
            logger.info(f"average loss per sample: avg_loss = {avg_loss}")
            if avg_loss > 2:
                # batches with such a high loss deserve a closer look
                logger.info(f"{batch_origin_info}")
            logging_loss += avg_loss

            # logging
            if global_step % logging_steps == 0 and global_step:
                viz.line([logging_loss], [global_step], win=win, update="append")
                logging_loss = 0

            # save the model
            if global_step % save_steps == 0 and global_step != 0:
                logger.info(
                    f"saving checkpoint model_relation_{513882+global_step}.pdparams to {args.output_dir}"
                )
                cur_model_name = os.path.join(
                    args.output_dir,
                    "model_relation_%d_roberta.pdparams" % (513882 + global_step))
                t.save(model_relation.state_dict(), cur_model_name)

            total_neg_cnt += batch_neg_cnt
            total_pos_cnt += batch_pos_cnt
            logger.info(f"batch_neg_cnt = {batch_neg_cnt}\n"
                        f"batch_pos_cnt = {batch_pos_cnt}\n"
                        f"total_neg_cnt = {total_neg_cnt}\n"
                        f"total_pos_cnt = {total_pos_cnt}")
            step += 1
            global_step += 1

        # save the model after each epoch
        t.save(
            model_relation.state_dict(),
            os.path.join(
                args.output_dir,
                "model_relation_%d_roberta_epoch.pdparams" % (513882 + global_step)))
    logger.info("\n=====training complete=====")