def 报告属性(self, 和弦集合):
    """Build a per-chord attribute report and return it as a formatted string.

    和弦集合 yields (name, chord tones, bass) triples; the chord tones may be
    given as a string (built via 构造和弦) or as a collection (via 生成和弦).
    For each chord the dominant/subdominant motion, tritone, leading tone and
    semitone relations to the tonic triad/seventh are collected.

    Side effect: appends a comma-separated dump of the rows to "a.txt".
    NOTE(review): the hard-coded "a.txt" looks like a debug leftover — confirm.
    """
    rows = []
    for 名, 和弦组成音, 低音 in 和弦集合:
        # Two construction paths depending on how the chord tones are given.
        build = self.构造和弦 if isinstance(和弦组成音, str) else self.生成和弦
        和弦 = build(和弦组成音, 低音)

        # Fifth motion: dominant beats subdominant; empty when neither holds.
        if self.存在属进行(和弦):
            五度进行 = "属"
        elif self.存在下属进行(和弦):
            五度进行 = "下属"
        else:
            五度进行 = ""

        三全音 = "V" if self.存在三全音(和弦) else ""
        半音关系_三 = self.存在半音关系(和弦, 主和弦类型="三")
        半音关系_七 = self.存在半音关系(和弦, 主和弦类型="七")
        导音 = "V" if self.存在导音(和弦) else ""

        rows.append([
            str(名), str(五度进行), str(三全音), str(导音),
            str(半音关系_三), str(半音关系_七),
        ])

    # Append a raw CSV-ish dump of the rows for offline inspection.
    with open("a.txt", "a+") as fil:
        fil.write(self.名 + "\n")
        for row in rows:
            for cell in row:
                fil.write(str(cell) + ",")
            fil.write("\n")

    table = beautiful_str(
        ["名", "五度进行", "三全音", "导音", "半音关系(主3)", "半音关系(主7)"],
        rows,
    )
    return self.名 + table
def _parse_generated_relations(doc_text):
    """Parse one document's generated relation lines into [u, v, type] triples.

    Each non-empty line matches ``TYPE(....IDX , ....IDX[,REVERSE])``; entity
    indices are 1-based in the text and returned 0-based.  A trailing
    ",REVERSE" marker swaps the pair direction.
    """
    pattern = r"(.*)\(.*\.(\d*)\,.*\.(\d*)(.*)\)"
    triples = []
    for line in filter(None, doc_text.strip().split("\n")):
        rel_type, u, v, rev = re.findall(pattern, line)[0]
        assert (not rev) or (rev == ",REVERSE")
        if rev:
            u, v = v, u
        triples.append([int(u) - 1, int(v) - 1, rel_type])
    return triples


def compare(C, logger, dataset, models_1, models_2, generator):
    """Generate relation outputs from two models over *dataset*, write a
    human-readable .txt and a .json comparison to ``C.gene_file``, and print
    both models' F1 scores against the golden keyfile.

    Interface unchanged; models are put in eval mode, batched with a fixed
    batch size of 8, and decoded with bert-base-uncased.
    """
    # ----- determine some arguments and prepare model -----
    bert_type = "bert-base-uncased"
    tokenizer = BertTokenizer.from_pretrained(bert_type)

    golden = write_keyfile(dataset, generator)

    models_1 = models_1.eval()
    models_2 = models_2.eval()

    batch_size = 8
    batch_numb = (len(dataset) // batch_size) + int((len(dataset) % batch_size) != 0)

    # ----- gene -----
    readable_info = ""
    json_info = []
    all_generated_1 = ""
    all_generated_2 = ""
    for batch_id in tqdm(range(batch_numb), ncols=70, desc="Generating..."):
        # ----- get data -----
        data = dataset[batch_id * batch_size:(batch_id + 1) * batch_size]
        sents, ents, anss, data_ent = get_data_from_batch(data, device=tc.device(C.device))

        with tc.no_grad():
            preds_1 = models_1(sents, ents, output_preds=True)
            preds_2 = models_2(sents, ents, output_preds=True)

        # ----- get generated output -----
        ans_rels = [[(u, v) for u, v, t in bat] for bat in anss] if C.gene_in_data else None
        generated_1 = generator(preds_1, data_ent, ans_rels=ans_rels, split_generate=True)
        generated_2 = generator(preds_2, data_ent, ans_rels=ans_rels, split_generate=True)
        all_generated_1 += "".join(generated_1)
        all_generated_2 += "".join(generated_2)

        for text_id in range(len(data)):
            # Globally unique 1-based document id across all batches.
            doc_id = batch_id * batch_size + text_id + 1

            # ----- form data structure -----
            # text: strip right padding, drop [CLS]/[SEP] before decoding
            tmp_sents = sents[text_id]
            while tmp_sents[-1] == 0:  # remove padding
                tmp_sents = tmp_sents[:-1]
            text = tokenizer.decode(tmp_sents[1:-1])

            # entitys: attach decoded surface form, then convert token spans
            # to character offsets in the decoded text, and prepend the index.
            tmp_ents = ents[text_id]
            for i in range(len(tmp_ents)):
                tmp_ents[i].append(tokenizer.decode(tmp_sents[tmp_ents[i][0]:tmp_ents[i][1]]))
                tmp_ents[i][0] = len(tokenizer.decode(tmp_sents[1:tmp_ents[i][0]])) + 1  # chars before (+1 is for space)
                tmp_ents[i][1] = len(tokenizer.decode(tmp_sents[1:tmp_ents[i][1]]))  # chars before
                tmp_ents[i] = [i] + tmp_ents[i]

            # golden answer: map relation index -> relation name
            tmp_anss = anss[text_id]
            for i in range(len(tmp_anss)):
                tmp_anss[i][2] = relations[tmp_anss[i][2]]
            golden_ans = tmp_anss

            # model outputs (was two copy-pasted regex loops)
            got_ans_1 = _parse_generated_relations(generated_1[text_id])
            got_ans_2 = _parse_generated_relations(generated_2[text_id])

            tmp_ents_s = beautiful_str(["id", "l", "r", "content"], tmp_ents)

            if (not C.gene_in_data) or (not C.gene_no_rel):
                golden_ans_s = beautiful_str(["ent 0 id", "ent 1 id", "relation type"], golden_ans)
                got_ans_1_s = beautiful_str(["ent 0 id", "ent 1 id", "relation type"], got_ans_1)
                got_ans_2_s = beautiful_str(["ent 0 id", "ent 1 id", "relation type"], got_ans_2)

                # FIX: second section was mislabeled "output-1" although it
                # holds model 2's output.
                readable_info += "text-%d:\n%s\n\nentitys:%s\n\ngolden relations:%s\n\nmodel output-1:%s\n\noutput-2:%s\n\n\n" % (
                    doc_id, text, tmp_ents_s, golden_ans_s, got_ans_1_s, got_ans_2_s)
                json_info.append({
                    "text-id": doc_id,
                    "text": text,
                    "entitys": intize(tmp_ents, [0, 1, 2]),
                    "golden_ans": intize(golden_ans, [0, 1]),
                    "got_ans_1": intize(got_ans_1, [0, 1]),
                    "got_ans_2": intize(got_ans_2, [0, 1]),
                })
            else:
                # ensure there are exactly the same entity pairs in gold and generated
                try:
                    assert [x[:2] for x in golden_ans] == [x[:2] for x in got_ans_1]
                    assert [x[:2] for x in golden_ans] == [x[:2] for x in got_ans_2]
                except AssertionError:
                    pdb.set_trace()

                all_ans = []
                for _ins_i in range(len(golden_ans)):
                    all_ans.append([
                        golden_ans[_ins_i][0],
                        golden_ans[_ins_i][1],
                        golden_ans[_ins_i][2],
                        got_ans_1[_ins_i][2],
                        got_ans_2[_ins_i][2],
                    ])
                all_ans_s = beautiful_str(
                    ["ent 0 id", "ent 1 id", "golden", "model 1", "model 2"], all_ans)

                # FIX: was `text_id + 1`, which restarts every batch — use the
                # global doc id, consistent with the json record below.
                readable_info += "text-%d:\n%s\n\nentitys:%s\n\noutputs:%s\n\n\n" % (
                    doc_id, text, tmp_ents_s, all_ans_s)
                json_info.append({
                    "text-id": doc_id,
                    "text": text,
                    "entitys": intize(tmp_ents, [0, 1, 2]),
                    "relations": intize(all_ans, [0, 1]),
                })

    os.makedirs(os.path.dirname(C.gene_file), exist_ok=True)
    with open(C.gene_file + ".txt", "w", encoding="utf-8") as fil:
        fil.write(readable_info)
    with open(C.gene_file + ".json", "w", encoding="utf-8") as fil:
        json.dump(json_info, fil)

    print("score (model 1): %.4f %.4f" % get_f1(
        golden, all_generated_1, is_file_content=True, no_rel_name=generator.get_no_rel_name()))
    print("score (model 2): %.4f %.4f" % get_f1(
        golden, all_generated_2, is_file_content=True, no_rel_name=generator.get_no_rel_name()))
def generate_output(C, logger, dataset, models, generator):
    """Run *models* over *dataset*, generate relation outputs, and dump
    readable / model / dataset files next to ``C.gene_file``.

    *models* may be a single ``tc.nn.Module``, a list of modules (an ensemble
    whose predictions are handed together to *generator*), or ``None`` (only
    the dataset-side information is written and the model output is "None").
    """
    # ----- determine some arguments and prepare model -----
    bert_type = "bert-base-uncased"
    tokenizer = BertTokenizer.from_pretrained(bert_type)

    if models is not None:
        if isinstance(models, tc.nn.Module):
            models = [models]
        for i in range(len(models)):
            models[i] = models[i].eval()

    batch_size = 8
    batch_numb = (len(dataset) // batch_size) + int((len(dataset) % batch_size) != 0)
    device = tc.device(C.device)

    readable_info = ""
    model_output = []
    dataset_info = []
    all_generated = ""

    # ----- gene -----
    pbar = tqdm(range(batch_numb), ncols=70)
    generated = ""
    for batch_id in pbar:
        # ----- get data -----
        data = dataset[batch_id * batch_size:(batch_id + 1) * batch_size]
        sents, ents, anss, data_ent = get_data_from_batch(data, device=tc.device(C.device))

        ans_rels = [[(u, v) for u, v, t in bat] for bat in anss] if C.gene_in_data else None

        if models is not None:
            with tc.no_grad():
                preds = [0 for _ in range(len(models))]
                for i, model in enumerate(models):
                    # Move each model to the target device only for its
                    # forward pass, then restore it (it may deliberately
                    # live on cpu between batches).
                    old_device = next(model.parameters()).device
                    model = model.to(device)
                    preds[i] = model(sents, ents)
                    model = model.to(old_device)

            # ----- get generated output -----
            # FIX: this call was unconditional, raising NameError on `preds`
            # whenever models is None — a case the rest of the function supports.
            generated, pred = generator(preds, data_ent, ans_rels=ans_rels,
                                        give_me_pred=True, split_generate=True)
            all_generated += "".join(generated)

        for text_id in range(len(data)):
            # FIX: ids below used `text_id + 1`, which restarts every batch and
            # makes doc_id collide across batches; use the global 1-based id
            # (consistent with gene_golden).
            doc_id = batch_id * batch_size + text_id + 1

            # ----- form data structure -----
            # text: strip right padding, drop [CLS]/[SEP] before decoding
            tmp_sents = sents[text_id]
            while tmp_sents[-1] == 0:  # remove padding
                tmp_sents = tmp_sents[:-1]
            text = tokenizer.decode(tmp_sents[1:-1])

            # entitys: attach decoded surface form, then convert token spans
            # to character offsets in the decoded text, and prepend the index.
            tmp_ents = ents[text_id]
            for i in range(len(tmp_ents)):
                tmp_ents[i].append(tokenizer.decode(tmp_sents[tmp_ents[i][0]:tmp_ents[i][1]]))
                tmp_ents[i][0] = len(tokenizer.decode(tmp_sents[1:tmp_ents[i][0]])) + 1  # chars before (+1 is for space)
                tmp_ents[i][1] = len(tokenizer.decode(tmp_sents[1:tmp_ents[i][1]]))  # chars before
                tmp_ents[i] = [i] + tmp_ents[i]

            # golden answer: map relation index -> relation name
            tmp_anss = anss[text_id]
            for i in range(len(tmp_anss)):
                tmp_anss[i][2] = relations[tmp_anss[i][2]]
            golden_ans = tmp_anss

            # model output: parse generated relation lines into 0-based triples
            if models is not None:
                got_ans = []
                pattern = r"(.*)\(.*\.(\d*)\,.*\.(\d*)(.*)\)"
                for x in filter(None, generated[text_id].strip().split("\n")):
                    rel_type, u, v, rev = re.findall(pattern, x)[0]
                    assert (not rev) or (rev == ",REVERSE")
                    if rev:
                        u, v = v, u
                    got_ans.append([int(u) - 1, int(v) - 1, rel_type])

                tmp_pred = pred[text_id]
                for u, v, _ in got_ans:
                    model_output.append({
                        "doc_id": doc_id,
                        "ent0_id": u,
                        "ent1_id": v,
                        "list_of_prob": [float(x) for x in tmp_pred[u][v]],
                    })

            dataset_info.append({
                "doc_id": doc_id,
                "text": text,
                "entity_set": [[int(idx), int(l), int(r), cont]
                               for idx, l, r, cont in tmp_ents],
                "list_of_relations": [[x[0], x[1], relations.index(x[2])]
                                      for x in golden_ans],
            })

            tmp_ents = beautiful_str(["id", "l", "r", "content"], tmp_ents)
            golden_ans = beautiful_str(["ent0 id", "ent1 id", "relation type"], golden_ans)
            if models is not None:
                got_ans = beautiful_str(["ent0 id", "ent1 id", "relation type"], got_ans)
            else:
                got_ans = "None"

            readable_info += "text-%d:\n%s\n\nentitys:%s\n\ngolden relations:%s\n\nmodel(edge-aware) output:%s\n\n\n" % (
                doc_id, text, tmp_ents, golden_ans, got_ans)

        pbar.set_description_str("(Generate)")

    os.makedirs(os.path.dirname(C.gene_file), exist_ok=True)
    with open(C.gene_file + ".txt", "w", encoding="utf-8") as fil:
        fil.write(readable_info)
    with open(C.gene_file + ".generate.txt", "w", encoding="utf-8") as fil:
        fil.write(all_generated)
    with open(C.gene_file + ".model.json", "w", encoding="utf-8") as fil:
        json.dump(model_output, fil)
    with open(C.gene_file + ".dataset.json", "w", encoding="utf-8") as fil:
        json.dump(dataset_info, fil)
def gene_golden(C, logger, dataset, generator):
    """Dump the gold-standard annotations of *dataset* (no model involved)
    to ``C.gene_file`` as a human-readable .txt plus a .json file."""
    # ----- determine some arguments and prepare model -----
    bert_type = "bert-base-uncased"
    tokenizer = BertTokenizer.from_pretrained(bert_type)

    batch_size = 8
    batch_numb = (len(dataset) // batch_size) + int((len(dataset) % batch_size) != 0)

    # ----- gene -----
    readable_info = ""
    json_info = []
    for batch_id in tqdm(range(batch_numb), ncols=70, desc="Generating..."):
        # ----- get data -----
        data = dataset[batch_id * batch_size:(batch_id + 1) * batch_size]
        sents, ents, anss, data_ent = get_data_from_batch(data, device=tc.device(C.device))

        for text_id in range(len(data)):
            # Global 1-based document number across all batches.
            doc_no = batch_id * batch_size + text_id + 1

            # Strip right padding, then decode without the [CLS]/[SEP] tokens.
            tok_ids = sents[text_id]
            while tok_ids[-1] == 0:
                tok_ids = tok_ids[:-1]
            text = tokenizer.decode(tok_ids[1:-1])

            # Attach each entity's decoded surface form, convert its token
            # span to character offsets in the decoded text, and prepend its
            # index so rows read [id, l, r, content].
            ent_rows = ents[text_id]
            for idx in range(len(ent_rows)):
                row = ent_rows[idx]
                row.append(tokenizer.decode(tok_ids[row[0]:row[1]]))
                row[0] = len(tokenizer.decode(tok_ids[1:row[0]])) + 1  # chars before (+1 is for space)
                row[1] = len(tokenizer.decode(tok_ids[1:row[1]]))  # chars before
                ent_rows[idx] = [idx] + row

            # Replace each relation index by its name.
            rel_rows = anss[text_id]
            for row in rel_rows:
                row[2] = relations[row[2]]

            ents_table = beautiful_str(["id", "l", "r", "content"], ent_rows)
            rels_table = beautiful_str(["ent 0 id", "ent 1 id", "relation"], rel_rows)

            readable_info += "text-%d:\n%s\n\nentitys:%s\n\noutputs:%s\n\n\n" % (
                doc_no, text, ents_table, rels_table,
            )
            json_info.append({
                "text-id": doc_no,
                "text": text,
                "entitys": intize(ent_rows, [0, 1, 2]),
                "relations": intize(rel_rows, [0, 1]),
            })

    os.makedirs(os.path.dirname(C.gene_file), exist_ok=True)
    with open(C.gene_file + ".txt", "w", encoding="utf-8") as fil:
        fil.write(readable_info)
    with open(C.gene_file + ".json", "w", encoding="utf-8") as fil:
        json.dump(json_info, fil)