import math
import random

import matplotlib.pyplot as plt

# FullTokenizer is assumed to come from the BERT reference implementation
# (tokenization.py); adjust the import to match your local copy.
from tokenization import FullTokenizer


def test_cut_off_result(raw_file, cut_file):
    """Flag documents whose cut version ended up with no sentences at all."""
    raw_file_l = read_jsonline(raw_file)
    cut_file_l = read_jsonline(cut_file)
    for i in range(len(raw_file_l)):
        raw_sentences = raw_file_l[i]["sentences"]
        cut_sentences = cut_file_l[i]["sentences"]
        if len(cut_sentences) == 0:
            # print the raw document so the empty cut result can be inspected
            # (the original joined the empty cut list, which printed nothing)
            print(' '.join(sum(raw_sentences, [])))
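# The I/O helpers used throughout this file are not shown in this section.
# Below is a minimal sketch of what they are assumed to look like (plain
# jsonlines / json wrappers; names and signatures inferred from the call
# sites), not the project's actual implementation:

import json


def read_jsonline(path):
    """Read a .jsonlines file into a list of dicts, one per line."""
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def write_jsonline(path, dicts):
    """Write a list of dicts to a .jsonlines file, one JSON object per line."""
    with open(path, 'w', encoding='utf-8') as f:
        for d in dicts:
            f.write(json.dumps(d) + '\n')


def read_json(path):
    """Read a single JSON object, e.g. the {doc_key: predictions} file."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)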
def merge_two_jsonlines(path, path2, dest_path):
    """Concatenate two jsonlines files and re-number the doc keys."""
    file1 = read_jsonline(path)
    file2 = read_jsonline(path2)
    file1.extend(file2)
    # re-number doc_keys so they stay unique after the merge
    for tag, doc in enumerate(file1):
        doc["doc_key"] = "nw" + str(tag)
        print(doc["doc_key"])
    print(len(file1))
    write_jsonline(dest_path, file1)
def visual_gold_pred_indices(gold_path, predicted_path):
    """Print gold and predicted spans as token indices and count exact matches.

    Index-only variant of the visual_gold_pred defined later in this file.
    """
    gold_file = read_jsonline(gold_path)
    pred_file = read_json(predicted_path)
    num = 0
    for doc_key, pred in pred_file.items():
        for gold in gold_file:
            if doc_key != gold['doc_key']:
                continue
            sentences = sum(gold['sentences'], [])
            for cluster in gold['clusters']:
                print('\n', "gold:")
                print("len(sentences):", len(sentences))
                print(cluster[0][0], cluster[0][1], cluster[1][0], cluster[1][1])
                print(' '.join(sentences[cluster[0][0]:cluster[0][1] + 1]), "| | |",
                      ' '.join(sentences[cluster[1][0]:cluster[1][1] + 1]))
                print("\n", 'predicted:')
                gold_s, gold_e = cluster[0][0], cluster[0][1]
                gold_s_pro, gold_e_pro = cluster[1][0], cluster[1][1]
                # use a distinct loop variable; the original reused `i` and
                # shadowed the gold cluster
                for p in pred:
                    print(p[0][0], p[0][1], p[1][0], p[1][1])
                    print(' '.join(sentences[p[0][0]:p[0][1] + 1]), '| | |',
                          ' '.join(sentences[p[1][0]:p[1][1] + 1]))
                    if (p[0][0] == gold_s and p[0][1] == gold_e
                            and p[1][0] == gold_s_pro and p[1][1] == gold_e_pro):
                        num += 1
                print("__________________")
    print("num:", num)
def all_file_distance(path):
    """Plot a histogram of mention distances for clusters whose second mention
    ends with the token 'solution'."""
    file_l = read_jsonline(path)
    dist_count = {}
    for doc in file_l:
        r = sentence_distance_gold(doc)
        sentences = sum(doc["sentences"], [])
        index = doc["clusters"][0][1][1]
        word = sentences[index]
        if word == "solution":
            dist_count[r] = dist_count.get(r, 0) + 1
    print(dist_count)
    x = dist_count.keys()
    y = dist_count.values()
    # a moderate figure size; the original requested 200x200 inches,
    # which is far too large to render
    plt.figure(figsize=(10, 6))
    plt.bar(x, y, facecolor='lightskyblue', edgecolor='white', lw=2)
    plt.show()
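# sentence_distance_gold is defined elsewhere in the project; from its use
# above it is assumed to return how many sentences separate the two mentions
# of the (single) gold cluster. A sketch under that assumption, with a
# hypothetical name to avoid clashing with the real helper:


def sentence_distance_gold_sketch(doc):
    """Sentence boundaries between the two mentions of clusters[0] (assumed semantics)."""
    lengths = [len(s) for s in doc["sentences"]]

    def sent_of(token_idx):
        # map a flat token position back to its sentence index
        total = 0
        for sent_idx, n in enumerate(lengths):
            total += n
            if token_idx < total:
                return sent_idx
        return len(lengths) - 1

    first, second = doc["clusters"][0]
    return abs(sent_of(second[0]) - sent_of(first[1]))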
def cut_off_jsonlines(path, dest_file):
    """Apply cut_off_sentence to every document in a jsonlines file."""
    file_list = read_jsonline(path)
    new_file_list = [cut_off_sentence(file) for file in file_list]
    write_jsonline(dest_file, new_file_list)
def del_empty_cluster(path, dest_path):
    """Drop documents whose cluster list is empty."""
    file_list = read_jsonline(path)
    for file in file_list:
        if len(file["clusters"]) == 0:
            print(file["clusters"])
    new_file_list = [file for file in file_list if len(file["clusters"]) != 0]
    write_jsonline(dest_path, new_file_list)
    print(len(new_file_list))
    print(len(file_list))
def split_train_eval_dataset(path, train_path, eval_path):
    """Shuffle the data and split it 85/15 into train and eval sets."""
    file_list = read_jsonline(path)
    print(len(file_list))
    random.shuffle(file_list)
    train_nums = math.floor(len(file_list) * 0.85)
    train_data = file_list[:train_nums]
    eval_data = file_list[train_nums:]
    print(len(train_data))
    print(len(eval_data))
    write_jsonline(train_path, train_data)
    write_jsonline(eval_path, eval_data)
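# random.shuffle makes the split non-deterministic; if repeatable splits are
# needed, seed the RNG before calling it, e.g.:
#
#   random.seed(42)
#   split_train_eval_dataset("all.jsonlines", "train.jsonlines", "eval.jsonlines")
#
# (the file names here are placeholders, not paths from this project)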
def batch_get_head(path, dest_path):
    """Replace each document's clusters with their entity heads and print the new spans."""
    file_list = read_jsonline(path)
    for file in file_list:
        file["clusters"] = get_entity_head(file)
        tokens = sum(file["sentences"], [])
        print(tokens[file["clusters"][0][0][0]:file["clusters"][0][0][1] + 1])
        print(tokens[file["clusters"][0][1][0]:file["clusters"][0][1][1] + 1])
    write_jsonline(dest_path, file_list)
def all_file(path, dest_path, vocab_file, length):
    """Run the BERT preprocessing pipeline over a jsonlines file."""
    file_l = read_jsonline(path)
    tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=False)
    new_file = []
    for i in file_l:
        dic = get_sub_token(i, tokenizer, length)  # wordpiece-tokenize into segments
        dic = get_cls_sep(dic)                     # add [CLS] / [SEP] markers
        dic = get_speaker(dic)                     # build the speaker list
        dic = get_sentence_map(dic)                # map subtokens back to sentences
        dic = finally_get_cluster(dic)             # re-index clusters to subtoken positions
        new_file.append(dic)
        print("_______________________")
    write_jsonline(dest_path, new_file)
def visual_gold_pred(gold_path, predicted_path):
    """Print each document with gold spans highlighted in blue and predicted spans in red."""
    gold_file = read_jsonline(gold_path)
    pred_file = read_json(predicted_path)
    num = 0

    def render(sentences, pair, color):
        # rebuild the flattened document with the two mention spans highlighted
        (s1, e1), (s2, e2) = pair
        return (' '.join(sentences[:s1]) + ' '
                + highlight(' '.join(sentences[s1:e1 + 1]), color) + ' '
                + ' '.join(sentences[e1 + 1:s2]) + ' '
                + highlight(' '.join(sentences[s2:e2 + 1]), color))

    def print_by_period(text):
        # crude line breaking: start a new output line at every period
        tag = 0
        for idx, char in enumerate(text):
            if char == '.':
                print(text[tag:idx])
                tag = idx
            if idx == len(text) - 1:
                print(text[tag:])

    for doc_key, pred in pred_file.items():
        for gold in gold_file:
            if doc_key != gold['doc_key']:
                continue
            sentences = sum(gold['sentences'], [])
            gold_clusters = gold["clusters"]
            if pred == []:
                print("\n", "No predicted:")
                print_by_period(render(sentences, gold_clusters[0], 'blue'))
                print("_______________________")
            else:
                for p in pred:
                    num += 1
                    print("_______________________")
                    print(gold["doc_key"])
                    print("Gold:")
                    print_by_period(render(sentences, gold_clusters[0], 'blue'))
                    print("\n", "Pred:")
                    print_by_period(render(sentences, p, 'red'))
                    print("_______________________")
    print(num)
    print(len(gold_file))
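# highlight() is defined elsewhere in the project; from its use above it is
# assumed to wrap a string in ANSI color escape codes so the mention spans
# stand out in a terminal. A minimal sketch under that assumption, with a
# hypothetical name to avoid clashing with the real helper:


def highlight_sketch(text, color):
    """Wrap text in an ANSI color escape sequence ('red' and 'blue' assumed)."""
    codes = {'red': '31', 'blue': '34'}
    return '\033[{}m{}\033[0m'.format(codes.get(color, '0'), text)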
def check(train_path, eval_path, dest):
    """Remove eval examples whose pronoun and entity span also appear in training data."""
    train_l = read_jsonline(train_path)
    eval_l = read_jsonline(eval_path)
    print(len(eval_l))
    num = 0
    overlap_ids = []
    for train_doc in train_l:
        train_sentences = sum(train_doc["sentences"], [])
        (t_entity_s, t_entity_e), (t_pronoun_s, t_pronoun_e) = train_doc["clusters"][0]
        for idx, eval_doc in enumerate(eval_l):
            eval_sentences = sum(eval_doc["sentences"], [])
            (e_entity_s, e_entity_e), (e_pronoun_s, e_pronoun_e) = eval_doc["clusters"][0]
            if (train_doc["pn"] == eval_doc["pn"]
                    and train_sentences[t_entity_s:t_entity_e + 1]
                    == eval_sentences[e_entity_s:e_entity_e + 1]):
                print(' '.join(eval_sentences[e_entity_s:e_entity_e + 1]))
                num += 1
                overlap_ids.append(idx)
    print(len(set(overlap_ids)))
    print(overlap_ids)
    print(num)
    new_eval_l = [doc for idx, doc in enumerate(eval_l) if idx not in overlap_ids]
    print(len(new_eval_l))
    # save the eval examples that do not overlap with training data
    write_jsonline(dest, new_eval_l)
def jsonlines_count(path, dest_path):
    """Filter examples via count_sentence_entity and report sentence/phrase counts."""
    jsonlines_list = read_jsonline(path)
    new_jsonline_list = []
    a = 0
    for dic in jsonlines_list:
        new_dic = count_sentence_entity(dic)
        if new_dic == 1:  # sentinel value returned by count_sentence_entity
            a += 1
        elif new_dic != {}:
            new_jsonline_list.append(new_dic)
    write_jsonline(dest_path, new_jsonline_list)
    phrases_count = len(new_jsonline_list)
    sentences_count = len(jsonlines_list) - phrases_count
    print(sentences_count / len(jsonlines_list))
    print(len(jsonlines_list))
    print('new', len(new_jsonline_list))
    return sentences_count, phrases_count
def del_all_overlap(path, dest_path):
    """Keep only the first example of every group sharing the same pronoun and entity span."""
    file_l = read_jsonline(path)
    print(len(file_l))
    sorted_l = []
    for raw_index, raw_file in enumerate(file_l):
        raw_sentences = sum(raw_file["sentences"], [])
        r_entity_s, r_entity_e = raw_file["clusters"][0][0]
        r_pn = raw_file["pn"]
        new_l = []
        for new_index, new_file in enumerate(file_l):
            new_sentences = sum(new_file["sentences"], [])
            n_entity_s, n_entity_e = new_file["clusters"][0][0]
            n_pn = new_file["pn"]
            # span ends are inclusive, hence the +1 on both slices
            # (the original omitted it and silently dropped the last token)
            if (new_index != raw_index and r_pn == n_pn
                    and new_sentences[n_entity_s:n_entity_e + 1]
                    == raw_sentences[r_entity_s:r_entity_e + 1]):
                new_l.append(raw_index)
                new_l.append(new_index)
        new_l = sorted(set(new_l))
        if new_l:
            sorted_l.append(new_l)
    # de-duplicate the groups, then mark every member except the first for removal
    final_l = []
    for group in sorted_l:
        if group not in final_l:
            final_l.append(group)
    to_remove = sum([group[1:] for group in final_l], [])
    print(len(to_remove))
    overlap_l = [doc for idx, doc in enumerate(file_l) if idx not in to_remove]
    write_jsonline(dest_path, overlap_l)
    print(len(overlap_l))
        if num_count == min(clusters):
            tag = id
            break
    if tag > 1:
        sentences = sentences[tag:]
        speakers = speakers[tag:]
    new_clusters = [(i - sentence_index_tag) for i in clusters]
    new_clusters = [[new_clusters[0], new_clusters[1]],
                    [new_clusters[2], new_clusters[3]]]
    dic["clusters"] = [new_clusters]
    dic["sentences"] = sentences
    dic["speakers"] = speakers
    print(len(dic["sentences"]))
    print('___')
    return dic


if __name__ == '__main__':
    path = "/home/patsnap/PycharmProjects/webanno_preprocess/data/jsonline_data/bert_test/bert_256_merge_x4_z5_x1_z3.jsonlines"
    dest_path = "/home/patsnap/PycharmProjects/webanno_preprocess/data/jsonline_data/bert_test/cut_bert_256_merge_x4_z5_x1_z3.jsonlines"
    file_list = read_jsonline(path)
    new = []
    for dic in file_list:
        dic = cut_off_sentence(dic)
        new.append(dic)
    write_jsonline(dest_path, new)