Example #1
def test_cut_off_result(raw_file, cut_file):
    raw_file_l = read_jsonline(raw_file)
    cut_file_l = read_jsonline(cut_file)
    for raw_doc, cut_doc in zip(raw_file_l, cut_file_l):
        raw_sentences = raw_doc["sentences"]
        cut_sentences = cut_doc["sentences"]
        # print("raw", len(raw_sentences))
        # print("cut", len(cut_sentences))
        if len(cut_sentences) == 0:
            # The cut version lost every sentence; print the original document
            # so the affected entry is visible.
            print(' '.join(sum(raw_sentences, [])))
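All of the examples here rely on read_jsonline and write_jsonline helpers that are not part of this listing. A minimal sketch, assuming each .jsonlines file stores one JSON document per line:

import json


def read_jsonline(path):
    # Read one JSON object per line into a list of dicts.
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def write_jsonline(path, dict_list):
    # Write every dict back out as a single JSON line.
    with open(path, 'w', encoding='utf-8') as f:
        for d in dict_list:
            f.write(json.dumps(d) + '\n')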
Example #2
def merge_two_jsonlines(path, path2, dest_path):
    file1 = read_jsonline(path)
    file2 = read_jsonline(path2)
    file1.extend(file2)
    for tag, doc in enumerate(file1):
        doc["doc_key"] = "nw" + str(tag)
        print(doc["doc_key"])
    print(len(file1))
    write_jsonline(dest_path, file1)
Example #3
def visual_gold_pred(gold_path, predicted_path):
    gold_file = read_jsonline(gold_path)
    pred_file = read_json(predicted_path)
    num = 0
    for doc_key, pred in pred_file.items():
        for gold in gold_file:
            if doc_key != gold['doc_key']:
                continue
            sentences = sum(gold['sentences'], [])  # flatten the token lists
            # print("sentences:", ' '.join(sentences))
            # print("doc_key:", doc_key)
            gold_spans = set()
            print('\n', "gold:")
            print("len(sentences):", len(sentences))
            for i in gold['clusters']:
                print(i[0][0], i[0][1], i[1][0], i[1][1])
                print(' '.join(sentences[i[0][0]:i[0][1] + 1]), "| | |",
                      ' '.join(sentences[i[1][0]:i[1][1] + 1]))
                # Remember every gold (entity, pronoun) span pair, not just the
                # last cluster, so the comparison below is always well defined.
                gold_spans.add((i[0][0], i[0][1], i[1][0], i[1][1]))
            print("\n", 'predicted:')
            for i in pred:
                print(i[0][0], i[0][1], i[1][0], i[1][1])
                print(' '.join(sentences[i[0][0]:i[0][1] + 1]), '| | |',
                      ' '.join(sentences[i[1][0]:i[1][1] + 1]))
                if (i[0][0], i[0][1], i[1][0], i[1][1]) in gold_spans:
                    num += 1
            print("__________________")
    print("num:", num)
Example #4
import matplotlib.pyplot as plt


def all_file_distance(path):
    file_l = read_jsonline(path)
    # Count how often each entity-pronoun sentence distance occurs.
    method = {}
    for i in file_l:
        r = sentence_distance_gold(i)
        sentences = sum(i["sentences"], [])
        index = i["clusters"][0][1][1]
        word = sentences[index]
        if word == "solution":
            if r in method:
                method[r] += 1
            else:
                method[r] = 1

    print(method)
    # method = sorted(method.items(), key=lambda item: item[1], reverse=True)
    # print(method)
    x = method.keys()
    y = method.values()
    # print(plt.style.available)
    # plt.style.use('fivethirtyeight')  # bmh
    plt.figure(figsize=(200, 200))
    plt.bar(x, y, facecolor='lightskyblue', edgecolor='white', lw=2)
    plt.show()
Example #5
def cut_off_jsonlines(path, dest_file):
    file_list = read_jsonline(path)
    new_file_list = []
    for file in file_list:
        new_file = cut_off_sentence(file)
        new_file_list.append(new_file)
    # print(len(new_file_list))
    write_jsonline(dest_file, new_file_list)
Example #6
def del_empty_cluster(path, dest_path):
    file_list = read_jsonline(path)
    for file in file_list:
        if len(file["clusters"]) == 0:
            print(file["clusters"])
    new_file_list = [file for file in file_list if len(file["clusters"]) != 0]
    write_jsonline(dest_path, new_file_list)
    print(len(new_file_list))
    print(len(file_list))
Example #7
import math
import random


def split_train_eval_dataset(path, train_path, eval_path):
    file_list = read_jsonline(path)
    print(len(file_list))
    random.shuffle(file_list)
    train_nums = math.floor(len(file_list) * 0.85)
    train_data = file_list[:train_nums]
    eval_data = file_list[train_nums:]
    print(len(train_data))
    print(len(eval_data))
    write_jsonline(train_path, train_data)
    write_jsonline(eval_path, eval_data)
Example #8
def batch_get_head(path, dest_path):
    file_list = read_jsonline(path)
    for file in file_list:
        new_clusters = get_entity_head(file)
        file["clusters"] = new_clusters
        # Flatten the token lists once, then print both spans of the first cluster.
        tokens = sum(file["sentences"], [])
        print(tokens[new_clusters[0][0][0]:new_clusters[0][0][1] + 1])
        print(tokens[new_clusters[0][1][0]:new_clusters[0][1][1] + 1])
    # print(len(file_list))
    write_jsonline(dest_path, file_list)
Example #9
# FullTokenizer is assumed to come from the BERT tokenization module
# shipped with the bert-tensorflow package; adjust the import to your setup.
from bert.tokenization import FullTokenizer


def all_file(path, dest_path, vocab_file, length):
    file_l = read_jsonline(path)
    tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=False)
    new_file = []
    for i in file_l:
        dic = get_sub_token(i, tokenizer, length)
        dic = get_cls_sep(dic)
        dic = get_speaker(dic)
        dic = get_sentence_map(dic)
        dic = finally_get_cluster(dic)
        new_file.append(dic)
        print("_______________________")
    write_jsonline(dest_path, new_file)
Example #10
def visual_gold_pred(gold_path, predicted_path):
    gold_file = read_jsonline(gold_path)
    pred_file = read_json(predicted_path)
    num = 0

    def mark_spans(sentences, cluster, color):
        # Rebuild the text with the entity and pronoun spans of one cluster highlighted.
        return (' '.join(sentences[:cluster[0][0]]) + ' '
                + highlight(' '.join(sentences[cluster[0][0]:cluster[0][1] + 1]), color) + ' '
                + ' '.join(sentences[cluster[0][1] + 1:cluster[1][0]]) + ' '
                + highlight(' '.join(sentences[cluster[1][0]:cluster[1][1] + 1]), color))

    def print_by_sentence(text):
        # Print the text one rough sentence per line, splitting on full stops.
        start = 0
        for idx, char in enumerate(text):
            if char == '.':
                print(text[start:idx])
                start = idx
            if idx == len(text) - 1:
                print(text[start:])

    for doc_key, pred in pred_file.items():
        for gold in gold_file:
            if doc_key == gold['doc_key']:
                sentences = sum(gold['sentences'], [])
                gold_clusters = gold["clusters"]
                if pred == []:
                    print("\n", "No predicted:")
                    print_by_sentence(mark_spans(sentences, gold_clusters[0], 'blue'))
                    print("_______________________")
                else:
                    for i in pred:
                        num += 1
                        print("_______________________")
                        print(gold["doc_key"])
                        print("Gold:")
                        print_by_sentence(mark_spans(sentences, gold_clusters[0], 'blue'))
                        print("\n", "Pred:")
                        print_by_sentence(mark_spans(sentences, i, 'red'))
                        print("_______________________")
    print(num)
    print(len(gold_file))
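highlight is another helper that is not shown in this listing. A minimal sketch, assuming ANSI terminal colours (the escape codes are an illustration, not the original implementation):

ANSI_CODES = {'red': '\033[91m', 'blue': '\033[94m'}
ANSI_RESET = '\033[0m'


def highlight(text, color):
    # Wrap the text in an ANSI colour escape so the span stands out in the terminal.
    return ANSI_CODES.get(color, '') + text + ANSI_RESET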
Example #11
def check(train, eval, dest):
    train_l = read_jsonline(train)
    eval_l = read_jsonline(eval)
    print(len(eval_l))
    num = 0
    new_l = []
    for train_doc in train_l:
        train_sentences = sum(train_doc["sentences"], [])
        train_cluster = train_doc["clusters"]
        t_entity_s = train_cluster[0][0][0]
        t_entity_e = train_cluster[0][0][1]
        for id, eval_doc in enumerate(eval_l):
            eval_sentences = sum(eval_doc["sentences"], [])
            eval_cluster = eval_doc["clusters"]
            e_entity_s = eval_cluster[0][0][0]
            e_entity_e = eval_cluster[0][0][1]
            # An eval document overlaps with training when it has the same
            # pronoun and the same entity tokens.
            if train_doc["pn"] == eval_doc["pn"] and \
                    train_sentences[t_entity_s:t_entity_e + 1] == \
                    eval_sentences[e_entity_s:e_entity_e + 1]:
                print(' '.join(eval_sentences[e_entity_s:e_entity_e + 1]))
                num += 1
                new_l.append(id)
    new_eval_l = []
    print(len(set(new_l)))
    print(new_l)
    print(num)
    for id, eval_doc in enumerate(eval_l):
        if id not in new_l:
            new_eval_l.append(eval_doc)

    # Write out the eval documents that do not overlap with the training set.
    write_jsonline(dest, new_eval_l)
    print(len(new_eval_l))
Example #12
def jsonlines_count(path, dest_path):
    jsonlines_list = read_jsonline(path)
    new_jsonline_list = []
    a = 0
    for dic in jsonlines_list:
        new_dic = count_sentence_entity(dic)
        if new_dic == 1:
            a += 1
        if new_dic != {} and new_dic != 1:
            new_jsonline_list.append(new_dic)
    write_jsonline(dest_path, new_jsonline_list)
    phrases_count = len(new_jsonline_list)
    sentences_count = len(jsonlines_list) - phrases_count
    print(sentences_count / len(jsonlines_list))
    print(len(jsonlines_list))
    print('new', len(new_jsonline_list))
    return sentences_count, phrases_count
Example #13
def del_all_overlap(path, dest_path):
    file_l = read_jsonline(path)
    print(len(file_l))
    num = 0
    sorted_l = []
    for raw_index, raw_file in enumerate(file_l):
        raw_sentences = sum(raw_file["sentences"], [])
        raw_cluster = raw_file["clusters"]
        r_entity_s = raw_cluster[0][0][0]
        r_entity_e = raw_cluster[0][0][1]
        r_pn = raw_file["pn"]
        new_l = []
        for new_index, new_file in enumerate(file_l):
            new_sentences = sum(new_file["sentences"], [])
            new_cluster = new_file["clusters"]
            n_entity_s = new_cluster[0][0][0]
            n_entity_e = new_cluster[0][0][1]
            n_pn = new_file["pn"]
            # End indices are inclusive elsewhere in this file, so add 1 when
            # slicing to compare the full entity span.
            if new_index != raw_index and r_pn == n_pn and new_sentences[
                    n_entity_s:n_entity_e + 1] == raw_sentences[
                        r_entity_s:r_entity_e + 1]:
                new_l.append(raw_index)
                new_l.append(new_index)
        new_l = sorted(set(new_l))
        if len(new_l) > 0:
            sorted_l.append(new_l)

    # print(sorted_l)
    # For every overlap group, keep the first document and mark the rest for removal.
    final_l = []
    for i in sorted_l:
        if i not in final_l:
            final_l.append(i)
    final_l2 = [i[1:] for i in final_l]
    final_l3 = sum(final_l2, [])
    print(len(final_l3))
    overlap_l = []
    for raw_index, raw_file in enumerate(file_l):
        if raw_index not in final_l3:
            overlap_l.append(raw_file)
    write_jsonline(dest_path, overlap_l)
    print(len(overlap_l))
Example #14
            # print(num_count)
            if num_count == min(clusters):
                tag = id
                break

    if tag > 1:
        sentences = sentences[tag:]
        speakers = speakers[tag:]
        new_clusters = [(i - sentence_index_tag) for i in clusters]
        new_clusters = [[new_clusters[0], new_clusters[1]],
                        [new_clusters[2], new_clusters[3]]]
        dic["clusters"] = [new_clusters]
        # print(dic["clusters"])
        dic["sentences"] = sentences
        dic["speakers"] = speakers
    print(len(dic["sentences"]))
    print('___')
    return dic


if __name__ == '__main__':
    path = "/home/patsnap/PycharmProjects/webanno_preprocess/data/jsonline_data/bert_test/bert_256_merge_x4_z5_x1_z3.jsonlines"
    dest_path = "/home/patsnap/PycharmProjects/webanno_preprocess/data/jsonline_data/bert_test/cut_bert_256_merge_x4_z5_x1_z3.jsonlines"
    file_list = read_jsonline(path)
    new = []
    for dic in file_list:
        # print(len(dic["sentences"]), '??')
        dic = cut_off_sentence(dic)
        # print(len(dic["sentences"]))
        new.append(dic)
    write_jsonline(dest_path, new)