Example #1
# Shared imports for all examples on this page. extract_API is the
# project-local extraction module; print_time() is a project-local logging
# helper assumed to be in scope (a minimal sketch is given after Example #3).
import codecs
import pickle

import numpy as np
import pandas as pd

import extract_API


def extract_text_conj(start_id, end_id, type_kind):
    array_len = 50000
    print(type_kind)
    # accumulators for the conj statistics: p(a), p(b), and p(a, b)
    conj_p_a_pos_wei = np.zeros(shape=array_len, dtype=np.float32)
    conj_p_b_pos_wei = np.zeros(shape=array_len, dtype=np.float32)
    conj_p_a_b_pos_wei = np.zeros(shape=(array_len, array_len), dtype=np.float32)
    with open('dict_date/str_id_word_count.file', 'rb') as f:
        str_id_word_count = pickle.load(f)
    print(len(str_id_word_count))
    # counter for intra-sentence conj pairs
    count_conj = 0
    for count in range(start_id, end_id):
        if count == 6214:
            # document 6214 is skipped by the original pipeline
            continue
        doc_path = '../../icwsm09stories_date/spacy_data/' + str(count) + ".pkl"
        print(doc_path)
        data = pd.read_pickle(doc_path)
        doc = data['doc']
        for book in doc:
            # each `book` is one document
            last_setence = []
            last_congju = []
            for sen in book:
                total_clause, total_clause_str, total_clause_level, total_clause_level_str, len_total_congju = \
                    extract_API.parse_setence(sen)
                conj_p_a_pos_wei, conj_p_b_pos_wei = \
                    extract_API.cal_intra_conj_wei_speci_ab(sen, total_clause_level, conj_p_a_pos_wei,
                                                            conj_p_b_pos_wei, str_id_word_count)
                count_conj, conj_p_a_b_pos_wei = \
                    extract_API.cal_intra_conj_wei_speci(sen, total_clause_level, conj_p_a_b_pos_wei,
                                                         str_id_word_count, count_conj)
        if count % 10 == 0:
            print(count, 'count_conj:', count_conj)
        if count % 1000 == 0:
            np.save("database/conj_p_a_pos_wei_speci" + str(start_id) + ".npy", conj_p_a_pos_wei)
            np.save("database/conj_p_b_pos_wei_speci" + str(start_id) + ".npy", conj_p_b_pos_wei)
            np.save("database/conj_p_a_b_pos_wei_speci" + str(start_id) + ".npy", conj_p_a_b_pos_wei)
            print_time()
            print(count, 'checkpoint saved')
            print(count, '---' * 45)
    np.save("database/conj_p_a_pos_wei_speci" + str(start_id) + ".npy", conj_p_a_pos_wei)
    np.save("database/conj_p_b_pos_wei_speci" + str(start_id) + ".npy", conj_p_b_pos_wei)
    np.save("database/conj_p_a_b_pos_wei_speci" + str(start_id) + ".npy", conj_p_a_b_pos_wei)
    print_time()
    print('checkpoint saved')
    print('---' * 45)
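A minimal way to invoke this example; the document range and the type label are illustrative assumptions (type_kind is only printed here), and the dict_date/ and database/ directories must exist as in the code above.

if __name__ == '__main__':
    # hypothetical shard: documents 0..999, saved under database/ with suffix 0
    extract_text_conj(0, 1000, 'conj_speci')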
Example #2
def extract_text(start_id, end_id, type_kind):
    array_len = 50000
    print(type_kind)
    # three candidate co-occurrence matrices; only the one selected by
    # type_kind is updated below (np.zeros already zero-initializes them)
    advcl_p_a_b_pos_wei = np.zeros(shape=(array_len, array_len), dtype=np.float32)
    conj_p_a_b_pos_wei = np.zeros(shape=(array_len, array_len), dtype=np.float32)
    inter_level_p_a_b_pos = np.zeros(shape=(array_len, array_len), dtype=np.float32)

    # only the filtered vocabulary is actually used
    with open('dict_date/str_id_word_count_delete.file', 'rb') as f:
        str_id_word_count = pickle.load(f)
    print(len(str_id_word_count))

    # counters for intra- and inter-sentence relations
    count_advcl = 0
    count_conj = 0
    count_inter = 0
    for count in range(start_id, end_id):
        if count == 6214:
            # document 6214 is skipped by the original pipeline
            continue
        doc_path = '../../icwsm09stories_date/spacy_data/' + str(count) + ".pkl"
        print(doc_path)
        data = pd.read_pickle(doc_path)
        doc = data['doc']
        for book in doc:
            # each `book` is one document
            last_setence = []
            last_congju = []
            for sen in book:
                total_clause, total_clause_str, total_clause_level, total_clause_level_str, len_total_congju = \
                    extract_API.parse_setence(sen)

                # intra-sentence relations
                if type_kind == 'advcl_p_a_b_pos_wei':
                    count_advcl, advcl_p_a_b_pos_wei = extract_API.cal_intra_advcl_wei(sen, total_clause_level,
                                                                                       advcl_p_a_b_pos_wei,
                                                                                       str_id_word_count, count_advcl)
                elif type_kind == 'conj_p_a_b_pos_wei':
                    count_conj, conj_p_a_b_pos_wei = extract_API.cal_intra_conj_wei(sen, total_clause_level,
                                                                                    conj_p_a_b_pos_wei,
                                                                                    str_id_word_count,
                                                                                    count_conj)
                elif type_kind == 'inter_level_p_a_b_pos':
                    count_inter, inter_level_p_a_b_pos, last_setence, last_congju = \
                        extract_API.cal_p_a_b_inter_num_weight_dis(sen, total_clause, inter_level_p_a_b_pos,
                                                                   str_id_word_count, last_setence, last_congju,
                                                                   count_inter)
        if count % 10 == 0:
            print(count, 'count_advcl:', count_advcl, 'count_conj:', count_conj, 'count_inter:', count_inter)
        if count % 1000 == 0:
            if type_kind == 'advcl_p_a_b_pos_wei':
                np.save("database/advcl_p_a_b_pos_wei" + str(start_id) + ".npy", advcl_p_a_b_pos_wei)
            elif type_kind == 'conj_p_a_b_pos_wei':
                np.save("database/conj_p_a_b_pos_wei" + str(start_id) + ".npy", conj_p_a_b_pos_wei)
            elif type_kind == 'inter_level_p_a_b_pos':
                np.save("database/inter_level_p_a_b_pos" + str(start_id) + ".npy", inter_level_p_a_b_pos)
            print_time()
            print(count, 'checkpoint saved')
            print(count, '---' * 45)

    if type_kind == 'advcl_p_a_b_pos_wei':
        np.save("database/advcl_p_a_b_pos_wei" + str(start_id) + ".npy", advcl_p_a_b_pos_wei)
    elif type_kind == 'conj_p_a_b_pos_wei':
        np.save("database/conj_p_a_b_pos_wei" + str(start_id) + ".npy", conj_p_a_b_pos_wei)
    elif type_kind == 'inter_level_p_a_b_pos':
        np.save("database/inter_level_p_a_b_pos" + str(start_id) + ".npy", inter_level_p_a_b_pos)
    print_time()
    print('checkpoint saved')
    print('---' * 45)
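extract_text builds one matrix per call, selected by type_kind; a hedged driver, assuming the same hypothetical document range as above, runs all three relation types in sequence:

if __name__ == '__main__':
    # the three type_kind values accepted by the dispatch above
    for kind in ('advcl_p_a_b_pos_wei', 'conj_p_a_b_pos_wei', 'inter_level_p_a_b_pos'):
        extract_text(0, 1000, kind)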
Example #3
def extract_text_up(start_id, end_id, type_kind):
    print('extract_text_up')
    with codecs.open('stop_word.txt', 'r', encoding='utf-8') as f:
        stopkey = set(w.strip() for w in f)
    array_len = 50000
    print(type_kind)
    # four candidate matrices; only the one selected by type_kind is updated
    advcl_p_a_b_pos_wei = np.zeros(shape=(array_len, array_len), dtype=np.float32)
    conj_p_a_b_pos_wei = np.zeros(shape=(array_len, array_len), dtype=np.float32)
    inter_level_p_a_b_pos = np.zeros(shape=(array_len, array_len), dtype=np.float32)
    inter_level_p_a_b_pos_weight = np.zeros(shape=(array_len, array_len), dtype=np.float32)

    # only the filtered vocabulary is used here
    with open('dict_date/str_id_word_count_delete.file', 'rb') as f:
        str_id_word_count = pickle.load(f)
    print(len(str_id_word_count))

    count_advcl = 0
    count_conj = 0
    count_inter = 0
    count_inter_weight = 0
    for count in range(start_id, end_id):
        if count == 6214:
            # document 6214 is skipped by the original pipeline
            continue
        doc_path = '../../icwsm09stories_date/spacy_data/' + str(count) + ".pkl"
        print(doc_path)
        data = pd.read_pickle(doc_path)
        doc = data['doc']
        for book in doc:
            # each `book` is one document
            last_setence = []
            last_congju = []
            for sen in book:
                total_clause, total_clause_str, total_clause_level, total_clause_level_str, len_total_congju = \
                    extract_API.parse_setence(sen)

                # intra-sentence relations
                if type_kind == 'advcl_p_a_b_pos_wei':
                    count_advcl, advcl_p_a_b_pos_wei = extract_API.cal_intra_advcl_wei_up(sen, total_clause_level,
                                                                                          advcl_p_a_b_pos_wei,
                                                                                          str_id_word_count,
                                                                                          count_advcl, stopkey)
                elif type_kind == 'conj_p_a_b_pos_wei':
                    count_conj, conj_p_a_b_pos_wei = extract_API.cal_intra_conj_wei_up(sen, total_clause_level,
                                                                                       conj_p_a_b_pos_wei,
                                                                                       str_id_word_count,
                                                                                       count_conj, stopkey)
                elif type_kind == 'inter_level_p_a_b_pos':
                    count_inter, inter_level_p_a_b_pos, last_setence, last_congju = \
                        extract_API.cal_p_a_b_inter_num_weight_dis_up(sen, total_clause, inter_level_p_a_b_pos,
                                                                      str_id_word_count, last_setence, last_congju,
                                                                      count_inter, stopkey)
                elif type_kind == 'inter_level_p_a_b_pos_weight':
                    # pass the weighted counter here (the unweighted count_inter
                    # was passed in the original, which looks like a copy-paste slip)
                    count_inter_weight, inter_level_p_a_b_pos_weight, last_setence, last_congju = \
                        extract_API.cal_p_a_b_inter_num_weight_dis_weight_up(sen, total_clause,
                                                                             inter_level_p_a_b_pos_weight,
                                                                             str_id_word_count, last_setence,
                                                                             last_congju,
                                                                             count_inter_weight, stopkey)
        if count % 10 == 0:
            print(count, 'count_advcl:', count_advcl, 'count_conj:', count_conj, 'count_inter:', count_inter,
                  'count_inter_weight', count_inter_weight)
        if count % 1000 == 0:
            if type_kind == 'advcl_p_a_b_pos_wei':
                np.save("database/up_advcl_p_a_b_pos_wei" + str(start_id) + ".npy", advcl_p_a_b_pos_wei)
            elif type_kind == 'conj_p_a_b_pos_wei':
                np.save("database/up_conj_p_a_b_pos_wei" + str(start_id) + ".npy", conj_p_a_b_pos_wei)
            elif type_kind == 'inter_level_p_a_b_pos':
                np.save("database/up_inter_level_p_a_b_pos" + str(start_id) + ".npy", inter_level_p_a_b_pos)
            elif type_kind == 'inter_level_p_a_b_pos_weight':
                np.save("database/up_inter_level_p_a_b_pos_weight" + str(start_id) + ".npy",
                        inter_level_p_a_b_pos_weight)
            print_time()
            print(count, 'checkpoint saved')
            print(count, '---' * 45)

    if type_kind == 'advcl_p_a_b_pos_wei':
        np.save("database/up_advcl_p_a_b_pos_wei" + str(start_id) + ".npy", advcl_p_a_b_pos_wei)
    elif type_kind == 'conj_p_a_b_pos_wei':
        np.save("database/up_conj_p_a_b_pos_wei" + str(start_id) + ".npy", conj_p_a_b_pos_wei)
    elif type_kind == 'inter_level_p_a_b_pos':
        np.save("database/up_inter_level_p_a_b_pos" + str(start_id) + ".npy", inter_level_p_a_b_pos)
    elif type_kind == 'inter_level_p_a_b_pos_weight':
        np.save("database/up_inter_level_p_a_b_pos_weight" + str(start_id) + ".npy", inter_level_p_a_b_pos_weight)
    print_time()
    print('checkpoint saved')
    print('---' * 45)
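print_time() is called at every checkpoint but is not defined in these examples; a minimal stand-in, assuming it only logs a wall-clock timestamp:

import time

def print_time():
    # assumed behavior: print the current wall-clock time so the
    # per-1000-document checkpoints above can be timed from the log
    print(time.strftime('%Y-%m-%d %H:%M:%S'))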
def extract_text(start_id, end_id):
    array_len = 50000
    # one-dimensional accumulators for the marginal statistics p(a) and p(b)
    advcl_p_a_pos_wei = np.zeros(shape=array_len, dtype=np.float32)
    advcl_p_b_pos_wei = np.zeros(shape=array_len, dtype=np.float32)
    conj_p_a_pos_wei = np.zeros(shape=array_len, dtype=np.float32)
    conj_p_b_pos_wei = np.zeros(shape=array_len, dtype=np.float32)
    inter_level_p_a_pos = np.zeros(shape=array_len, dtype=np.float32)
    inter_level_p_b_pos = np.zeros(shape=array_len, dtype=np.float32)
    print(len(advcl_p_a_pos_wei))

    # only the filtered vocabulary is actually used
    with open('dict_date/str_id_word_count_delete.file', 'rb') as f:
        str_id_word_count = pickle.load(f)
    print(len(str_id_word_count))

    for count in range(start_id, end_id):
        if count == 6214:
            # document 6214 is skipped by the original pipeline
            continue
        doc_path = '../../icwsm09stories_date/spacy_data/' + str(count) + ".pkl"
        print(doc_path)
        data = pd.read_pickle(doc_path)
        doc = data['doc']
        for book in doc:
            # each `book` is one document
            last_setence = []
            last_congju = []
            for sen in book:
                # breadth-first traversal of the parse to obtain the relations
                # between clauses (about 2 s on average)
                total_clause, total_clause_str, total_clause_level, total_clause_level_str, len_total_congju = \
                    extract_API.parse_setence(sen)
                advcl_p_a_pos_wei, advcl_p_b_pos_wei = \
                    extract_API.cal_intra_advcl_wei_ab(sen, total_clause_level, advcl_p_a_pos_wei,
                                                       advcl_p_b_pos_wei, str_id_word_count)
                conj_p_a_pos_wei, conj_p_b_pos_wei = \
                    extract_API.cal_intra_conj_wei_ab(sen, total_clause_level, conj_p_a_pos_wei,
                                                      conj_p_b_pos_wei, str_id_word_count)
                inter_level_p_a_pos, inter_level_p_b_pos, last_setence, last_congju = \
                    extract_API.cal_p_a_b_inter_num_weight_dis_ab(sen, total_clause, inter_level_p_a_pos,
                                                                  inter_level_p_b_pos, last_setence, last_congju,
                                                                  str_id_word_count)

        if count % 1000 == 0:
            np.save("database/advcl_p_a_pos_wei" + str(start_id) + ".npy", advcl_p_a_pos_wei)
            np.save("database/advcl_p_b_pos_wei" + str(start_id) + ".npy", advcl_p_b_pos_wei)
            np.save("database/conj_p_a_pos_wei" + str(start_id) + ".npy", conj_p_a_pos_wei)
            np.save("database/conj_p_b_pos_wei" + str(start_id) + ".npy", conj_p_b_pos_wei)
            np.save("database/inter_level_p_a_pos" + str(start_id) + ".npy", inter_level_p_a_pos)
            np.save("database/inter_level_p_b_pos" + str(start_id) + ".npy", inter_level_p_b_pos)
            print_time()
            print(count, 'checkpoint saved')
            print(count, '---' * 45)
    np.save("database/advcl_p_a_pos_wei" + str(start_id) + ".npy", advcl_p_a_pos_wei)
    np.save("database/advcl_p_b_pos_wei" + str(start_id) + ".npy", advcl_p_b_pos_wei)
    np.save("database/conj_p_a_pos_wei" + str(start_id) + ".npy", conj_p_a_pos_wei)
    np.save("database/conj_p_b_pos_wei" + str(start_id) + ".npy", conj_p_b_pos_wei)
    np.save("database/inter_level_p_a_pos" + str(start_id) + ".npy", inter_level_p_a_pos)
    np.save("database/inter_level_p_b_pos" + str(start_id) + ".npy", inter_level_p_b_pos)
    print_time()
    print('checkpoint saved')
    print('---' * 45)
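Each run saves its arrays with a start_id suffix, so several shards run in parallel can be summed afterwards. A hedged merge sketch; the shard boundaries and output name are assumptions matching the np.save calls above:

import numpy as np

total = None
for start_id in (0, 1000, 2000):  # hypothetical shard boundaries
    part = np.load('database/conj_p_a_pos_wei' + str(start_id) + '.npy')
    total = part if total is None else total + part
np.save('database/conj_p_a_pos_wei_all.npy', total)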