import codecs
import pickle

import numpy as np
import pandas as pd

import extract_API  # project-local clause-parsing and counting helpers
# print_time() is assumed to be a timing/logging helper defined elsewhere in this project.


def extract_text_conj(start_id, end_id, type_kind):
    array_len = 50000
    print(type_kind)
    # Accumulators for conj statistics: marginal counts p(a), p(b) and joint counts p(a, b).
    conj_p_a_pos_wei = np.zeros(shape=array_len, dtype=np.float32)
    conj_p_b_pos_wei = np.zeros(shape=array_len, dtype=np.float32)
    conj_p_a_b_pos_wei = np.zeros(shape=(array_len, array_len), dtype=np.float32)

    with open('dict_date/str_id_' + 'word_count.file', 'rb') as f:
        str_id_word_count = pickle.load(f)
    print(len(str_id_word_count))
    # for word in str_id_word_count:
    #     print(word, str_id_word_count[word])

    # Counter for intra-sentence conj pairs.
    count_conj = 0
    for count in range(start_id, end_id):
        if count == 6214:  # skip this document id
            continue
        doc_path = '../../icwsm09stories_date/spacy_data/' + str(count) + ".pkl"
        print(doc_path)
        date = pd.read_pickle(doc_path)
        doc = date['doc']
        for book in doc:  # each entry is one text
            last_setence = []
            last_congju = []
            for sen in book:
                total_clause, total_clause_str, total_clause_level, total_clause_level_str, len_total_congju = \
                    extract_API.parse_setence(sen)
                conj_p_a_pos_wei, conj_p_b_pos_wei = \
                    extract_API.cal_intra_conj_wei_speci_ab(sen, total_clause_level, conj_p_a_pos_wei,
                                                            conj_p_b_pos_wei, str_id_word_count)
                count_conj, conj_p_a_b_pos_wei = \
                    extract_API.cal_intra_conj_wei_speci(sen, total_clause_level, conj_p_a_b_pos_wei,
                                                         str_id_word_count, count_conj)
        if count % 10 == 0:
            print(count, 'count_conj:', count_conj)
        if count % 1000 == 0:
            # Periodic checkpoint: a crash loses at most 1000 documents of work.
            np.save("database/conj_p_a_pos_wei_speci" + str(start_id) + ".npy", conj_p_a_pos_wei)
            np.save("database/conj_p_b_pos_wei_speci" + str(start_id) + ".npy", conj_p_b_pos_wei)
            np.save("database/conj_p_a_b_pos_wei_speci" + str(start_id) + ".npy", conj_p_a_b_pos_wei)
            print_time()
            print(count, 'saved safely')
            print(count, '---' * 45)
    # Final save for the whole id range.
    np.save("database/conj_p_a_pos_wei_speci" + str(start_id) + ".npy", conj_p_a_pos_wei)
    np.save("database/conj_p_b_pos_wei_speci" + str(start_id) + ".npy", conj_p_b_pos_wei)
    np.save("database/conj_p_a_b_pos_wei_speci" + str(start_id) + ".npy", conj_p_a_b_pos_wei)
    print_time()
    print('saved safely')
    print('---' * 45)
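
# The arrays above hold marginal counts p(a), p(b) and joint counts p(a, b)
# for conj-linked clause pairs. A minimal sketch of how such counts could be
# combined into a PMI-style association score downstream; `score_pmi` and
# `eps` are illustrative assumptions, not existing project code.
def score_pmi(pa, pb, pab, a_id, b_id, eps=1e-8):
    # PMI(a, b) = log( p(a, b) / (p(a) * p(b)) ), normalizing counts by totals.
    p_a = pa[a_id] / max(pa.sum(), eps)
    p_b = pb[b_id] / max(pb.sum(), eps)
    p_ab = pab[a_id, b_id] / max(pab.sum(), eps)
    return float(np.log((p_ab + eps) / (p_a * p_b + eps)))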
def extract_text(start_id, end_id, type_kind):
    array_len = 50000
    print(type_kind)
    # One joint-count matrix per relation type; only the matrix selected by
    # type_kind is filled and saved.
    advcl_p_a_b_pos_wei = np.zeros(shape=(array_len, array_len), dtype=np.float32)
    conj_p_a_b_pos_wei = np.zeros(shape=(array_len, array_len), dtype=np.float32)
    inter_level_p_a_b_pos = np.zeros(shape=(array_len, array_len), dtype=np.float32)

    # Only the filtered ("delete") word-count dictionary is used.
    with open('dict_date/str_id_' + 'word_count_delete.file', 'rb') as f:
        str_id_word_count = pickle.load(f)
    print(len(str_id_word_count))

    # Counters for intra-sentence and inter-sentence pairs.
    count_advcl = 0
    count_conj = 0
    count_inter = 0
    for count in range(start_id, end_id):
        if count == 6214:  # skip this document id
            continue
        doc_path = '../../icwsm09stories_date/spacy_data/' + str(count) + ".pkl"
        print(doc_path)
        date = pd.read_pickle(doc_path)
        doc = date['doc']
        for book in doc:  # each entry is one text
            last_setence = []
            last_congju = []
            for sen in book:
                total_clause, total_clause_str, total_clause_level, total_clause_level_str, len_total_congju = \
                    extract_API.parse_setence(sen)
                # Intra-sentence relations.
                if type_kind == 'advcl_p_a_b_pos_wei':
                    count_advcl, advcl_p_a_b_pos_wei = \
                        extract_API.cal_intra_advcl_wei(sen, total_clause_level, advcl_p_a_b_pos_wei,
                                                        str_id_word_count, count_advcl)
                elif type_kind == 'conj_p_a_b_pos_wei':
                    count_conj, conj_p_a_b_pos_wei = \
                        extract_API.cal_intra_conj_wei(sen, total_clause_level, conj_p_a_b_pos_wei,
                                                       str_id_word_count, count_conj)
                elif type_kind == 'inter_level_p_a_b_pos':
                    count_inter, inter_level_p_a_b_pos, last_setence, last_congju = \
                        extract_API.cal_p_a_b_inter_num_weight_dis(sen, total_clause, inter_level_p_a_b_pos,
                                                                   str_id_word_count, last_setence,
                                                                   last_congju, count_inter)
        if count % 10 == 0:
            print(count, 'count_advcl:', count_advcl, 'count_conj:', count_conj, 'count_inter:', count_inter)
        if count % 1000 == 0:
            # Periodic checkpoint for the selected relation type.
            if type_kind == 'advcl_p_a_b_pos_wei':
                np.save("database/advcl_p_a_b_pos_wei" + str(start_id) + ".npy", advcl_p_a_b_pos_wei)
            elif type_kind == 'conj_p_a_b_pos_wei':
                np.save("database/conj_p_a_b_pos_wei" + str(start_id) + ".npy", conj_p_a_b_pos_wei)
            elif type_kind == 'inter_level_p_a_b_pos':
                np.save("database/inter_level_p_a_b_pos" + str(start_id) + ".npy", inter_level_p_a_b_pos)
            print_time()
            print(count, 'saved safely')
            print(count, '---' * 45)
    # Final save for the whole id range.
    if type_kind == 'advcl_p_a_b_pos_wei':
        np.save("database/advcl_p_a_b_pos_wei" + str(start_id) + ".npy", advcl_p_a_b_pos_wei)
    elif type_kind == 'conj_p_a_b_pos_wei':
        np.save("database/conj_p_a_b_pos_wei" + str(start_id) + ".npy", conj_p_a_b_pos_wei)
    elif type_kind == 'inter_level_p_a_b_pos':
        np.save("database/inter_level_p_a_b_pos" + str(start_id) + ".npy", inter_level_p_a_b_pos)
    print_time()
    print('saved safely')
    print('---' * 45)
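
# The three-way save branches above repeat one np.save pattern. A minimal
# sketch of a helper that could factor this out; `save_accumulator` and its
# `prefix` parameter are illustrative assumptions, not existing project code.
def save_accumulator(type_kind, start_id, arrays, prefix=''):
    # arrays: dict mapping a type_kind string to its accumulator array.
    np.save("database/" + prefix + type_kind + str(start_id) + ".npy", arrays[type_kind])
# e.g. save_accumulator('conj_p_a_b_pos_wei', start_id,
#                       {'conj_p_a_b_pos_wei': conj_p_a_b_pos_wei})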
def extract_text_up(start_id, end_id, type_kind):
    print('extract_text_up')
    # Load the stop-word list once; set membership gives O(1) lookups.
    with codecs.open('stop_word.txt', 'r', encoding='utf-8') as f:
        stopkey = set(w.strip() for w in f)
    array_len = 50000
    print(type_kind)
    # One accumulator per relation type; only the one selected by type_kind is used.
    advcl_p_a_b_pos_wei = np.zeros(shape=(array_len, array_len), dtype=np.float32)
    conj_p_a_b_pos_wei = np.zeros(shape=(array_len, array_len), dtype=np.float32)
    inter_level_p_a_b_pos = np.zeros(shape=(array_len, array_len), dtype=np.float32)
    inter_level_p_a_b_pos_weight = np.zeros(shape=(array_len, array_len), dtype=np.float32)

    # Only the filtered ("delete") word-count dictionary is used.
    with open('dict_date/str_id_' + 'word_count_delete.file', 'rb') as f:
        str_id_word_count = pickle.load(f)
    print(len(str_id_word_count))

    count_advcl = 0
    count_conj = 0
    count_inter = 0
    count_inter_weight = 0
    for count in range(start_id, end_id):
        if count == 6214:  # skip this document id
            continue
        doc_path = '../../icwsm09stories_date/spacy_data/' + str(count) + ".pkl"
        print(doc_path)
        date = pd.read_pickle(doc_path)
        doc = date['doc']
        for book in doc:  # each entry is one text
            last_setence = []
            last_congju = []
            for sen in book:
                total_clause, total_clause_str, total_clause_level, total_clause_level_str, len_total_congju = \
                    extract_API.parse_setence(sen)
                # Intra-sentence relations.
                if type_kind == 'advcl_p_a_b_pos_wei':
                    count_advcl, advcl_p_a_b_pos_wei = \
                        extract_API.cal_intra_advcl_wei_up(sen, total_clause_level, advcl_p_a_b_pos_wei,
                                                           str_id_word_count, count_advcl, stopkey)
                elif type_kind == 'conj_p_a_b_pos_wei':
                    count_conj, conj_p_a_b_pos_wei = \
                        extract_API.cal_intra_conj_wei_up(sen, total_clause_level, conj_p_a_b_pos_wei,
                                                          str_id_word_count, count_conj, stopkey)
                elif type_kind == 'inter_level_p_a_b_pos':
                    count_inter, inter_level_p_a_b_pos, last_setence, last_congju = \
                        extract_API.cal_p_a_b_inter_num_weight_dis_up(sen, total_clause, inter_level_p_a_b_pos,
                                                                      str_id_word_count, last_setence,
                                                                      last_congju, count_inter, stopkey)
                elif type_kind == 'inter_level_p_a_b_pos_weight':
                    # Pass count_inter_weight here so the weighted counter is threaded correctly.
                    count_inter_weight, inter_level_p_a_b_pos_weight, last_setence, last_congju = \
                        extract_API.cal_p_a_b_inter_num_weight_dis_weight_up(sen, total_clause,
                                                                             inter_level_p_a_b_pos_weight,
                                                                             str_id_word_count, last_setence,
                                                                             last_congju, count_inter_weight,
                                                                             stopkey)
        if count % 10 == 0:
            print(count, 'count_advcl:', count_advcl, 'count_conj:', count_conj,
                  'count_inter:', count_inter, 'count_inter_weight:', count_inter_weight)
        if count % 1000 == 0:
            # Periodic checkpoint for the selected relation type.
            if type_kind == 'advcl_p_a_b_pos_wei':
                np.save("database/up_advcl_p_a_b_pos_wei" + str(start_id) + ".npy", advcl_p_a_b_pos_wei)
            elif type_kind == 'conj_p_a_b_pos_wei':
                np.save("database/up_conj_p_a_b_pos_wei" + str(start_id) + ".npy", conj_p_a_b_pos_wei)
            elif type_kind == 'inter_level_p_a_b_pos':
                np.save("database/up_inter_level_p_a_b_pos" + str(start_id) + ".npy", inter_level_p_a_b_pos)
            elif type_kind == 'inter_level_p_a_b_pos_weight':
                np.save("database/up_inter_level_p_a_b_pos_weight" + str(start_id) + ".npy",
                        inter_level_p_a_b_pos_weight)
            print_time()
            print(count, 'saved safely')
            print(count, '---' * 45)
    # Final save for the whole id range.
    if type_kind == 'advcl_p_a_b_pos_wei':
        np.save("database/up_advcl_p_a_b_pos_wei" + str(start_id) + ".npy", advcl_p_a_b_pos_wei)
    elif type_kind == 'conj_p_a_b_pos_wei':
        np.save("database/up_conj_p_a_b_pos_wei" + str(start_id) + ".npy", conj_p_a_b_pos_wei)
    elif type_kind == 'inter_level_p_a_b_pos':
        np.save("database/up_inter_level_p_a_b_pos" + str(start_id) + ".npy", inter_level_p_a_b_pos)
    elif type_kind == 'inter_level_p_a_b_pos_weight':
        np.save("database/up_inter_level_p_a_b_pos_weight" + str(start_id) + ".npy", inter_level_p_a_b_pos_weight)
    print_time()
    print('saved safely')
    print('---' * 45)
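
# Each run saves its accumulators suffixed with start_id, so independent id
# ranges can be processed in parallel and merged by summing counts. A minimal
# sketch under that assumption; `merge_shards` and the shard list below are
# illustrative, not existing project code.
def merge_shards(name, shard_starts, out_path):
    total = None
    for s in shard_starts:
        part = np.load("database/" + name + str(s) + ".npy")
        total = part if total is None else total + part
    np.save(out_path, total)
# e.g. merge_shards('up_conj_p_a_b_pos_wei', [0, 5000], 'database/up_conj_p_a_b_pos_wei_all.npy')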
def extract_text_ab(start_id, end_id):
    # Single-pass variant that accumulates the marginal p(a)/p(b) arrays for
    # all three relation types at once; named extract_text_ab so it does not
    # shadow the dispatching extract_text defined above.
    array_len = 50000
    advcl_p_a_pos_wei = np.zeros(shape=array_len, dtype=np.float32)
    advcl_p_b_pos_wei = np.zeros(shape=array_len, dtype=np.float32)
    conj_p_a_pos_wei = np.zeros(shape=array_len, dtype=np.float32)
    conj_p_b_pos_wei = np.zeros(shape=array_len, dtype=np.float32)
    inter_level_p_a_pos = np.zeros(shape=array_len, dtype=np.float32)
    inter_level_p_b_pos = np.zeros(shape=array_len, dtype=np.float32)
    print(len(advcl_p_a_pos_wei))

    # Only the filtered ("delete") word-count dictionary is used.
    with open('dict_date/str_id_' + 'word_count_delete.file', 'rb') as f:
        str_id_word_count = pickle.load(f)
    print(len(str_id_word_count))

    for count in range(start_id, end_id):
        if count == 6214:  # skip this document id
            continue
        doc_path = '../../icwsm09stories_date/spacy_data/' + str(count) + ".pkl"
        print(doc_path)
        date = pd.read_pickle(doc_path)
        doc = date['doc']
        for book in doc:  # each entry is one text
            last_setence = []
            last_congju = []
            for sen in book:
                # Debug helper (kept from the original, commented out): print each
                # token's text, lemma, POS, dependency relation, head, id and children.
                # for token in sen:
                #     print(token.text, token.lemma_, token.pos_, token.dep_, token.head)
                # Inter-clause relations are obtained by breadth-first traversal
                # of the dependency tree; roughly 2 s per sentence on average.
                total_clause, total_clause_str, total_clause_level, total_clause_level_str, len_total_congju = \
                    extract_API.parse_setence(sen)
                advcl_p_a_pos_wei, advcl_p_b_pos_wei = \
                    extract_API.cal_intra_advcl_wei_ab(sen, total_clause_level, advcl_p_a_pos_wei,
                                                       advcl_p_b_pos_wei, str_id_word_count)
                conj_p_a_pos_wei, conj_p_b_pos_wei = \
                    extract_API.cal_intra_conj_wei_ab(sen, total_clause_level, conj_p_a_pos_wei,
                                                      conj_p_b_pos_wei, str_id_word_count)
                inter_level_p_a_pos, inter_level_p_b_pos, last_setence, last_congju = \
                    extract_API.cal_p_a_b_inter_num_weight_dis_ab(sen, total_clause, inter_level_p_a_pos,
                                                                  inter_level_p_b_pos, last_setence,
                                                                  last_congju, str_id_word_count)
        if count % 1000 == 0:
            # Periodic checkpoint: a crash loses at most 1000 documents of work.
            np.save("database/advcl_p_a_pos_wei" + str(start_id) + ".npy", advcl_p_a_pos_wei)
            np.save("database/advcl_p_b_pos_wei" + str(start_id) + ".npy", advcl_p_b_pos_wei)
            np.save("database/conj_p_a_pos_wei" + str(start_id) + ".npy", conj_p_a_pos_wei)
            np.save("database/conj_p_b_pos_wei" + str(start_id) + ".npy", conj_p_b_pos_wei)
            np.save("database/inter_level_p_a_pos" + str(start_id) + ".npy", inter_level_p_a_pos)
            np.save("database/inter_level_p_b_pos" + str(start_id) + ".npy", inter_level_p_b_pos)
            print_time()
            print(count, 'saved safely')
            print(count, '---' * 45)
    # Final save for the whole id range.
    np.save("database/advcl_p_a_pos_wei" + str(start_id) + ".npy", advcl_p_a_pos_wei)
    np.save("database/advcl_p_b_pos_wei" + str(start_id) + ".npy", advcl_p_b_pos_wei)
    np.save("database/conj_p_a_pos_wei" + str(start_id) + ".npy", conj_p_a_pos_wei)
    np.save("database/conj_p_b_pos_wei" + str(start_id) + ".npy", conj_p_b_pos_wei)
    np.save("database/inter_level_p_a_pos" + str(start_id) + ".npy", inter_level_p_a_pos)
    np.save("database/inter_level_p_b_pos" + str(start_id) + ".npy", inter_level_p_b_pos)
    print_time()
    print('saved safely')
    print('---' * 45)
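
# A minimal sketch of how these extractors might be driven; the id range and
# the chosen type_kind below are illustrative assumptions, not values taken
# from the original code.
if __name__ == '__main__':
    extract_text_ab(0, 5000)                        # marginal p(a)/p(b) counts
    extract_text(0, 5000, 'conj_p_a_b_pos_wei')     # joint counts for one relation type
    extract_text_up(0, 5000, 'conj_p_a_b_pos_wei')  # stop-word-filtered variant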