def cal_mcs(pkl_dir, mcs_dir, is_front, key_word, lap=1):
    f_list = util.get_file_list(pkl_dir, '.pkl')
    os.chdir(pkl_dir)
    # sort file names in ascending order
    nw_list = sorted(f_list)
    record_list = []
    num_list = []
    enum_list = []
    ii = len(nw_list) - 1
    # g2 is network No. 2, g1 is network No. 1; start from the last network
    g2 = util.get_nw(nw_list[ii])
    # iteratively build the common subgraph over the window
    k = 1
    while k < lap:
        g2 = mcs(g2, util.get_nw(nw_list[ii - k]))
        k += 1
    while ii > 0:
        jj = ii
        ii -= lap
        # print(nw_list[ii])
        g1 = util.get_nw(nw_list[ii])
        # iteratively build the common subgraph over the window
        k = 1
        while k < lap:
            g1 = mcs(g1, util.get_nw(nw_list[ii - k]))
            k += 1
        # keep the pre-intersection graph so the "front" ratio can compare against graph 1
        g1_front = g1
        # build the connected common subgraph of the two windows
        g1 = mcs(g2, g1)
        # build the output file name
        filename = nw_list[ii][0:-4] + '-' + nw_list[jj][0:-4] + '.pkl'
        # 2016-09-20: save the intermediate result for testing
        util.save_nw(
            g1,
            'D://semantic analysis//analyze_data//result//过程结果//连通子图//' + filename)
        if is_front:
            # compute the ratio: common subgraph of 1 and 2 compared with graph 1
            pr = mcs_ratio(g1, g1_front, key_word)
            record_list.append(nw_list[jj][0:-4] + '\t' + str(pr))
        else:
            # compute the ratio: common subgraph of 1 and 2 compared with graph 2
            pr = mcs_ratio(g1, g2, key_word)
            record_list.append(nw_list[jj][0:-4] + '\t' + str(pr))
        num_list.append(nw_list[jj][0:-4] + '\t' + str(g1.number_of_nodes()))
        enum_list.append(nw_list[jj][0:-4] + '\t' + str(g1.number_of_edges()))
        # count the nodes
        # with open(mcs_dir + filename[0:-4] + '.txt', 'w', encoding='utf-8') as file:
        #     for node in g1.nodes():
        #         file.write(node + '\n')
        # util.save_nw(g1, mcs_dir + filename)
        g2 = g1
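# `mcs` and `mcs_ratio` are helpers defined elsewhere in this repo and are not shown
# in this listing. Purely as an illustration of how they are used above, a minimal
# sketch could look like the following; the signatures, the node/edge-intersection
# semantics, and the neighbour-based ratio are assumptions, not the repo's code.
import networkx as nx

def mcs(ga, gb):
    # sketch: keep the nodes shared by both graphs and the edges present in both
    g = nx.Graph()
    g.add_nodes_from(set(ga.nodes()) & set(gb.nodes()))
    g.add_edges_from(e for e in ga.edges() if gb.has_edge(*e))
    return g

def mcs_ratio(sub, base, key_word):
    # sketch: share of the key word's neighbours in `base` that survive in `sub`
    if key_word not in base or base.degree(key_word) == 0:
        return 0.0
    sub_nbrs = set(sub.neighbors(key_word)) if key_word in sub else set()
    return len(sub_nbrs) / base.degree(key_word)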
def main():
    # directory where results are saved
    result_dir = r'D:\semantic analysis\新结果\共现网络//'
    txt_dir = r"D:\semantic analysis\新纯文本\1常用词//"
    # k_list = util.get_key_list()
    # k_list = ['不约而同', '喜闻乐见', '努力', '感觉', '简单', '无聊', '希望', '美好']
    # focus words
    k_list = ['美好']
    # path of the jieba segmentation dictionary
    # jieba.set_dictionary("D:\semantic analysis\分词\词库\导出结果\dict1.txt")
    # jieba.initialize()
    pynlpir.open()
    for key in k_list:
        print(key)
        pynlpir.nlpir.AddUserWord(c_char_p(key.encode()))
    for key in k_list:
        print(key)
        # input file directory
        file_list = util.get_file_list(txt_dir + key, ".txt")
        # create the output directories
        mk_dir(result_dir + key)
        # mk_dir(result_dir + key + '//w')
        mk_dir(result_dir + key + '//p')
        for n_file in file_list:
            s_list = util.get_list_from_file(txt_dir + key + "//" + n_file)
            # drop duplicate sentences to avoid counting them twice
            print(len(s_list))
            s_list = list(set(s_list))
            print(len(s_list))
            # build the networks for all sentences
            # ps_list, mn, pps_list, pmn = create_matrix(s_list, key)
            pps_list, pmn = create_matrix(s_list, key)
            pkl_name = n_file[:-4] + '.pkl'
            # for w_list in ps_list:
            #     # build the network of the whole sentence
            #     mn.add_edges(w_list)
            # util.save_nw(mn.get_network(), result_dir + key + '//w//' + pkl_name)
            for w_list in pps_list:
                # pmn.add_edges(w_list)
                pmn.add_gram_edges(w_list)
            util.save_nw(pmn.get_network(), result_dir + key + '//p//' + pkl_name)
            print(n_file)
            print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
            with open(result_dir + key + '//record.txt', 'a', encoding='utf-8') as rf:
                rf.write(n_file + '\n')
    pynlpir.close()
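# `create_matrix`, `mk_dir`, and the `pmn` object returned above come from other
# modules in this repo that are not part of this listing. As an illustration only
# (the name, methods, and adjacency-edge behaviour are assumptions), a builder with
# the same interface as `pmn` could be as small as:
import networkx as nx

class GramNetwork:
    def __init__(self):
        self._g = nx.Graph()

    def add_gram_edges(self, w_list):
        # connect each token to its right-hand neighbour (adjacency co-occurrence)
        for a, b in zip(w_list, w_list[1:]):
            if a != b:
                self._g.add_edge(a, b)

    def get_network(self):
        return self._g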
def main():
    k_list = util.get_key_list()
    jieba.set_dictionary(r"D:\semantic analysis\分词\词库\导出结果\dict1.txt")
    jieba.initialize()
    for key in k_list:
        print(key)
        file_list = util.get_file_list(
            'D://semantic analysis//analyze_data//fc//' + key, ".txt")
        # create the output directories
        mk_dir('./w')
        mk_dir('./p')
        for n_file in file_list:
            s_list = util.get_list_from_file(n_file)
            # drop duplicate sentences to avoid counting them twice
            print(len(s_list))
            s_list = list(set(s_list))
            print(len(s_list))
            wg = nx.Graph()
            pg = nx.Graph()
            for sentence in s_list:
                # build the network of the whole sentence
                ll = util.input_filer(sentence)
                wg = add_s2g(wg, ' '.join(ll))
                # build the network only from the clauses that contain the keyword
                for ss in ll:
                    if key in ss:
                        pg = add_s2g(pg, ss)
            pkl_name = n_file[:-4] + '.pkl'
            util.save_nw(pg, './/p//' + pkl_name)
            util.save_nw(wg, './/w//' + pkl_name)
            print(n_file)
            print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
            with open('record.txt', 'a', encoding='utf-8') as rf:
                rf.write(n_file + '\n')
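# `add_s2g`, `mk_dir`, and `util.input_filer` are helpers defined elsewhere in the
# repo. A rough sketch of what `add_s2g` plausibly does, assuming it segments the
# text with jieba and adds pairwise co-occurrence edges (this is a guess at the
# interface, not the repo's implementation):
import itertools
import jieba
import networkx as nx

def add_s2g(g, text):
    # segment the text and connect every pair of distinct words that co-occur in it
    words = {w for w in jieba.cut(text) if w.strip()}
    for a, b in itertools.combinations(words, 2):
        g.add_edge(a, b)
    return g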
def cal_node_mcs(pkl_dir, mcs_dir, key_word, lap=2):
    f_list = util.get_file_list(pkl_dir, '.pkl')
    os.chdir(pkl_dir)
    # sort file names in ascending order
    nw_list = sorted(f_list)
    record_list = []
    num_list = []
    enum_list = []
    ii = len(nw_list) - 1
    while (ii - lap + 1) >= 0:
        # print(nw_list[ii])
        g1 = util.get_nw(nw_list[ii])
        # iteratively build the common subgraph over the window of `lap` networks
        k = 1
        while k < lap:
            g1 = mcs(g1, util.get_nw(nw_list[ii - k]))
            k += 1
        # build the output file name
        filename = nw_list[ii][0:-4] + '.pkl'
        # save the result (note: this overrides the pkl_dir argument from here on)
        pkl_dir = r"D:\semantic analysis\公共子图节点数\新词\30公共子图//" + key_word + "//"
        util.create_directory(pkl_dir)
        util.save_nw(g1, pkl_dir + nw_list[ii][0:-4])
        num_list.append(nw_list[ii][0:-4] + '\t' + str(g1.number_of_nodes()))
        enum_list.append(nw_list[ii][0:-4] + '\t' + str(g1.number_of_edges()))
        # count the nodes
        # with open(mcs_dir + filename[0:-4] + '.txt', 'w', encoding='utf-8') as file:
        #     for node in g1.nodes():
        #         file.write(node + '\n')
        # util.save_nw(g1, mcs_dir + filename)
        ii -= lap
    # util.save_file(mcs_dir + key_word + 'mcs.txt', record_list)
    util.save_file(mcs_dir + 'n' + key_word + 'mcs.txt', num_list)
    util.save_file(mcs_dir + 'e' + key_word + 'mcs.txt', enum_list)
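# cal_node_mcs walks backwards over the sorted file list in non-overlapping windows
# of `lap` files, so leftover files at the front are skipped. The index arithmetic
# can be replayed on file names alone; the monthly names below are hypothetical
# placeholders used only for this illustration:
names = ['2016-01.pkl', '2016-02.pkl', '2016-03.pkl', '2016-04.pkl', '2016-05.pkl']
lap = 2
ii = len(names) - 1
while (ii - lap + 1) >= 0:
    window = [names[ii - k] for k in range(lap)]
    print(window)  # ['2016-05.pkl', '2016-04.pkl'], then ['2016-03.pkl', '2016-02.pkl']
    ii -= lap
# '2016-01.pkl' is never processed because it cannot fill a complete window.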
import tool.util as util
import os

key_list = util.get_key_list2()

for keyword in key_list:
    print(keyword)
    dirr = 'D:\semantic analysis\常用词的分词集合\\' + keyword
    os.chdir(dirr)
    pkl_list = util.get_file_list(dirr, '.pkl')
    pkl_list = sorted(pkl_list)
    util.create_directory(r"D:\semantic analysis\常用词的分词集合1//" + keyword)
    i = 0
    s = util.get_nw(pkl_list[0])
    while i < len(pkl_list) - 1:
        s1 = util.get_nw(pkl_list[i + 1])
        # intersect each word set with the next one and save the shared words
        util.save_nw(
            s & s1,
            r"D:\semantic analysis\常用词的分词集合1//" + keyword + "//" + pkl_list[i])
        s = s1
        i += 1
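# Illustration of what each saved intersection contains, using the hand-made test
# sets written by the 测试 script below (1.pkl and 2.pkl):
s_a = {"我们", "你们", "他们", "怎么", "天气", "很好", "哈哈"}  # contents of 1.pkl
s_b = {"基本", "数据", "文章", "集合", "天气", "很好", "哈哈"}  # contents of 2.pkl
print(s_a & s_b)  # {'天气', '很好', '哈哈'}: the words the two adjacent periods share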
import tool.util as util
import os

# key_list = util.get_key_list2()
#
# for keyword in key_list:
#     print(keyword)
#     dirr = 'D:\semantic analysis\分词网络\pNet2\\' + keyword + '//p//'
#     os.chdir(dirr)
#     pkl_list = util.get_file_list(dirr, '.pkl')
#     util.create_directory(r"D:\semantic analysis\常用词的分词集合//" + keyword)
#     for pkl in pkl_list:
#         g = util.get_nw(pkl)
#         s = set(g.nodes())
#         util.save_nw(s, r"D:\semantic analysis\常用词的分词集合//" + keyword + "//" + pkl)

# test data: small hand-made word sets for exercising the set scripts
util.save_nw(set(["我们", "你们", "他们", "怎么", "天气", "很好", "哈哈"]), r"D:\semantic analysis\测试//1.pkl")
util.save_nw(set(["基本", "数据", "文章", "集合", "天气", "很好", "哈哈"]), r"D:\semantic analysis\测试//2.pkl")
util.save_nw(set(["重复", "你们", "他们", "怎么", "天气", "消除", "元素"]), r"D:\semantic analysis\测试//3.pkl")
util.save_nw(set(["我们", "排序", "他们", "转换", "类型", "很好", "哈哈"]), r"D:\semantic analysis\测试//4.pkl")
util.save_nw(set(["危机", "为何", "他们", "转换", "不会", "友情", "哈哈"]), r"D:\semantic analysis\测试//5.pkl")
import tool.util as util
import os

key_list = util.get_key_list2()

for keyword in key_list:
    print(keyword)
    dirr = 'D:\semantic analysis\常用词的分词集合\\' + keyword
    os.chdir(dirr)
    pkl_list = util.get_file_list(dirr, '.pkl')
    pkl_list = sorted(pkl_list)
    util.create_directory(r"D:\semantic analysis\2016-10-05结果//" + keyword)
    i = 0
    s = util.get_nw(pkl_list[0])
    while i < len(pkl_list):
        # s accumulates the union of every word set seen so far;
        # s2 holds the words of the current file that have not appeared before
        s1 = util.get_nw(pkl_list[i]) | s
        s2 = s1 - s
        print(len(s2))
        util.save_nw(
            s2,
            r"D:\semantic analysis\2016-10-05结果//" + keyword + "//" + pkl_list[i])
        s = s1
        i += 1
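# The loop above tracks the running union, so "new" means new relative to every
# earlier file, not just the previous one. Replaying it with the test sets from the
# 测试 script above (1.pkl, 2.pkl, 3.pkl):
seen = {"我们", "你们", "他们", "怎么", "天气", "很好", "哈哈"}  # after 1.pkl
cur = {"基本", "数据", "文章", "集合", "天气", "很好", "哈哈"}   # 2.pkl
print((seen | cur) - seen)  # {'基本', '数据', '文章', '集合'}
seen = seen | cur
cur = {"重复", "你们", "他们", "怎么", "天气", "消除", "元素"}   # 3.pkl
print((seen | cur) - seen)  # {'重复', '消除', '元素'}: words already seen in 1.pkl or 2.pkl are not repeated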