def main():
    # Directory where the results are saved
    result_dir = r'D:\semantic analysis\2016-10-05结果\html标记分句2//'
    txt_dir = r"D:\semantic analysis\2016-10-05结果\新词分句//"
    set_dir = r"D:\semantic analysis\2016-10-05结果\新词//"
    k_list = util.get_key_list()
    for key in k_list:
        print(key)
        # Text files for this keyword
        file_list = sorted(util.get_file_list(txt_dir + key, ".txt"))
        # Pickled new-word sets for this keyword
        set_list = sorted(util.get_file_list(set_dir + key, ".pkl"))
        util.create_directory(result_dir + "新词//" + key + "//")
        for i in range(len(file_list)):
            s_list = util.get_list_from_file(
                txt_dir + key + "//" + set_list[i][0:-4] + ".txt")
            new_word_list = util.get_nw(set_dir + key + "//" + set_list[i])
            # Deduplicate sentences to avoid marking the same one twice
            s_list = list(set(s_list))
            w_list = remark(s_list, new_word_list, key)
            html_name = file_list[i][:-4] + '.html'
            util.save_file(result_dir + "新词//" + key + "//" + html_name, w_list)
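# These scripts all depend on a small tool.util module that is not shown.
# Below is a minimal sketch of plausible implementations, inferred purely
# from the call sites; every name and behavior here is an assumption about
# the real module, not its actual code.
import os
import pickle


def get_file_list(directory, suffix):
    # File names in `directory` that end with `suffix`.
    return [f for f in os.listdir(directory) if f.endswith(suffix)]


def get_list_from_file(path):
    # Read a UTF-8 text file into a list of stripped lines.
    with open(path, encoding='utf-8') as f:
        return [line.strip() for line in f]


def save_file(path, lines, newline=True):
    # Write an iterable of strings, optionally one per line.
    with open(path, 'w', encoding='utf-8') as f:
        for line in lines:
            f.write(str(line) + ('\n' if newline else ''))


def get_nw(path):
    # Load a pickled object (a networkx graph or a set, depending on caller).
    with open(path, 'rb') as f:
        return pickle.load(f)


def save_nw(obj, path):
    # Pickle an object to disk.
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


def create_directory(path):
    # Create the directory (and parents) if missing.
    os.makedirs(path, exist_ok=True)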
def loop_compare(keyword_list, pkl_dir1, txt_dir1, result_dir, mode=1, lap=1):
    for key in keyword_list:
        print(key)
        if mode == 0:
            util.create_directory(result_dir + key + "//")
        pkl_dir = pkl_dir1.format(key)
        txt_dir = txt_dir1.format(key)
        # Get the list of dates from the pickle file names
        d_list = util.get_file_list(pkl_dir, '.pkl')
        d_list = [d.split(".")[0] for d in d_list]
        result_list = []
        # Sort the dates ascending
        d_list = sorted(d_list)
        ii = len(d_list) - 1
        while ii - lap >= 0:
            g1 = get_core_graph(pkl_dir + d_list[ii] + ".pkl")
            d1 = get_txt_dict(txt_dir + d_list[ii] + ".txt")
            # Iteratively compose the graphs of the lap window into one
            k = 1
            while k < lap:
                g1 = nx.compose(g1, util.get_nw(d_list[ii - k]))
                k += 1
            # compare_function is expected to be defined at module scope
            result_list.append(compare_function(d1, g1))
            ii -= lap
        util.save_file(result_dir + key + ".txt", result_list)
def loop_compare(com_function, keyword_list, pkl_dir1, result_dir,
                 mode=1, lap=1, type="pkl"):
    for key in keyword_list:
        global keyword
        keyword = key
        print(key)
        if mode == 0:
            util.create_directory(result_dir + key + "//")
        pkl_dir = pkl_dir1.format(key)
        f_list = util.get_file_list(pkl_dir, '.pkl')
        os.chdir(pkl_dir)
        result_list = []
        # Sort file names ascending (they are date-stamped)
        nw_list = sorted(f_list)
        ii = len(nw_list) - 1
        while ii - 2 * lap >= 0:
            g2 = util.get_nw(nw_list[ii])
            # (Iterative composition of g2 over the lap window was disabled here.)
            ii -= lap
            g1 = util.get_nw(nw_list[ii])
            # (Iterative composition of g1 over the lap window was disabled here.)
            # mode 1: mutual ratios between the two graphs
            if mode == 1:
                r1, r2 = com_function(copy.deepcopy(g1), copy.deepcopy(g2))
                result_list.append(nw_list[ii + lap][0:-4] + "\t" + str(r1))
                result_list.append(nw_list[ii][0:-4] + "\t" + str(r2))
            # mode 0: one-to-one comparison, saved per date
            elif mode == 0:
                result_list = com_function(copy.deepcopy(g1), copy.deepcopy(g2))
                util.save_file(
                    result_dir + key + "//" + nw_list[ii + lap][0:-4] + ".txt",
                    result_list)
            # mode 2: n-to-one comparison
            elif mode == 2:
                r1 = com_function(copy.deepcopy(g1), copy.deepcopy(g2), type)
                result_list.append(nw_list[ii + lap][0:-4] + "\t" + str(r1))
            ii -= lap
        if mode != 0:
            result_list.reverse()
        util.save_file(result_dir + key + ".txt", result_list)
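# Hedged usage sketch for loop_compare: node_overlap is a hypothetical
# comparison function (not in the source) showing the shape com_function
# needs for mode=1, where it returns one ratio per graph of the pair.
def node_overlap(g1, g2):
    # Fraction of each graph's nodes that the two graphs share.
    shared = set(g1.nodes()) & set(g2.nodes())
    return (len(shared) / g1.number_of_nodes(),
            len(shared) / g2.number_of_nodes())


# The pickle directory pattern mirrors paths used elsewhere in these
# scripts; the result directory is illustrative only.
loop_compare(node_overlap, util.get_key_list(),
             r"D:\semantic analysis\新结果\去虚词去单字共现网络//{0}//p//",
             r"D:\semantic analysis\结果\重叠比例//", mode=1, lap=1)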
def main1():
    # Alternative configuration kept from an earlier run:
    # date_list = ["2012-08-05", "2011-04-05", "2011-03-28", "2011-10-20", "2012-12-30", "2011-07-30", "2011-06-09", "2012-02-05", "2012-12-16", "2011-08-01", "2011-05-19", "2013-09-01", "2012-08-01", "2013-12-01"]
    # key_list = ["吐槽", "纠结", "淡定", "自拍", "正能量", "山寨", "达人", "腹黑", "接地气", "扯淡", "闷骚", "不明觉厉", "完爆", "人艰不拆"]
    key_list = [
        '努力', '感觉', '简单', '无聊', '希望', '美好',
        '气质', '害怕', '喜欢', '不约而同', '喜闻乐见',
    ]
    # Every keyword uses the same cut-off date
    date_list = ["2013-12-31"] * len(key_list)
    # Directories for results, sentence texts, and intermediate sets
    result_dir = r'D:\semantic analysis\2016-10-09结果\html标记结果//'
    txt_dir = r"D:\semantic analysis\纯文本\常用词分句//"
    set_dir = r"D:\semantic analysis\2016-10-09结果\中间结果//"
    for i, key in enumerate(key_list):
        print(key)
        # Text files for this keyword
        file_list = sorted(util.get_file_list(txt_dir + key, ".txt"))
        # Load every pickled word set for this keyword
        set_dir_list = util.get_file_list(set_dir + key, ".pkl")
        set_list = []
        for set_list_dir in set_dir_list:
            set_list.append(util.get_nw(set_dir + key + "//" + set_list_dir))
            print(set_list_dir)
        util.create_directory(result_dir + key + "//")
        rr = cal_index2(date_list[i], txt_dir + key)
        # For each segment boundary, mark the files up to that boundary
        for j in range(len(rr)):
            k = 0
            while k < rr[j]:
                print(file_list[k][:-4])
                print(rr[j])
                txt_list = util.get_list_from_file(txt_dir + key + "//" + file_list[k])
                w_list = remark(txt_list, set_list[j], key)
                html_name = file_list[k][:-4] + '.html'
                util.save_file(result_dir + key + "//" + html_name, w_list)
                k += 1
def loop_compare(com_function, keyword_list, pkl_dir1, result_dir,
                 mode=1, lap=1, type="pkl"):
    for key in keyword_list:
        print(key)
        if mode == 0:
            util.create_directory(result_dir + key + "//")
        pkl_dir = pkl_dir1.format(key)
        f_list = util.get_file_list(pkl_dir, '.txt')
        os.chdir(pkl_dir)
        result_list = []
        # Sort file names ascending (they are date-stamped)
        nw_list = sorted(f_list)
        ii = len(nw_list) - 1
        while ii - 2 * lap >= 0:
            g2 = util.txt2dict(util.get_list_from_file(nw_list[ii]))
            # Compose the lap window into one graph (only runs when lap > 1;
            # note nx.compose expects graphs, not the dicts loaded above)
            k = 1
            while k < lap:
                g2 = nx.compose(g2, util.get_nw(nw_list[ii - k]))
                k += 1
            ii -= lap
            g1 = util.txt2dict(util.get_list_from_file(nw_list[ii]))
            d1 = util.get_nw(
                r"D:\semantic analysis\新结果\去虚词去单字共现网络//{0}//p//".format(key)
                + nw_list[ii].split(".")[0] + ".pkl")
            k = 1
            while k < lap:
                g1 = nx.compose(g1, util.get_nw(nw_list[ii - k]))
                k += 1
            # mode 1: mutual ratios between the two loaded dicts
            if mode == 1:
                r1, r2 = com_function(copy.deepcopy(g1), copy.deepcopy(g2))
                result_list.append(nw_list[ii + lap][0:-4] + "\t" + str(r1))
                result_list.append(nw_list[ii][0:-4] + "\t" + str(r2))
            # mode 0: one-to-one comparison, saved per date
            elif mode == 0:
                result_list = com_function(copy.deepcopy(g1), copy.deepcopy(g2))
                util.save_file(
                    result_dir + key + "//" + nw_list[ii + lap][0:-4] + ".txt",
                    result_list)
            # mode 2: n-to-one comparison against the pickled network d1
            elif mode == 2:
                r1 = com_function(copy.deepcopy(g1), copy.deepcopy(g2), d1)
                result_list.append(nw_list[ii + lap][0:-4] + "\t" + str(r1))
            ii -= lap
        if mode != 0:
            result_list.reverse()
        util.save_file(result_dir + key + ".txt", result_list)
def main():
    # Directory where the results are saved
    result_dir = r'D:\semantic analysis\新结果\去虚词去单字共现网络//'
    txt_dir = r"D:\semantic analysis\新纯文本\1常用词//"
    # k_list = util.get_key_list()
    # k_list = ['不约而同', '喜闻乐见', '努力', '感觉', '简单', '无聊', '希望', '美好']
    # Target keywords
    k_list = ['希望', '气质', '害怕', '喜欢']
    # Jieba dictionary path (unused; pynlpir is used instead):
    # jieba.set_dictionary(r"D:\semantic analysis\分词\词库\导出结果\dict1.txt")
    # jieba.initialize()
    pynlpir.open()
    # Register each keyword as a user word so the segmenter keeps it whole
    for key in k_list:
        pynlpir.nlpir.AddUserWord(c_char_p(key.encode()))
    for key in k_list:
        print(key)
        file_list = util.get_file_list(txt_dir + key, ".txt")
        # Create the output directories
        util.create_directory(result_dir + key)
        util.create_directory(result_dir + key + '//p')
        for n_file in file_list:
            s_list = util.get_list_from_file(txt_dir + key + "//" + n_file)
            # Deduplicate sentences to avoid counting the same one twice
            print(len(s_list))
            s_list = list(set(s_list))
            print(len(s_list))
            # Build the co-occurrence network over all sentence segments
            pps_list, pmn = create_matrix(s_list, key)
            pkl_name = n_file[:-4] + '.pkl'
            for w_list in pps_list:
                pmn.add_gram_edges(w_list)
            g = pmn.get_network()
            g.remove_edges_from(g.selfloop_edges())
            util.save_nw(g, result_dir + key + '//p//' + pkl_name)
            print(n_file)
            print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
            # Record progress so an interrupted run can be resumed
            with open(result_dir + key + '//record.txt', 'a', encoding='utf-8') as rf:
                rf.write(n_file + '\n')
    pynlpir.close()
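# create_matrix and the pmn builder are defined elsewhere. A minimal sketch
# of the co-occurrence construction they appear to perform, assuming
# adjacent-word (bigram) edges over segmented sentences; `segment` stands in
# for the pynlpir segmentation step and is an assumption, not the real API.
import networkx as nx


def build_cooccurrence_network(sentences, segment):
    # One node per word, one edge per adjacent word pair; self-loops skipped.
    g = nx.Graph()
    for sentence in sentences:
        words = segment(sentence)
        for w1, w2 in zip(words, words[1:]):
            if w1 != w2:
                g.add_edge(w1, w2)
    return g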
def cal_node_mcs(pkl_dir, mcs_dir, key_word, lap=2):
    f_list = util.get_file_list(pkl_dir, '.pkl')
    os.chdir(pkl_dir)
    # Sort file names ascending (they are date-stamped)
    nw_list = sorted(f_list)
    num_list = []
    enum_list = []
    ii = len(nw_list) - 1
    while (ii - lap + 1) >= 0:
        g1 = util.get_nw(nw_list[ii])
        # Fold the lap window into one common subgraph
        k = 1
        while k < lap:
            g1 = mcs(g1, util.get_nw(nw_list[ii - k]))
            k += 1
        # Save the common subgraph
        out_dir = r"D:\semantic analysis\公共子图节点数\新词\30公共子图//" + key_word + "//"
        util.create_directory(out_dir)
        util.save_nw(g1, out_dir + nw_list[ii][0:-4])
        # Record node and edge counts per date
        num_list.append(nw_list[ii][0:-4] + '\t' + str(g1.number_of_nodes()))
        enum_list.append(nw_list[ii][0:-4] + '\t' + str(g1.number_of_edges()))
        ii -= lap
    util.save_file(mcs_dir + 'n' + key_word + 'mcs.txt', num_list)
    util.save_file(mcs_dir + 'e' + key_word + 'mcs.txt', enum_list)
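# The mcs() helper is defined elsewhere. A plausible minimal version,
# assuming it approximates the maximum common subgraph by intersecting
# edge sets (an assumption about the real implementation):
import networkx as nx


def mcs(g1, g2):
    # Normalize undirected edges so (u, v) and (v, u) compare equal,
    # then keep only the edges present in both graphs.
    e1 = {tuple(sorted(e)) for e in g1.edges()}
    e2 = {tuple(sorted(e)) for e in g2.edges()}
    common = nx.Graph()
    common.add_edges_from(e1 & e2)
    return common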
def loop_key2(pkl_dir, result_dir, key_word, lap=1):
    pkl_dir = pkl_dir.format(key_word)
    f_list = util.get_file_list(pkl_dir, '.pkl')
    os.chdir(pkl_dir)
    # Sort file names ascending (they are date-stamped)
    nw_list = sorted(f_list)
    ii = 0
    # g1 starts as the earliest network; g2 walks forward through time
    g1 = util.get_nw(nw_list[ii])
    util.create_directory(result_dir + key_word)
    while ii < len(nw_list) - lap:
        ii += lap
        g2 = util.get_nw(nw_list[ii])
        # Build the output file name
        filename = nw_list[ii][0:-4] + '.txt'
        result_list = extract_new_nodes_attributes(g1, g2)
        util.save_file(result_dir + key_word + "//" + filename, result_list)
        # Accumulate everything seen so far into g1
        g1 = nx.compose(g1, g2)
def loop_key(pkl_dir, result_dir, key_word, lap=1):
    pkl_dir = pkl_dir.format(key_word)
    f_list = util.get_file_list(pkl_dir, '.pkl')
    os.chdir(pkl_dir)
    # Sort file names ascending (they are date-stamped)
    nw_list = sorted(f_list)
    ii = len(nw_list) - 1
    # g2 starts as the latest network; g1 walks backwards through time
    g2 = util.get_nw(nw_list[ii])
    util.create_directory(result_dir + key_word)
    while ii > 0:
        jj = ii
        ii -= lap
        g1 = util.get_nw(nw_list[ii])
        # Build the output file name from the two dates compared
        filename = nw_list[ii][0:-4] + '-' + nw_list[jj][0:-4] + '.txt'
        result_list = cal_connect_real_probability(g1, g2, key_word)
        util.save_file(result_dir + key_word + "//" + filename, result_list)
        g2 = g1
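# Hypothetical driver for the two loops above; the pickle-directory pattern
# mirrors paths used elsewhere in these scripts, and the result directories
# are illustrative assumptions.
if __name__ == '__main__':
    for kw in util.get_key_list():
        loop_key(r"D:\semantic analysis\新结果\去虚词去单字共现网络//{0}//p//",
                 r"D:\semantic analysis\结果\连接概率//", kw, lap=1)
        loop_key2(r"D:\semantic analysis\新结果\去虚词去单字共现网络//{0}//p//",
                  r"D:\semantic analysis\结果\新增节点属性//", kw, lap=1)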
import os

import tool.util as util

key_list = util.get_key_list2()
for keyword in key_list:
    print(keyword)
    dirr = os.path.join(r'D:\semantic analysis\常用词的分词集合', keyword)
    os.chdir(dirr)
    pkl_list = sorted(util.get_file_list(dirr, '.pkl'))
    util.create_directory(r"D:\semantic analysis\常用词的分词集合1//" + keyword)
    # Save the intersection of each pair of consecutive word sets
    s = util.get_nw(pkl_list[0])
    for i in range(len(pkl_list) - 1):
        s1 = util.get_nw(pkl_list[i + 1])
        util.save_nw(
            s & s1,
            r"D:\semantic analysis\常用词的分词集合1//" + keyword + "//" + pkl_list[i])
        s = s1
"2013-12-31", "2013-12-31", "2013-12-31", "2013-12-31", "2013-12-31", "2013-12-31" ] key_list = util.get_key_list2() k = 0 for key in key_list: print(key) dir = "D:\semantic analysis\常用词的分词集合//" # index_list = cal_index2(date_list[k], dir+key) index_list = [100, 125, 150] print(index_list) k += 1 file_list = util.get_file_list(dir + key, ".pkl") set_list = [] # 获取目录下所有set集合 os.chdir(dir + key) for file in file_list: set_list.append(util.get_nw(file)) # print(len(set_list)) rd_list, r_list = cal_difference(index_list, set_list) r_dir = r"D:\semantic analysis\2016-10-09结果\中间结果//" util.create_directory(r_dir + key) i = 0 while i < len(rd_list): print(len(rd_list[i])) print(len(r_list[i])) print(len(rd_list[i]) / len(r_list[i])) # util.save_nw(r_set, r_dir+key+"//"+str(index_list[i]).zfill(3)+".pkl") i += 1
import tool.util as util

root_path = r"D:\semantic analysis\用户信息\dict//"
save_root_path = r"D:\semantic analysis\用户信息\s_dict//"


def combine(src_path, save_path):
    # Merge per-place counts, keeping only the province-level part
    # of each place name (the text before the first space).
    file_list_dict, file_name_list = util.get_objdict_list(src_path, ".txt")
    for file_name, file_dict in file_list_dict.items():
        r_dict = dict()
        for place, num in file_dict.items():
            p_place = place.split(" ")[0]
            r_dict[p_place] = r_dict.get(p_place, 0) + num
        util.save_dict_list(r_dict, save_path + file_name)


py_list = ["tc", "zp", "dd", "sz", "dr", "ms", "fh", "znl"]
for py in py_list:
    util.create_directory(save_root_path + py + "//")
    combine(root_path + py + "//", save_root_path + py + "//")
import os

import tool.util as util

key_list = util.get_key_list2()
for keyword in key_list:
    print(keyword)
    dirr = os.path.join(r'D:\semantic analysis\常用词的分词集合', keyword)
    os.chdir(dirr)
    pkl_list = sorted(util.get_file_list(dirr, '.pkl'))
    util.create_directory(r"D:\semantic analysis\2016-10-05结果//" + keyword)
    # For each date, save the words that are new relative to the
    # accumulated union of all earlier sets
    s = util.get_nw(pkl_list[0])
    for i in range(len(pkl_list)):
        s1 = util.get_nw(pkl_list[i]) | s
        s2 = s1 - s
        print(len(s2))
        util.save_nw(
            s2,
            r"D:\semantic analysis\2016-10-05结果//" + keyword + "//" + pkl_list[i])
        s = s1
import tool.util as util

# Compute per-file word frequencies from each dict .txt file
dict_path = r"D:\semantic analysis\结果\去重频数//"
result_path = r"D:\semantic analysis\结果\去重频率//"
keyword_list = util.get_key_list2() + util.get_key_list()
for key in keyword_list:
    print(key)
    r_dict, file_name_list = util.get_objdict_list(dict_path + key, ".txt")
    for (k, word_dict) in r_dict.items():
        total = 0
        r_f_dict = {}
        # Exclude the keyword itself from the frequency counts
        if key in word_dict:
            word_dict.pop(key)
        for word, value in word_dict.items():
            total += int(value)
        for word, value in word_dict.items():
            # Counts are read from text, so convert before dividing
            r_f_dict[word] = int(value) / total
        util.create_directory(result_path + key + "//")
        util.save_dict_list(r_f_dict, result_path + key + "//" + k)
# Register each keyword as a user word so the segmenter keeps it whole
for key in key_word:
    print(key)
    pynlpir.nlpir.AddUserWord(c_char_p(key.encode()))

result_dir = r"D:\semantic analysis\新结果\去重去虚词去单字词频数//"
fold_list_dir = r"D:\semantic analysis\新纯文本\1常用词分句//"
for key in key_word:
    print(key)
    file_list = sorted(util.get_file_list(fold_list_dir + key, ".txt"))
    # Iterate over this keyword's text files
    for txt_file in file_list:
        print(txt_file)
        # Deduplicate sentences
        s_list = set(
            util.get_list_from_file(fold_list_dir + key + "//" + txt_file))
        # Count word occurrences across the sentences
        rr = count_word(s_list, key)
        # Sort the words by count
        kk = sort_by_value(rr)
        w_list = create_dict_list(kk, rr)
        # Create the output directory and save
        util.create_directory(result_dir + key)
        util.save_file(result_dir + key + "//" + txt_file, w_list, False)
# Shut down the segmenter
pynlpir.close()
"2011-10-16", "2012-02-16", "2012-07-09", "2012-11-19"] ] py_list = ["zp","dd","sz","dr","ms","fh"] date_list_list = [["2010-06-01", "2011-01-21", "2011-07-09", "2011-11-21"]] py_list = ["tc"] root_path = r"D:\semantic analysis\用户信息\dict//" i = 0 while i < len(py_list): py = py_list[i] date_list = date_list_list[i] i += 1 for dd in date_list: user_id_list = extract_user_id(py,dd) place_list = [] place_dict = dict() if user_id_list: for user_id in user_id_list: place = get_place(user_id) if place: place_list.append(place) place_dict = dict((a, place_list.count(a)) for a in place_list) util.create_directory(root_path+py) util.save_dict_list(place_dict, root_path+py+"//"+dd+".txt")
import re

import tool.util as util

key_list = util.get_key_list()
dict_dir = r"D:\semantic analysis\2016-10-09结果\词频1//"
for key in key_list:
    print(key)
    set_dict, file_name = util.get_objdict_list(dict_dir + key, ".txt")
    date_list = util.get_file_list(dict_dir + key, ".txt")
    pattern = re.compile(r"(\d*-\d*)-\d*")
    month_array = pattern.findall(" ".join(date_list))
    # Override: aggregate the whole of 2010 instead of month by month
    month_array = ["2010"]
    util.create_directory(r"D:\semantic analysis\2016-10-12结果\2010年频数统计//" + key)
    # For each period, merge the matching daily frequency dicts
    for month in month_array:
        pattern = re.compile(r"(" + month + r"-\d*-\d*)")
        date_array = pattern.findall(" ".join(date_list))
        print(date_array)
        r_dict = dict()
        for file_date in date_array:
            r_dict = util.union_dicts(set_dict[file_date + ".txt"], r_dict)
        util.save_dict_list(
            r_dict,
            r"D:\semantic analysis\2016-10-12结果\2010年频数统计//" + key + ".txt")
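# util.union_dicts is not shown. A minimal sketch matching how it is called
# above: merge two word->count dicts by summing counts. The int() conversion
# assumes counts may arrive as strings read from the .txt files.
def union_dicts(d1, d2):
    merged = {w: int(c) for w, c in d2.items()}
    for word, count in d1.items():
        merged[word] = merged.get(word, 0) + int(count)
    return merged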
                           db=db_name, charset='UTF8')
    cur = conn.cursor()
    sql_str = "select content from {0} where date = '{1}';".format(
        py_keyword, date_str)
    print(sql_str)
    cur.execute(sql_str)
    # Write one post per block, separated by a blank line
    with open(date_str + '.txt', 'w', encoding='utf8') as w_file:
        for c in cur:
            w_file.write(c[0] + '\n' + '\n')
    cur.close()  # close the cursor
    conn.close()  # close the connection and release its resources


# key_list = ['完爆', '扯淡', '接地气', '正能量', '腹黑', '达人', '闷骚']
key_list = ["喜欢"]
os.chdir(dirr)
for key_word in key_list:
    util.create_directory(key_word)
for key_word in key_list:
    # Dump one text file per recorded date for this keyword
    with open(r"D:\semantic analysis\新纯文本\1新词/date/" + key_word, "r") as file:
        date_list = file.readlines()
    for date in date_list:
        create_txt(key_word, date.strip())