Example #1
def main():
    # Directory where the results are saved
    result_dir = r'D:\semantic analysis\2016-10-05结果\html标记分句2//'
    txt_dir = r"D:\semantic analysis\2016-10-05结果\新词分句//"
    set_dir = r"D:\semantic analysis\2016-10-05结果\新词//"

    k_list = util.get_key_list()

    for key in k_list:
        print(key)
        # File list
        file_list = sorted(util.get_file_list(txt_dir + key, ".txt"))
        # Set (.pkl) file list
        set_list = sorted(util.get_file_list(set_dir + key, ".pkl"))

        util.create_directory(result_dir + "新词//" + key + "//")

        i = 0
        while i < len(file_list):
            s_list = util.get_list_from_file(txt_dir + key + "//" +
                                             set_list[i][0:-4] + ".txt")
            new_word_list = util.get_nw(set_dir + key + "//" + set_list[i])
            # Filter out duplicate sentences to avoid redundant computation
            s_list = list(set(s_list))
            w_list = remark(s_list, new_word_list, key)
            html_name = file_list[i][:-4] + '.html'
            util.save_file(result_dir + "新词//" + key + "//" + html_name,
                           w_list)
            i += 1
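Examples #1, #2 and #11 all call a remark() helper that is not included in these snippets. The sketch below is only an illustration of the assumed contract: wrap every occurrence of a new word in an HTML tag and return one line of markup per sentence (the key argument is accepted but unused here).

def remark(sentence_list, new_word_list, key):
    # Hypothetical highlighter: bold each new word and emit one <p> per sentence.
    html_lines = []
    for sentence in sentence_list:
        marked = sentence
        for word in new_word_list:
            marked = marked.replace(word, '<b>' + word + '</b>')
        html_lines.append('<p>' + marked + '</p>')
    return html_lines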
Example #2
def main1():
    # date_list = ["2012-08-05","2011-04-05","2011-03-28","2011-10-20","2012-12-30","2011-07-30","2011-06-09","2012-02-05","2012-12-16","2011-08-01","2011-05-19","2013-09-01","2012-08-01","2013-12-01"]
    # key_list = ["吐槽","纠结","淡定","自拍","正能量","山寨","达人","腹黑","接地气","扯淡","闷骚","不明觉厉","完爆","人艰不拆"]
    date_list = [
        "2013-12-31", "2013-12-31", "2013-12-31", "2013-12-31", "2013-12-31",
        "2013-12-31", "2013-12-31", "2013-12-31", "2013-12-31", "2013-12-31",
        "2013-12-31"
    ]
    key_list = [
        '努力',
        '感觉',
        '简单',
        '无聊',
        '希望',
        '美好',
        '气质',
        '害怕',
        '喜欢',
        '不约而同',
        '喜闻乐见',
    ]

    # Directory where the results are saved
    result_dir = r'D:\semantic analysis\2016-10-09结果\html标记结果//'
    txt_dir = r"D:\semantic analysis\纯文本\常用词分句//"
    set_dir = r"D:\semantic analysis\2016-10-09结果\中间结果//"

    i = 0

    while i < len(key_list):
        key = key_list[i]
        print(key)
        # File list
        file_list = sorted(util.get_file_list(txt_dir + key, ".txt"))
        # Set (.pkl) file list
        set_dir_list = util.get_file_list(set_dir + key, ".pkl")
        set_list = []
        for set_list_dir in set_dir_list:
            set_list.append(util.get_nw(set_dir + key + "//" + set_list_dir))
            print(set_list_dir)

        util.create_directory(result_dir + key + "//")
        rr = cal_index2(date_list[i], txt_dir + key_list[i])
        j = 0
        # For each segment
        while j < len(rr):
            k = 0
            while k < rr[j]:
                print(file_list[k][:-4])
                print(rr[j])
                txt_list = util.get_list_from_file(txt_dir + key + "//" +
                                                   file_list[k])
                w_list = remark(txt_list, set_list[j], key)
                html_name = file_list[k][:-4] + '.html'
                util.save_file(result_dir + key + "//" + html_name, w_list)
                k += 1
            j += 1
        i += 1
Example #3
def cal_union_set(keyword, pkl_dir):
    set_list = sorted(util.get_file_list(pkl_dir, "pkl"))
    r_list = []
    d_list = []
    i = 0
    r_set0 = set()
    # Cumulative size of the union of the daily word sets, one entry per file
    while i < len(set_list):
        r_set0 = r_set0 | util.get_nw(pkl_dir + set_list[i])
        r_list.append(set_list[i][0:-4] + '\t' + str(len(r_set0)))
        d_list.append(len(r_set0))
        i += 1

    i = 0
    r_list2 = []
    d_list2 = []
    # Day-over-day growth of the union (how many new words each file adds)
    while i < len(r_list) - 1:
        r_list2.append(set_list[i][0:-4] + '\t' + str(d_list[i + 1] - d_list[i]))
        d_list2.append(d_list[i + 1] - d_list[i])
        i += 1

    i = 15
    r_list3 = []
    # From index 15 onward, growth relative to the cumulative union size
    while i < len(d_list2):
        r_list3.append(set_list[i][0:-4] + '\t' + str(d_list2[i] / d_list[i]))
        i += 1

    return r_list, r_list2, r_list3
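A hypothetical call of cal_union_set(); the directory and output paths below are placeholders, and util.save_file is assumed to take a path plus a list of strings, as in the examples above.

r1, r2, r3 = cal_union_set('达人', r'D:\path\to\pkl_sets//')   # placeholder directory
util.save_file(r'D:\path\to\union_size.txt', r1)    # cumulative union size per file
util.save_file(r'D:\path\to\daily_growth.txt', r2)  # new words contributed by each file
util.save_file(r'D:\path\to\growth_ratio.txt', r3)  # relative growth from index 15 on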
Example #4
def cal_mcs(div, num, pkl_dir):
    os.chdir(pkl_dir)
    pkl_list = util.get_file_list(pkl_dir, '.pkl')
    # Sort in ascending order
    pkl_list = sorted(pkl_list)
    # part_num = len(pkl_list) // div
    part_num = 200
    print("part_nume")
    print(part_num)
    if part_num < num:
        print("参数有误")
        return
    i = 0
    set_list = []
    # Loop over each block
    while i < div:
        # Perform the merge
        j = 0
        r_set = set()
        while j < num:
            pkl_num = part_num * i + j
            set1 = util.get_nw(pkl_list[pkl_num])
            r_set = set1 | r_set
            j += 1
        # Add the merged result to the list
        set_list.append(r_set)
        print(len(r_set))
        i += 1
Example #5
def loop_compare(keyword_list, pkl_dir1, txt_dir1, result_dir, mode=1, lap=1):
    for key in keyword_list:
        print(key)
        if mode == 0:
            util.create_directory(result_dir + key + "//")

        pkl_dir = pkl_dir1.format(key)
        txt_dir = txt_dir1.format(key)

        # Get the list of dates
        d_list = util.get_file_list(pkl_dir, '.pkl')
        d_list = [d.split(".")[0] for d in d_list]

        result_list = []
        # Sort in ascending order
        d_list = sorted(d_list)
        ii = len(d_list) - 1

        while ii - lap >= 0:
            g1 = get_core_graph(pkl_dir + d_list[ii] + ".pkl")
            d1 = get_txt_dict(txt_dir + d_list[ii] + ".txt")

            # Iteratively compose the previous `lap` graphs into one
            k = 1
            while k < lap:
                g1 = nx.compose(g1, util.get_nw(pkl_dir + d_list[ii - k] + ".pkl"))
                k += 1
            result_list.append(compare_function(d1, g1))
            ii -= lap
        util.save_file(result_dir + key + ".txt", result_list)
Example #6
def cal_index2(date, pkl_dir):
    r_list = []
    f_list = util.get_file_list(pkl_dir, '.txt')
    import os
    os.chdir(pkl_dir)
    # Sort in ascending order
    set_list = sorted(f_list)

    date2 = set_list[0][0:-4]
    i = 0
    date1 = date
    # Find the index of the split point
    while i < len(set_list) and util.compare_date(date1, date2) > 0.0:
        date2 = set_list[i][0:-4]
        i += 1
    r_list.append(i)

    j = 0
    part = 2
    part_num = (len(f_list) - i) // part
    while j < part - 1:
        j += 1
        r_list.append(i + j * part_num)

    r_list.append(len(f_list))
    return r_list
Example #7
def cal_mcs(pkl_dir, mcs_dir, is_front, key_word, lap=1):
    f_list = util.get_file_list(pkl_dir, '.pkl')
    os.chdir(pkl_dir)
    # Sort in ascending order
    nw_list = sorted(f_list)

    record_list = []
    num_list = []
    enum_list = []
    ii = len(nw_list) - 1
    # g2 is graph 2 and g1 is graph 1; start from the most recent network
    g2 = util.get_nw(nw_list[ii])

    # Iteratively build the common subgraph over `lap` graphs
    k = 1
    while k < lap:
        g2 = mcs(g2, util.get_nw(nw_list[ii - k]))
        k += 1

    while ii > 0:
        jj = ii
        ii -= lap
        # print(nw_list[ii])

        g1 = util.get_nw(nw_list[ii])
        # Iteratively build the common subgraph over `lap` graphs
        k = 1
        while k < lap:
            g1 = mcs(g1, util.get_nw(nw_list[ii - k]))
            k += 1

        # Build the connected common subgraph
        g1 = mcs(g2, g1)

        # Build the output file name
        filename = nw_list[ii][0:-4] + '-' + nw_list[jj][0:-4] + '.pkl'

        # 2016.9.20: save the result, for testing
        util.save_nw(
            g1, 'D://semantic analysis//nalyze_data//result//过程结果//连通子图//' +
            filename)

        if is_front:
            # Compute the ratio: (1, 2) compared against 1
            pr = mcs_ratio(g1, g1, key_word)
            record_list.append(nw_list[jj][0:-4] + '\t' + str(pr))
        else:
            # Compute the ratio: (1, 2) compared against 2
            pr = mcs_ratio(g1, g2, key_word)
            record_list.append(nw_list[jj][0:-4] + '\t' + str(pr))

        num_list.append(nw_list[jj][0:-4] + '\t' + str(g1.number_of_nodes()))
        enum_list.append(nw_list[jj][0:-4] + '\t' + str(g1.number_of_edges()))

        # Count the nodes
        # with open(mcs_dir + filename[0:-4]+'.txt','w',encoding='utf-8') as file:
        #     for node in g1.nodes():
        #         file.write(node+'\n')
        # util.save_nw(g1,mcs_dir + filename)
        g2 = g1
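The mcs() function that Examples #7, #17 and #23 build on is not defined in these snippets. A minimal sketch, assuming it returns the subgraph of the first graph induced by the nodes shared with the second, reduced to its largest connected component (which would match the "connected common subgraph" comments above):

import networkx as nx

def mcs(g1, g2):
    # Assumed behaviour: induced subgraph of g1 on the shared nodes,
    # restricted to its largest connected component.
    common = set(g1.nodes()) & set(g2.nodes())
    sub = g1.subgraph(common).copy()
    if sub.number_of_nodes() == 0:
        return sub
    largest = max(nx.connected_components(sub), key=len)
    return sub.subgraph(largest).copy()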
Example #8
def create(txt_dirs, xls_name):
    data_dict = {}
    for txt_dir in txt_dirs:
        os.chdir(txt_dir)
        file_list = util.get_file_list(txt_dir, 'txt')
        for file in file_list:
            key = file[0:-4]
            data_list = util.get_list_from_file(file)
            x_list = []
            y_list = []
            for data in data_list:
                item = data.split('\t')

                x_list.append(datetime.datetime.strptime(item[0], '%Y-%m-%d'))
                y_list.append(float(item[1]))
            if key in data_dict.keys():
                data_dict[key] = data_dict[key] + [x_list]
                data_dict[key] = data_dict[key] + [y_list]
            else:
                data_dict[key] = [x_list, y_list]

    bl_list = []
    ii = Num - 1
    while ii < Num:
        ii += 1
        bl_list.append(str(ii))

    # util.create_xlsx(data_dict, bl_list, txt_dirs[0] + xls_name + '.xlsx')
    util.create_xlsx(data_dict, bl_list, xls_name)
Example #9
def loop_compare(com_function,
                 keyword_list,
                 pkl_dir1,
                 result_dir,
                 mode=1,
                 lap=1,
                 type="pkl"):
    for key in keyword_list:
        global keyword
        keyword = key
        print(key)
        if mode == 0:
            util.create_directory(result_dir + key + "//")
        pkl_dir = pkl_dir1.format(key)
        f_list = util.get_file_list(pkl_dir, '.pkl')
        os.chdir(pkl_dir)
        result_list = []
        # Sort in ascending order
        nw_list = sorted(f_list)
        ii = len(nw_list) - 1

        while ii - 2 * lap >= 0:
            g2 = util.get_nw(nw_list[ii])
            # Iteratively build the subgraph
            # k = 1
            # while k < lap:
            #     g2 = nx.compose(g2, util.get_nw(nw_list[ii - k]))
            #     k += 1

            ii -= lap
            g1 = util.get_nw(nw_list[ii])
            # Iteratively build the subgraph
            # k = 1
            # while k < lap:
            #     g1 = nx.compose(g1, util.get_nw(nw_list[ii - k]))
            #     k += 1

            # Build the connected common subgraph
            # Mutual ratio
            if mode == 1:
                r1, r2 = com_function(copy.deepcopy(g1), copy.deepcopy(g2))
                result_list.append(nw_list[ii + lap][0:-4] + "\t" + str(r1))
                result_list.append((nw_list[ii][0:-4] + "\t" + str(r2)))
            # One-to-one
            elif mode == 0:
                result_list = com_function(copy.deepcopy(g1),
                                           copy.deepcopy(g2))
                util.save_file(
                    result_dir + key + "//" + nw_list[ii + lap][0:-4] + ".txt",
                    result_list)
            # n-to-one
            elif mode == 2:
                r1 = com_function(copy.deepcopy(g1), copy.deepcopy(g2), type)
                result_list.append(nw_list[ii + lap][0:-4] + "\t" + str(r1))

            ii -= lap
        if mode != 0:
            result_list.reverse()
            util.save_file(result_dir + key + ".txt", result_list)
Example #10
def count_num_of_node(pkl_dir):
    pkl_file_list = util.get_file_list(pkl_dir, '.pkl')
    r_list = []
    for file in pkl_file_list:
        g = util.get_nw(file)
        s = file[0:10] + '\t' + str(g.number_of_nodes())
        r_list.append(s)
    return r_list
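Nearly every example relies on util.get_nw() and util.save_nw(), which are not shown. A minimal sketch, assuming they simply pickle and unpickle an object (a networkx graph or a set) to and from disk:

import pickle

def get_nw(path):
    # Load a pickled graph/set from disk.
    with open(path, 'rb') as f:
        return pickle.load(f)

def save_nw(obj, path):
    # Pickle a graph/set to disk.
    with open(path, 'wb') as f:
        pickle.dump(obj, f)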
Example #11
def main2():
    fir = "D:\semantic analysis\整理文本\正能量//"
    xml_list = util.get_file_list(fir, "txt")
    for xml in xml_list:
        w_list = set(util.get_list_from_file(fir + xml))
        r_list = remark(w_list, (['正能量']), None)
        html_name = xml[:-4] + '.html'
        util.save_file(fir + html_name, r_list)
Example #12
def loop_compare(com_function,
                 keyword_list,
                 pkl_dir1,
                 result_dir,
                 mode=1,
                 lap=1,
                 type="pkl"):
    for key in keyword_list:
        print(key)
        if mode == 0:
            util.create_directory(result_dir + key + "//")
        pkl_dir = pkl_dir1.format(key)
        f_list = util.get_file_list(pkl_dir, '.txt')
        os.chdir(pkl_dir)
        result_list = []
        # Sort in ascending order
        nw_list = sorted(f_list)
        ii = len(nw_list) - 1

        while ii - 2 * lap >= 0:
            g2 = util.txt2dict(util.get_list_from_file(nw_list[ii]))
            # Iteratively build the subgraph
            k = 1
            while k < lap:
                g2 = nx.compose(g2, util.get_nw(nw_list[ii - k]))
                k += 1

            ii -= lap
            g1 = util.txt2dict(util.get_list_from_file(nw_list[ii]))
            d1 = util.get_nw(
                "D:\semantic analysis\新结果\去虚词去单字共现网络//{0}//p//".format(key) +
                nw_list[ii].split(".")[0] + ".pkl")
            # Iteratively build the subgraph
            k = 1
            while k < lap:
                g1 = nx.compose(g1, util.get_nw(nw_list[ii - k]))
                k += 1

            # Build the connected common subgraph
            if mode == 1:
                r1, r2 = com_function(copy.deepcopy(g1), copy.deepcopy(g2))
                result_list.append(nw_list[ii + lap][0:-4] + "\t" + str(r1))
                result_list.append((nw_list[ii][0:-4] + "\t" + str(r2)))
            elif mode == 0:
                result_list = com_function(copy.deepcopy(g1),
                                           copy.deepcopy(g2))
                util.save_file(
                    result_dir + key + "//" + nw_list[ii + lap][0:-4] + ".txt",
                    result_list)
            elif mode == 2:
                r1 = com_function(copy.deepcopy(g1), copy.deepcopy(g2), d1)
                # result_list.append(str(r1))
                result_list.append(nw_list[ii + lap][0:-4] + "\t" + str(r1))

            ii -= lap
        if mode != 0:
            result_list.reverse()
            util.save_file(result_dir + key + ".txt", result_list)
Example #13
def main():
    # Directory where the results are saved
    result_dir = r'D:\semantic analysis\新结果\共现网络//'
    txt_dir = r"D:\semantic analysis\新纯文本\1常用词//"
    # k_list = util.get_key_list()
    # k_list = ['不约而同', '喜闻乐见', '努力', '感觉', '简单', '无聊', '希望', '美好']
    # Center words (keywords)
    k_list = ['美好']
    # Path to the jieba segmentation dictionary
    # jieba.set_dictionary("D:\semantic analysis\分词\词库\导出结果\dict1.txt")
    # jieba.initialize()
    pynlpir.open()
    for key in k_list:
        print(key)
        pynlpir.nlpir.AddUserWord(c_char_p(key.encode()))

    for key in k_list:
        print(key)
        # File list
        file_list = util.get_file_list(txt_dir + key, ".txt")
        # Create the output directories
        mk_dir(result_dir + key)
        # mk_dir(result_dir+key+'//w')
        mk_dir(result_dir + key + '//p')

        for n_file in file_list:
            s_list = util.get_list_from_file(txt_dir + key + "//" + n_file)
            # Filter out duplicate sentences to avoid redundant computation
            print(len(s_list))
            s_list = list(set(s_list))
            print(len(s_list))

            # Build the network for all sentences
            # ps_list, mn, pps_list,pmn = create_matrix(s_list,key)
            pps_list, pmn = create_matrix(s_list, key)

            pkl_name = n_file[:-4] + '.pkl'

            # for w_list in ps_list:
            #     # 创建整句话的网络
            #     mn.add_edges(w_list)
            # util.save_nw(mn.get_network(), result_dir+key+'//w//' + pkl_name)

            for w_list in pps_list:
                # pmn.add_edges(w_list)
                pmn.add_gram_edges(w_list)
            util.save_nw(pmn.get_network(),
                         result_dir + key + '//p//' + pkl_name)

            print(n_file)
            print(
                time.strftime('%Y-%m-%d %H:%M:%S',
                              time.localtime(time.time())))

            with open(result_dir + key + '//record.txt', 'a',
                      encoding='utf-8') as rf:
                rf.write(n_file + '\n')
    pynlpir.close()
Example #14
def individual_word(fre_path):
    plots_list = []
    file_list = util.get_file_list(fre_path, ".txt")
    for file in file_list:
        rl = util.get_list_from_dicttxt(fre_path + file)
        max_num = len(rl)
        x = list(range(max_num))
        y = rl
        plots_list.append((x, y))
    return file_list, plots_list
Example #15
def cal_mcs(pkl_dir):
    os.chdir(pkl_dir)
    pkl_list = util.get_file_list(pkl_dir, '.pkl')
    # Sort in ascending order
    pkl_list = sorted(pkl_list)

    m_set = util.get_nw(r"D:\semantic analysis\2016-10-03结果\达人//上升集合1.pkl")
    for file in pkl_list:
        set1 = util.get_nw(file)
        r_set = m_set & set1
        print(len(r_set) / len(m_set))
Example #16
def main3():
    # keyword_list = ['扯淡', '腹黑', '接地气', '闷骚', '完爆', '正能量', '达人']
    keyword_list = ['淡定', '纠结', '山寨', '吐槽', '自拍']
    for key in keyword_list:
        file_list = util.get_file_list(r"D:\semantic analysis\纯文本\新词//" + key, ".txt")
        date_list = []
        for file_name in file_list:
            date_list.append(file_name[:-4])
        urlList, dateList = create_inital_address3(key, date_list)
        with open("url_list.txt", "a") as f:
            for url in urlList:
                f.write(url + "\n")
Example #17
def main1(keyword):
    dirr = 'D:\semantic analysis\pNet1\\' + keyword + '//p//'
    r_dir = 'D:\semantic analysis//3次采集结果\连续比例4//'
    pkl_list = util.get_file_list(dirr, '.pkl')
    pkl_list = sorted(pkl_list)
    for pkl in pkl_list:
        print(pkl)
    ll = len(pkl_list) - 1
    ii = ll
    g = util.get_nw(dirr + '\\' + pkl_list[ii])
    r_list = []
    n_list = []

    # Build the common subgraph of the last five graphs
    while ii >= ll - 3:
        ii -= 1
        g2 = util.get_nw(dirr + '\\' + pkl_list[ii])
        g = mcs(g2, g)
        print(pkl_list[ii] + '\t' + str(g.number_of_nodes()))

    ii = len(pkl_list) - 1
    while ii > 0:
        ii -= 1
        g2 = util.get_nw(dirr + '\\' + pkl_list[ii])
        rr, nn = mcs_ratio_advanced(g2, g, keyword)
        r_list.append(pkl_list[ii][0:-4] + '\t' + str(rr))
        n_list.append(pkl_list[ii][0:-4] + '\t' + str(nn))
    util.save_file(r_dir + keyword + '.txt', r_list)
    util.save_file(r_dir + 'n' + keyword + '.txt', n_list)


# for key in key_list:
#     print(key)
#     main1(key)

# g2 = util.get_nw('D:\semantic analysis\c_date\给力\p//2011-03-31.pkl')
# g1 = util.get_nw('D:\semantic analysis\c_date\给力\p//2011-03-30.pkl')
# g1 = util.get_nw('D:\semantic analysis\c_date\给力\p//2011-03-29.pkl')
# g4 = util.get_nw('D:\semantic analysis\c_date\给力\p//2011-03-28.pkl')
#
#
# g5 = mcs(g2, g1)
# g6 = mcs(g1, g5)
# g7 = mcs(g4, g6)
# r = g1.number_of_nodes() / g2.number_of_nodes()
# print("的节点数:"+str(g2.number_of_nodes()))
# print("的节点数:"+str(g1.number_of_nodes()))
# print("两者公共子图的节点数:"+str(g5.number_of_nodes()))
# print("两者公共子图的节点数:"+str(g6.number_of_nodes()))
# print("两者公共子图的节点数:"+str(g7.number_of_nodes()))

# print("比值1:" + str(g1.number_of_nodes() / g2.number_of_nodes()))
# print("比值2:" + str(g1.number_of_nodes() / g1.number_of_nodes()))
Example #18
def individual_word_xy(fre_path):
    plots_list = []
    file_list = util.get_file_list(fre_path, ".txt")
    # Drop the first one

    for file in file_list:
        xy_list = util.get_list_from_file(fre_path + file)
        del xy_list[-1]
        x = []
        y = []
        for xy in xy_list:
            x.append(xy.split("\t")[0])
            y.append(xy.split("\t")[1])
        plots_list.append((x, y))
    return file_list, plots_list
Example #19
def accumlate_ratio(fre_path):
    file_name_list = []
    plots_list = []
    file_list = util.get_file_list(fre_path, ".txt")
    file_name_list.extend(file_list)
    for file in file_list:
        print(file)
        r = util.get_list_from_file(fre_path + file)
        rl = []
        for ii in r:
            rl.append(float(ii))
            # rl.append(float(ii.split("\t")[1]))
        max_num = len(rl)
        x = list(range(max_num))
        y = rl
        # print(sum(y[0:100]))
        plots_list.append((x, y))
    return file_name_list, plots_list
Example #20
def cal_index(date_list, pkl_dir):
    r_list = []
    f_list = util.get_file_list(pkl_dir, '.pkl')
    os.chdir(pkl_dir)
    # Sort in ascending order
    set_list = sorted(f_list)

    date2 = set_list[0][0:-4]
    i = 0
    for date in date_list:
        date1 = date
        # Find the index of the split point
        while i < len(set_list) and util.compare_date(date1, date2) > 0.0:
            date2 = set_list[i][0:-4]
            i += 1
        r_list.append(i)
    r_list.append(len(f_list))
    return r_list
Example #21
def loop_compare(com_function,
                 keyword_list,
                 pkl_dir1,
                 result_dir,
                 mode=1,
                 lap=1,
                 type="pkl"):
    for keyword in keyword_list:
        pkl_dir = pkl_dir1.format(keyword)
        f_list = util.get_file_list(pkl_dir, '.pkl')
        os.chdir(pkl_dir)
        # Sort in ascending order
        nw_list = sorted(f_list)

        record_list = []
        ii = len(nw_list) - 1
        # g2 is graph 2 and g1 is graph 1; start from the most recent network
        g2 = util.get_nw(nw_list[ii])

        # Iteratively compose the previous `lap` graphs
        k = 1
        while k < lap:
            g2 = nx.compose(g2, util.get_nw(nw_list[ii - k]))
            k += 1

        while ii - lap >= 0:
            jj = ii
            ii -= lap
            # print(nw_list[ii])

            g1 = util.get_nw(nw_list[ii])
            # Iteratively compose the previous `lap` graphs
            k = 1
            while k < lap:
                g1 = nx.compose(g1, util.get_nw(nw_list[ii - k]))
                k += 1

            # Compute the ratio
            r1 = com_function(g1, g2)
            record_list.append(nw_list[jj][0:-4] + '\t' + str(r1))

            g2 = g1
        record_list.reverse()
        util.save_file(result_dir + keyword + ".txt", record_list)
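A hypothetical call of the Example #21 variant of loop_compare(). The node_ratio function and both directory templates are placeholders; the only assumption taken from the snippet itself is that com_function receives the older graph first and the newer graph second.

def node_ratio(g_old, g_new):
    # Share of the newer network's nodes that already existed in the older one.
    return len(set(g_old.nodes()) & set(g_new.nodes())) / g_new.number_of_nodes()

loop_compare(node_ratio,
             ['正能量', '给力'],
             r'D:\path\to\networks\{0}\p//',   # "{0}" is replaced by the keyword
             r'D:\path\to\ratios//',
             lap=1)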
Example #22
def main():
    k_list = util.get_key_list()
    jieba.set_dictionary("D:\semantic analysis\分词\词库\导出结果\dict1.txt")
    jieba.initialize()
    for key in k_list:
        print(key)
        file_list = util.get_file_list(
            'D://semantic analysis//analyze_data//fc//' + key, ".txt")
        # Create the output directories
        mk_dir('./w')
        mk_dir('./p')

        for n_file in file_list:
            s_list = util.get_list_from_file(n_file)
            # Filter out duplicate sentences to avoid redundant computation
            print(len(s_list))
            s_list = list(set(s_list))
            print(len(s_list))
            wg = nx.Graph()
            pg = nx.Graph()

            for sentence in s_list:
                # Build the network for the whole sentence
                ll = util.input_filer(sentence)
                wg = add_s2g(wg, ' '.join(ll))

                # Only build the network for the clauses containing the keyword
                for ss in ll:
                    if (key in ss):
                        pg = add_s2g(pg, ss)

            pkl_name = n_file[:-4] + '.pkl'
            util.save_nw(pg, './/p//' + pkl_name)
            util.save_nw(wg, './/w//' + pkl_name)

            print(n_file)
            print(
                time.strftime('%Y-%m-%d %H:%M:%S',
                              time.localtime(time.time())))

            with open('record.txt', 'a', encoding='utf-8') as rf:
                rf.write(n_file + '\n')
Example #23
def cal_node_mcs(pkl_dir, mcs_dir, key_word, lap=2):
    f_list = util.get_file_list(pkl_dir, '.pkl')
    os.chdir(pkl_dir)
    # Sort in ascending order
    nw_list = sorted(f_list)
    record_list = []
    num_list = []
    enum_list = []
    ii = len(nw_list) - 1

    while (ii - lap + 1) >= 0:
        # print(nw_list[ii])
        g1 = util.get_nw(nw_list[ii])
        # Iteratively build the common subgraph over `lap` graphs
        k = 1
        while k < lap:
            g1 = mcs(g1, util.get_nw(nw_list[ii - k]))
            k += 1

        # Build the output file name
        filename = nw_list[ii][0:-4] + '.pkl'

        # Save the result
        pkl_dir = r"D:\semantic analysis\公共子图节点数\新词\30公共子图//" + key_word + "//"
        util.create_directory(pkl_dir)
        util.save_nw(g1, pkl_dir + filename)

        num_list.append(nw_list[ii][0:-4] + '\t' + str(g1.number_of_nodes()))
        enum_list.append(nw_list[ii][0:-4] + '\t' + str(g1.number_of_edges()))

        # Count the nodes
        # with open(mcs_dir + filename[0:-4]+'.txt','w',encoding='utf-8') as file:
        #     for node in g1.nodes():
        #         file.write(node+'\n')
        # util.save_nw(g1,mcs_dir + filename)

        ii -= lap

    # util.save_file(mcs_dir + key_word+'mcs.txt', record_list)
    util.save_file(mcs_dir + 'n' + key_word + 'mcs.txt', num_list)
    util.save_file(mcs_dir + 'e' + key_word + 'mcs.txt', enum_list)
Example #24
def loop_key2(pkl_dir, result_dir, key_word, lap=1):
    pkl_dir = pkl_dir.format(key_word)
    f_list = util.get_file_list(pkl_dir, '.pkl')
    os.chdir(pkl_dir)
    # Sort in ascending order
    nw_list = sorted(f_list)
    ii = 0
    # g2 is graph 2 and g1 is graph 1; take the network at the far end here
    g1 = util.get_nw(nw_list[ii])
    util.create_directory(result_dir + key_word)

    while ii < len(nw_list) - lap:
        ii += lap
        g2 = util.get_nw(nw_list[ii])

        # Build the output file name
        filename = nw_list[ii][0:-4] + '.txt'

        result_list = extract_new_nodes_attributes(g1, g2)
        util.save_file(result_dir + key_word + "//" + filename, result_list)
        g1 = nx.compose(g1, g2)
Example #25
def main():
    # Directory where the results are saved
    result_dir = r'D:\semantic analysis\新纯文本\1新词分句//'
    txt_dir = r"D:\semantic analysis\新纯文本\1新词//"

    k_list = util.get_key_list()

    for key in k_list:
        print(key)
        # File list
        file_list = util.get_file_list(txt_dir + key, ".txt")

        # Create the output directories
        # mk_dir(result_dir+"新词整句//"+key)
        mk_dir(result_dir + key)

        for file in file_list:
            s_list = util.get_list_from_file(txt_dir + key + "//" + file)
            # Filter out duplicate sentences to avoid redundant computation
            # s_list = list(set(s_list))
            w_list, p_list = extract_sentence(s_list, key)
            util.save_file(result_dir + key + "//" + file, p_list, True)
Example #26
def cal_mcs(pkl_dir):
    os.chdir(pkl_dir)
    pkl_list = util.get_file_list(pkl_dir, '.pkl')
    # Sort in ascending order
    pkl_list = sorted(pkl_list)

    r_set = set()
    r_set1 = set()
    i = 0
    while i < 40:
        set1 = util.get_nw(pkl_list[i])
        set2 = util.get_nw(pkl_list[len(pkl_list) - 1 - i])
        r_set = set1 | r_set
        r_set1 = set2 | r_set1
        i += 1

    print(len(r_set))
    print(len(r_set1))

    r_set2 = r_set & r_set1
    print(len(r_set2))
    print(len(r_set2) / len(r_set))
Example #27
def loop_key(pkl_dir, result_dir, key_word, lap=1):
    pkl_dir = pkl_dir.format(key_word)
    f_list = util.get_file_list(pkl_dir, '.pkl')
    os.chdir(pkl_dir)
    # Sort in ascending order
    nw_list = sorted(f_list)
    ii = len(nw_list) - 1
    # g2 is graph 2 and g1 is graph 1; start from the most recent network
    g2 = util.get_nw(nw_list[ii])
    util.create_directory(result_dir + key_word)

    while ii > 0:
        jj = ii
        ii -= lap
        # print(nw_list[ii])
        g1 = util.get_nw(nw_list[ii])

        # Build the output file name
        filename = nw_list[ii][0:-4] + '-' + nw_list[jj][0:-4] + '.txt'

        result_list = cal_connect_real_probability(g1, g2, key_word)
        util.save_file(result_dir + key_word + "//" + filename, result_list)

        g2 = g1
Example #28
import re
import tool.util as util

key_list = util.get_key_list()

dict_dir = r"D:\semantic analysis\2016-10-09结果\词频1//"
for key in key_list:
    print(key)
    set_dict, file_name = util.get_objdict_list(dict_dir + key, ".txt")
    date_list = util.get_file_list(dict_dir + key, ".txt")
    pattern = re.compile(r"(\d*-\d*)-\d*")
    month_array = pattern.findall(" ".join(date_list))
    month_array = ["2010"]

    util.create_directory(r"D:\semantic analysis\2016-10-12结果\2010年频数统计//" +
                          key)

    # Loop over the months
    for month in month_array:
        pattern = re.compile(r"(" + month + "-\d*-\d*)")
        date_array = pattern.findall(" ".join(date_list))
        print(date_array)
        # Merge the daily frequency dictionaries for this month
        r_dict = dict()
        for file_date in date_array:
            r_dict = util.union_dicts(set_dict[file_date + ".txt"], r_dict)
        util.save_dict_list(
            r_dict,
            r"D:\semantic analysis\2016-10-12结果\2010年频数统计//" + key + ".txt")
Example #29
import tool.util as util
# Count how many times every word occurs across all files
key_list = util.get_key_list() + util.get_key_list2()
txt_dir = r"D:\semantic analysis\2016-10-09结果\词频1//"
r_dir = r"D:\semantic analysis\2016-10-09结果\总数1//"
for key in key_list:
    print(key)
    file_dir = txt_dir + key
    dict_list = []
    r_list = []
    file_list = util.get_file_list(file_dir, ".txt")
    for file_name in file_list:
        word_list = sorted(
            util.get_list_from_file(txt_dir + key + "//" + file_name))
        dict_list.append(util.txt2dict(word_list))

    r_dict = dict_list[0]
    for word_dict in dict_list:
        r_dict = util.union_dicts(r_dict, word_dict)

    # Sort the keys
    kk = util.sort_by_value(r_dict)
    w_list = util.create_dict_list(kk, r_dict)

    # Save the result file
    util.save_file(r"D:\semantic analysis\2016-10-09结果\总数1//" + key + ".txt",
                   w_list, False)
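Examples #28 and #29 assume a util.union_dicts() that merges two word-frequency dictionaries. A minimal sketch under that assumption (counts for the same word are summed):

def union_dicts(d1, d2):
    # Merge two {word: count} dictionaries by summing counts.
    merged = dict(d2)
    for word, count in d1.items():
        merged[word] = merged.get(word, 0) + count
    return merged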
Example #30
import networkx as nx
import tool.util as util
import os

dir = "D:\semantic analysis\新结果\去虚词去单字共现网络\纠结\p//"
os.chdir(dir)
file_list = util.get_file_list(dir, "pkl")
s1 = set()
l1 = list()

sg = util.get_nw(file_list[0])
for file in file_list[0:20]:
    g = util.get_nw(file)
    sg = nx.compose(g, sg)

gs = set(sg.nodes())
print("gs ", end=" ")
print(len(gs))
for file in file_list[0:20]:
    g = util.get_nw(file)
    s1 = s1 | set(nx.k_core(g).nodes())
    l1.extend(nx.k_core(g).nodes())

print("s1 " + str(len(s1)))
print("l1 " + str(len(l1)))

s2 = set()
l2 = list()
for file in file_list[-50:]:
    g = util.get_nw(file)
    s2 = s2 | set(nx.k_core(g).nodes())