Example #1
def test_myTFIDF():
    txtdict = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' % 'afterProccess_test')  # 0.6s
    data, txtName, wordName = dict2Array(txtdict, dtype=int)
    tfidf2 = myTFIDF_array(data, itc=True)
    tfidf1 = myTFIDF(txtdict, itc=True)
    # myTFIDF_dict is faster than myTFIDF_array
    dd = dict2Array(tfidf1)[0]
    cc = dd - tfidf2
    fdd = numpy.abs(cc)
    print(sum(sum(fdd)))  # error is on the order of 1e-15*n: precision loss from floating-point arithmetic
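For reference, a minimal self-contained TF-IDF sketch over a documents x terms count matrix. It is plain tf*idf with cosine-normalized rows, not necessarily the weighting myTFIDF applies (the itc flag is project-specific):

import numpy

def tfidf_sketch(counts):
    # counts: documents x terms array of raw term counts
    counts = numpy.asarray(counts, dtype=float)
    tf = counts / numpy.maximum(counts.sum(axis=1, keepdims=True), 1)
    df = (counts > 0).sum(axis=0)  # document frequency per term
    idf = numpy.log(counts.shape[0] / numpy.maximum(df, 1))
    w = tf * idf
    norms = numpy.linalg.norm(w, axis=1, keepdims=True)
    return w / numpy.maximum(norms, 1e-12)  # cosine-normalized rows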
Example #2
def test_myTC():
    txt_dict = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' %
                            'afterProccess')  # 0.6s
    tfidf_dict = myTFIDF(txt_dict, itc=True)
    tfidf_array, txtName, wordName = dict2Array(tfidf_dict)
    tc_dict = myTC_dict(tfidf_dict, rtype=list)
    tc_array = myTC_array(tfidf_array)
    # myTC_array is faster than myTC_dict
    # print(sum(tc_dict - tc_array))
    # the functions in dataInfo can be used to inspect the distribution of the TC values
    print(fiveNumber(tc_array))
    showDistplot(tc_array)
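Assuming myTC computes term contribution in the usual sense, TC(w) = sum over document pairs i != j of tfidf(w, d_i) * tfidf(w, d_j), a vectorized sketch using the identity sum_{i!=j} x_i*x_j = (sum x_i)^2 - sum x_i^2:

import numpy

def term_contribution_sketch(tfidf_array):
    # tfidf_array: documents x terms TF-IDF matrix; returns one weight per term
    tfidf_array = numpy.asarray(tfidf_array, dtype=float)
    col_sum = tfidf_array.sum(axis=0)
    return col_sum ** 2 - (tfidf_array ** 2).sum(axis=0)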
Example #3
def test_pca_sklearn():
    txt_dict = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' %
                            'afterProccess_test')  # 0.6s
    txt_array, txtName, wordName = dict2Array(txt_dict, dtype=int)
    testdata = txt_array[:, 1:500]

    # newData_dict = doTC_dict(txt_dict, topN=1000)
    # testdata, txtName, wordName = dict2Array(newData_dict, dtype=int)
    dataMat = numpy.mat(testdata)

    print(dataMat.shape)
    topNfeat = 50
    lowDDataMat, redEigVects = myPCA_R(dataMat, topN=topNfeat)
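A minimal PCA sketch via covariance eigendecomposition, returning the reduced data and the retained eigenvectors in the same order myPCA_R appears to; the project function may differ in details (sign conventions, SVD vs. eigendecomposition):

import numpy

def pca_sketch(dataMat, topN):
    centered = numpy.asarray(dataMat, dtype=float)
    centered = centered - centered.mean(axis=0)
    eigVals, eigVects = numpy.linalg.eigh(numpy.cov(centered, rowvar=False))
    order = numpy.argsort(eigVals)[::-1][:topN]  # topN largest eigenvalues
    redEigVects = eigVects[:, order]
    return centered @ redEigVects, redEigVects  # (lowDDataMat, redEigVects)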
Example #4
def test_doTC():
    txt_dict = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' %
                            'afterProccess')
    minTC, topN = 0, 10000
    tcData_array, tcWordName = doTC_array(txt_dict, minTC, topN)
    tcData_dict = doTC_dict(txt_dict, minTC, topN)
    tcData_dict_array, txtName, tcWordName_dict_array = dict2Array(tcData_dict,
                                                                   dtype=int)
    # doTC_dict is faster than doTC_array
    for j in range(len(tcWordName)):
        if tcWordName[j] != tcWordName_dict_array[j]:
            print("%d: tcWordName[j] != tcWordName_dict_array[j]" % j)
            break
    print(sum(sum(tcData_dict_array - tcData_array)))
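A sketch of what the TC-based selection presumably does, assuming minTC is a lower bound on the term weight and topN caps the number of kept terms; preserving the original column order matches the order check above but is still an assumption:

import numpy

def do_tc_sketch(data, wordName, tc, minTC, topN):
    # data: documents x terms matrix; tc: one TC weight per term
    data, tc = numpy.asarray(data), numpy.asarray(tc, dtype=float)
    idx = numpy.argsort(tc)[::-1][:topN]  # topN terms by TC, descending
    idx = numpy.sort(idx[tc[idx] > minTC])  # apply threshold, restore column order
    return data[:, idx], [wordName[i] for i in idx]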
Example #5
def test_pca():
    txt_dict = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' %
                            'afterProccess_test')  # 0.6s
    txt_array, txtName, wordName = dict2Array(txt_dict, dtype=int)
    dataMat = numpy.mat(txt_array[:, :1000], dtype=numpy.float64)
    topN = 100
    newData, U, rdata = myPCA(dataMat, topN=topN, onlyNewData=False)
    newData2, U2, rdata2 = myPCA_R(dataMat, topN=topN, onlyNewData=False)
    newData3, U3, rdata3 = pca_sklearn(dataMat, topN=topN, onlyNewData=False)
    # showDiff(newData, newData2)
    # dd = getDiff(dataMat, newData, U)
    # rdata = newData * U.T + numpy.mean(dataMat, axis=0)
    # print(dd,rdata.max())
    print(getDiff(dataMat, rdata))
    print(getDiff(dataMat, rdata2))
    print(getDiff(dataMat, rdata3))
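getDiff is a project helper; a hypothetical stand-in measuring total absolute reconstruction error (the commented-out line above suggests the reconstruction itself is newData * U.T plus the column means of the original matrix):

import numpy

def get_diff_sketch(original, reconstructed):
    # hypothetical stand-in for getDiff: total absolute elementwise difference
    return numpy.abs(numpy.asarray(original) - numpy.asarray(reconstructed)).sum()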
Example #6
def test_selectData():
    txt_dict = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' %
                            'afterProccess')  # 0.6s
    txt_array, txtName, wordName = dict2Array(txt_dict)

    newWid = random.sample(range(0, len(wordName)), 300)
    newWid.sort()
    newWordname = [wordName[i] for i in newWid]
    sfdata = selectData_dict(txt_dict, newWordname)
    sfdata2 = selectData_array(txt_array, newWordname, oldWordName=wordName)
    sfdata22 = selectData_array(txt_array,
                                newWordname,
                                oldWordName=wordName,
                                orderchange=False)
    # set orderchange=True when the order of newWordname has changed
    # selectData_dict is faster than selectData_array
    sfdata1 = dict2Array(sfdata)[0]
    print(sum(sum(sfdata1 - sfdata2)))
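A sketch of the array-side selection, assuming word names are unique: map each name in newWordname to its column in the original matrix and keep those columns:

import numpy

def select_data_sketch(data, newWordname, oldWordName):
    col = {w: j for j, w in enumerate(oldWordName)}  # name -> column index
    idx = [col[w] for w in newWordname]
    return numpy.asarray(data)[:, idx]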
Example #7
def test_selectFeature():
    txt_dict = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' %
                            'afterProccess_test')
    tfidf_dict = myTFIDF(txt_dict, itc=True)
    tfidf_array, txtName, wordName = dict2Array(tfidf_dict)
    tc_array = myTC_array(tfidf_array)
    minTC, topN = 0, 100
    tc = tc_array
    wordAndIdx = list(zip(wordName, tc))
    wordAndIdx.sort(key=lambda x: x[1], reverse=True)  # sort by TC, descending
    newWordName = [wordAndIdx[i][0] for i in range(topN)]
    newWordName.sort()

    idx = tc.argsort()
    idx = idx[:-topN - 1:-1]
    idx.sort()
    newWordName2 = [wordName[i] for i in idx]
    for j in range(len(newWordName2)):
        if newWordName2[j] != newWordName[j]:
            print("%d: newWordName2[j] != newWordName[j]" % j)
            break
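The slice idx[:-topN - 1:-1] walks the argsort result backwards from the end, yielding the indices of the topN largest values in descending order. A tiny demonstration:

import numpy

tc = numpy.array([0.2, 0.9, 0.1, 0.7])
topN = 2
print(tc.argsort()[:-topN - 1:-1])  # [1 3]: indices of the two largest values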
Example #8
def do_treecluster_images():
    """特征维度对各层次聚类的影响"""
    outDir = '/Users/brobear/PycharmProjects/TextClusteringAnalysis/txt2'
    txt_dict = getWordCount(outDir)

    xx = range(100, 1000, 100)
    xx = [300, 600]
    for topN in xx:
        data, textNames = TC(txt_dict, topN=topN)[:2]
        # # no dimensionality reduction
        # tfidf_dict = myTFIDF(txt_dict, itc=False)
        # data, textNames, wordName = dict2Array(tfidf_dict)

        # method  's': single linkage (min distance)  'm': complete linkage (max distance)  'c': centroid  'a': group average
        # dist  'e': Euclidean distance  'u': cosine distance
        tree = treecluster(data=data, method='m', dist='e')
        # tree2 = treecluster(data=data, method='s', dist='e')
        # tree3 = treecluster(data=data, method='a', dist='e')
        # tree4 = treecluster(data=data, method='c', dist='e')
        args = range(2, 50)
        # args = list(range(2, 15, 3)) + [21, 27, 30, 40, 50, 60, 70, 80, 100, 150, 250]
        d = [[], [], [], [], []]  # silhouette scores
        ksize = [[], [], [], [], []]  # size of the largest cluster
        for k in args:
            clusterid = tree.cut(nclusters=k)
            d[0].append(silhouette_score(data, clusterid, metric='euclidean'))
            ksize[0].append(max(size_of_cluster(clusterid)))
            clustering = AgglomerativeClustering(linkage='ward', n_clusters=k)  # ['ward','complete','average']
            clustering.fit(data)
            d[1].append(silhouette_score(data, clustering.labels_, metric='euclidean'))
            ksize[1].append(max(size_of_cluster(clustering.labels_)))
            # clusterid2 = tree2.cut(nclusters=k)
            # d[2].append(silhouette_score(data, clusterid2, metric='euclidean'))
            # ksize[2].append(max(size_of_cluster(clusterid2)))
            # clusterid3 = tree3.cut(nclusters=k)
            # d[3].append(silhouette_score(data, clusterid3, metric='euclidean'))
            # ksize[3].append(max(size_of_cluster(clusterid3)))
            # clusterid4 = tree4.cut(nclusters=k)
            # d[4].append(silhouette_score(data, clusterid4, metric='euclidean'))
            # ksize[4].append(max(size_of_cluster(clusterid4)))

            # d[2].append(hierarchical(data, k, 'complete'))#m,e
            # d[3].append(hierarchical(data, k, 'average'))#a,e
        # draw several plots with subplot()
        plt.figure(figsize=(6, 6))
        # create the first figure
        plt.figure(1)
        # split the figure into a 2x1 grid and select the first panel
        ax1 = plt.subplot(211)
        realN = 0
        # plot in the first panel
        for di in d:
            if len(di) > 1:
                plt.plot(args, di, marker='o')
                realN += 1
        # plt.legend(xx)
        plt.legend(range(realN))
        plt.xlabel('k')
        plt.ylabel('silhouette')
        # plt.ylim(-1, 1)

        # select the second panel and plot
        ax2 = plt.subplot(212)
        for di in ksize:
            if len(di) > 1:
                plt.plot(args, di, marker='o')
        plt.legend(range(realN))
        plt.xlabel('k')
        plt.ylabel('MAXcluster')
        # plt.ylim(0, 2000)
        ax1.set_title('feature number=%d by TC' % topN)
        ax2.set_title("max size of clusters")
        plt.savefig('./treecluster_images/feature number=%d by TC 1<k<50' % topN)
        plt.show()
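size_of_cluster is a project helper; a hypothetical stand-in that returns the size of every cluster given a label vector:

from collections import Counter

def size_of_cluster_sketch(labels):
    # hypothetical stand-in: number of members per cluster label
    return list(Counter(labels).values())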
Example #9


if __name__ == '__main__':
    txt_dict = getWordCount(
        '/Users/brobear/PycharmProjects/TextClusteringAnalysis/txt2')
    tfidf_dict = myTFIDF(txt_dict, itc=True)
    tfidf_array, txtName, wordName = dict2Array(tfidf_dict)
    # compute the weight of each word
    tc_array = myTC_array(tfidf_array)
    showDistplot(tc_array)
    tc_array.sort()
    tc_array = tc_array[::-1]
    from matplotlib import pyplot as plt

    plt.plot(range(len(tc_array)), tc_array)
    plt.ylim(0, 200)
    plt.show()
Example #10
                clusterLabel[i] = minIndex
            if minIndex in clusterLabel_map:
                clusterLabel_map[minIndex].append(i)
            else:
                clusterLabel_map[minIndex] = [i]
        # update the centroids
        for i in range(k):
            Cent[i, :] = numpy.mean(data[clusterLabel_map[i], :], axis=0)
    print(iter)
    return clusterLabel


if __name__ == '__main__':

    outDir = '/Users/brobear/PycharmProjects/TextClusteringAnalysis/txt2'
    txt_dict = getWordCount(outDir)
    tfidf_dict = myTFIDF(txt_dict, itc=False)
    data, textNames, wordName = dict2Array(tfidf_dict)
    # dimensionality reduction
    topN = 1200
    data, textNames = PCA(txt_dict, topN=topN, itc=False)[:2]
    # determine the number of features
    for x in [i * 0.1 for i in range(1, 10)]:
        data, textNames = PCA(txt_dict, topN=x, itc=False)[:2]
        print(x, data.shape)
    # results: 0.1 -> 74, 0.2 -> 204, 0.3 -> 357, 0.4 -> 519, 0.5 -> 684, 0.6 -> 851, 0.7 -> 1022, 0.8 -> 1198, 0.9 -> 1387
    # [74, 204, 357, 519, 684, 851, 1022, 1198, 1387]
    #
    #
    # # elbow method for choosing k
    # kList = range(5, 40, 1)
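Example #10 opens mid-function, at the tail of what looks like a hand-written k-means: assign each document to its nearest centroid, then recompute the centroids. A minimal self-contained sketch of the full loop, for orientation only; names follow the excerpt loosely and the initialization strategy is an assumption:

import numpy

def kmeans_sketch(data, k, maxIter=100, seed=0):
    data = numpy.asarray(data, dtype=float)
    rng = numpy.random.default_rng(seed)
    Cent = data[rng.choice(len(data), size=k, replace=False)]
    clusterLabel = None
    for _ in range(maxIter):
        # assign each point to its nearest centroid (Euclidean distance)
        dist = numpy.linalg.norm(data[:, None, :] - Cent[None, :, :], axis=2)
        newLabel = dist.argmin(axis=1)
        if clusterLabel is not None and numpy.array_equal(newLabel, clusterLabel):
            break  # assignments stable: converged
        clusterLabel = newLabel
        for i in range(k):
            members = data[clusterLabel == i]
            if len(members):
                Cent[i, :] = members.mean(axis=0)  # update the centroid
    return clusterLabel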
Example #11
@log("Feature_useTime")
def PCA(txt_dict, topN=None, itc=False):  # 137s
    tfidf_dict = myTFIDF(txt_dict, itc=itc)
    tfidf_array, txtName, wordName = dict2Array(tfidf_dict)
    newData_mat = pca_sklearn(tfidf_array, topN=topN)
    return newData_mat, txtName


@log("Feature_useTime")
def TC_PCA(txt_dict, minTC=0, topN=None, itc=False):  # 45s
    newData_dict = doTC_dict(txt_dict, minTC=minTC)
    tfidf_dict = myTFIDF(newData_dict, itc=itc)
    tfidf_array, txtName, wordName = dict2Array(tfidf_dict)
    newData_mat = pca_sklearn(tfidf_array, topN=topN)
    return newData_mat, txtName


if __name__ == '__main__':
    txt_dict = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' % 'afterProccess')  # 0.6s
    topN = 1800
    # newData_mat, txtName, wordName = TC(txt_dict, topN)
    # newData_mat2, txtName2 = PCA(txt_dict, topN=topN)
    newData_mat3, txtName3 = TC_PCA(txt_dict, minTC=0, topN=topN)
    # numpy.savetxt('data_TC_1800', newData_mat, delimiter=",")
    # numpy.savetxt('data_PCA_1800', newData_mat2, delimiter=",")
    numpy.savetxt('data_TC_PCA_1800', newData_mat3, delimiter=",")
    # TEST
    # txt_dict_test = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s_test' % 'afterProccess')
    # newData_mat3, txtName3, wordName3 = TC(txt_dict_test, topN)
    # numpy.savetxt('data_test', newData_mat3, delimiter=",")
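The @log("Feature_useTime") decorator is project code; a hypothetical sketch consistent with the timing comments (# 137s, # 45s) on the functions it wraps:

import functools
import time

def log(tag):
    # hypothetical sketch: time the wrapped call and print it under the given tag
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            start = time.time()
            result = fn(*args, **kwargs)
            print('%s %s: %.1fs' % (tag, fn.__name__, time.time() - start))
            return result
        return wrapper
    return decorator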