예제 #1
0
def doTC_dict(txt_dict, minTC=0, topN=None):  # fast
    """Reduce feature dimensionality by Term Contribution (TC).

    Builds an itc-weighted TF-IDF matrix, scores every word with TC,
    keeps the words passing the minTC / topN filter, and returns the
    input data compressed onto that reduced vocabulary.
    """
    # itc-weighted TF-IDF matrix over all documents
    weights = myTFIDF(txt_dict, itc=True)
    weight_matrix, _doc_names, vocab = dict2Array(weights)
    # Per-word term-contribution score
    scores = myTC_array(weight_matrix)
    # Words surviving the TC filter
    kept_words = selectFeature(scores, vocab, minTC=minTC, topN=topN)
    # Project the original data onto the reduced vocabulary
    return selectData(txt_dict, kept_words)
예제 #2
0
def test_myTC():
    """Compare myTC_dict against myTC_array and inspect the TC distribution."""
    txt_dict = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' %
                            'afterProccess')  # 0.6s
    tfidf_dict = myTFIDF(txt_dict, itc=True)
    matrix, doc_names, vocab = dict2Array(tfidf_dict)
    tc_by_dict = myTC_dict(tfidf_dict, rtype=list)
    tc_by_array = myTC_array(matrix)
    # myTC_array is faster than myTC_dict
    # print(sum(tc_by_dict - tc_by_array))
    # Helpers from dataInfo can be used to inspect the TC distribution
    print(fiveNumber(tc_by_array))
    showDistplot(tc_by_array)
예제 #3
0
def doTC_array(txt_dict, minTC=0, topN=None):
    """Reduce feature dimensionality by Term Contribution (array variant).

    Returns a tuple (compressed count matrix, selected word list).
    """
    # Raw integer count matrix plus the full vocabulary
    count_matrix, doc_names, vocab = dict2Array(txt_dict, dtype=int)
    # itc-weighted TF-IDF matrix over the same documents
    weight_matrix = dict2Array(myTFIDF(txt_dict, itc=True))[0]
    # Per-word term-contribution score
    scores = myTC_array(weight_matrix)
    # Words surviving the TC filter
    kept_words = selectFeature(scores, vocab, minTC=minTC, topN=topN)
    # Compress the count matrix onto the reduced vocabulary,
    # keeping the original column order
    compressed = selectData(count_matrix, kept_words,
                            oldWordName=vocab,
                            orderchange=False)
    return compressed, kept_words
예제 #4
0
def test_selectFeature():
    """Verify two top-N selection strategies pick the same word set."""
    txt_dict = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' %
                            'afterProccess_test')
    tfidf_dict = myTFIDF(txt_dict, itc=True)
    matrix, doc_names, vocab = dict2Array(tfidf_dict)
    tc = myTC_array(matrix)
    minTC, topN = 0, 100

    # Strategy 1: rank (word, score) pairs by score, keep top-N words
    ranked = sorted(zip(vocab, tc), key=lambda pair: pair[1], reverse=True)
    picked_by_sort = sorted(word for word, _score in ranked[:topN])

    # Strategy 2: argsort, take the N largest indices, restore index order
    order = tc.argsort()[:-topN - 1:-1]
    order.sort()
    picked_by_argsort = [vocab[i] for i in order]

    # Report the first position (if any) where the strategies disagree
    for pos in range(len(picked_by_argsort)):
        if picked_by_argsort[pos] != picked_by_sort[pos]:
            print("%d tcWordName[i]!=tcWordName_dict_array" % pos)
            break
예제 #5
0
    txt_dict = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' %
                            'afterProccess')
    minTC, topN = 0, 10000
    tcData_array, tcWordName = doTC_array(txt_dict, minTC, topN)
    tcData_dict = doTC_dict(txt_dict, minTC, topN)
    tcData_dict_array, txtName, tcWordName_dict_array = dict2Array(tcData_dict,
                                                                   dtype=int)
    # doTC_dict 比 doTC_array 快
    for j in range(len(tcWordName)):
        if tcWordName[j] != tcWordName_dict_array[j]:
            print("%d tcWordName[i]!=tcWordName_dict_array" % j)
            break
    print(sum(sum(tcData_dict_array - tcData_array)))


if __name__ == '__main__':
    # Load word counts and build the itc-weighted TF-IDF matrix
    txt_dict = getWordCount(
        '/Users/brobear/PycharmProjects/TextClusteringAnalysis/txt2')
    tfidf_dict = myTFIDF(txt_dict, itc=True)
    matrix, doc_names, vocab = dict2Array(tfidf_dict)
    # Per-word term-contribution weights
    tc_values = myTC_array(matrix)
    showDistplot(tc_values)
    # Plot TC weights in descending order to eyeball a cut-off threshold
    tc_values.sort()
    tc_values = tc_values[::-1]
    from matplotlib import pyplot as plt

    plt.plot(range(len(tc_values)), tc_values)
    plt.ylim(0, 200)
    plt.show()
예제 #6
0
            if minIndex in clusterLabel_map:
                clusterLabel_map[minIndex].append(i)
            else:
                clusterLabel_map[minIndex] = [i]
        # 更新中心
        for i in range(k):
            Cent[i, :] = numpy.mean(data[clusterLabel_map[i], :], axis=0)
    print(iter)
    return clusterLabel


if __name__ == '__main__':

    outDir = '/Users/brobear/PycharmProjects/TextClusteringAnalysis/txt2'
    txt_dict = getWordCount(outDir)
    tfidf_dict = myTFIDF(txt_dict, itc=False)
    data, textNames, wordName = dict2Array(tfidf_dict)
    # Dimensionality reduction
    topN = 1200
    data, textNames = PCA(txt_dict, topN=topN, itc=False)[:2]
    # Probe how many components each variance ratio keeps
    for ratio in (i * 0.1 for i in range(1, 10)):
        data, textNames = PCA(txt_dict, topN=ratio, itc=False)[:2]
        print(ratio, data.shape)
    # Result: 0.1 74 0.2 204 0.3 357 0.4 519 0.5 684 0.6 851 0.7 1022 0.8 1198 0.9 1387
    # [74, 204, 357, 519, 684, 851, 1022, 1198, 1387]
    #
    #
    # # elbow method for choosing k
    # kList = range(5, 40, 1)
    # d = []
예제 #7
0
def TC_PCA(txt_dict, minTC=0, topN=None, itc=False):  # 45s
    """TC feature selection followed by PCA.

    Returns a tuple (reduced data matrix, document names).
    """
    reduced_dict = doTC_dict(txt_dict, minTC=minTC)
    weights = myTFIDF(reduced_dict, itc=itc)
    matrix, doc_names, _vocab = dict2Array(weights)
    return pca_sklearn(matrix, topN=topN), doc_names
예제 #8
0
def PCA(txt_dict, topN=None, itc=False):  # 137s
    """Project TF-IDF data with PCA.

    Returns a tuple (reduced data matrix, document names).
    """
    matrix, doc_names, _vocab = dict2Array(myTFIDF(txt_dict, itc=itc))
    return pca_sklearn(matrix, topN=topN), doc_names
예제 #9
0
def TC(txt_dict, topN):  # 7.6S
    """TC feature selection, then TF-IDF on the reduced vocabulary.

    Returns a tuple (tf-idf matrix, document names, word names).
    """
    reduced_dict = doTC_dict(txt_dict, topN=topN)
    matrix, doc_names, vocab = dict2Array(myTFIDF(reduced_dict, itc=False))
    return numpy.mat(matrix), doc_names, vocab