Example #1
File: project2.py Project: shacocn/EE219
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt


def plot_measures_with_r_svd(tfidf_transformer, all_target_group):

    hom = []
    com = []
    vmeas = []
    rand = []
    mut = []
    r_arr = [1, 2, 3, 5, 10, 20, 50, 100, 300]
    svd = TruncatedSVD(n_components=1000)
    svd_data = svd.fit_transform(tfidf_transformer)

    for r in r_arr:
        # Cluster on the top-r SVD components into 2 clusters.
        # perform_kmeans and append_measures are project-local helpers
        # (see the sketch after this example).
        km = perform_kmeans(svd_data[:, :r], 2)
        print('confusion matrix for r =', r)
        print(confusion_matrix(all_target_group, km.labels_))
        append_measures(all_target_group, km.labels_, hom, com, vmeas, rand,
                        mut)
    plt.plot(r_arr, hom, 'r', label='homogeneity')
    plt.plot(r_arr, com, 'g', label='completeness')
    plt.plot(r_arr, vmeas, 'b', label='v_measure')
    plt.plot(r_arr, rand, 'y', label='adjusted_rand')
    plt.plot(r_arr, mut, 'k', label='adjusted_mutual_info')
    plt.legend()
    plt.xlabel('principal component r with svd')
    plt.ylabel('measures')
    plt.title('SVD')
    plt.show()
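The helpers `perform_kmeans` and `append_measures` are defined elsewhere in project2.py and are not shown on this page. A minimal sketch of what they plausibly do, inferred only from how they are called above; the names and signatures come from the snippet, but the bodies are assumptions:

from sklearn.cluster import KMeans
from sklearn import metrics


def perform_kmeans(data, n_clusters):
    # Assumed: fit KMeans and return the fitted estimator,
    # since the caller reads km.labels_.
    return KMeans(n_clusters=n_clusters, n_init=30, random_state=42).fit(data)


def append_measures(labels_true, labels_pred, hom, com, vmeas, rand, mut):
    # Assumed: append the five standard clustering measures, matching
    # the legend labels used in the plot above.
    hom.append(metrics.homogeneity_score(labels_true, labels_pred))
    com.append(metrics.completeness_score(labels_true, labels_pred))
    vmeas.append(metrics.v_measure_score(labels_true, labels_pred))
    rand.append(metrics.adjusted_rand_score(labels_true, labels_pred))
    mut.append(metrics.adjusted_mutual_info_score(labels_true, labels_pred))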
Example #2
File: project2.py Project: shacocn/EE219
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt


def plot_percent_variance(tfidf_transformer):
    svd = TruncatedSVD(n_components=1000)
    svd.fit_transform(tfidf_transformer)
    ratio = svd.explained_variance_ratio_

    # Cumulative variance retained by the first k components, k = 1..1000.
    sum_arr = []
    total = 0.0
    for k in range(1000):
        total += ratio[k]
        sum_arr.append(total)

    x_values = range(1, 1001)

    plt.plot(x_values, sum_arr)
    plt.xlabel('principal component r')
    plt.ylabel('percent variance')
    plt.show()
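The cumulative loop above can be collapsed into a single NumPy call. A minimal sketch, assuming ratio is the explained_variance_ratio_ array from the fitted TruncatedSVD:

import numpy as np

# sum_arr[k-1] == ratio[0] + ... + ratio[k-1], same values as the loop above
sum_arr = np.cumsum(ratio)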
Example #3
File: project2.py Project: shacocn/EE219
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD, PCA, NMF
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import FunctionTransformer, normalize


def visualizePerformance(tfidf_transformer, target, k, n):
    # target labels, k clusters, n components
    print("svd at its best score r =", n)
    svd = TruncatedSVD(n_components=n)  # best score at r = 2
    svd_data = svd.fit_transform(tfidf_transformer)
    km = KMeans(n_clusters=k, n_init=30, random_state=42).fit_predict(svd_data)
    pca = PCA(n_components=n).fit_transform(svd_data)
    plt.scatter(pca[:, 0], pca[:, 1], c=km)
    plt.show()

    print("nmf at its best score r =", n)
    nmf = NMF(n_components=n, init='random',
              random_state=42)  # best score at r = 3 per the contingency matrix
    nmf_data = nmf.fit_transform(tfidf_transformer)
    km = KMeans(n_clusters=k, n_init=30, random_state=42).fit_predict(nmf_data)
    plt.scatter(nmf_data[:, 0], nmf_data[:, 1], c=km)
    plt.show()

    print('part b --------')
    # SVD with row-normalized features
    print('svd normalized')

    svd_norm = normalize(svd_data)
    kmeans = KMeans(n_clusters=k, n_init=30)
    km = kmeans.fit_predict(svd_norm)
    pca = PCA(n_components=n).fit_transform(svd_norm)
    plt.scatter(pca[:, 0], pca[:, 1], c=km)
    plt.show()
    print('contingency matrix: ')
    print(confusion_matrix(target, kmeans.labels_))
    print_five_measures(target, kmeans.labels_)

    print('nmf normalized')

    nmf_norm = normalize(nmf_data)
    kmeans = KMeans(n_clusters=k, n_init=30)
    km = kmeans.fit_predict(nmf_norm)
    plt.scatter(nmf_norm[:, 0], nmf_norm[:, 1], c=km)
    plt.show()
    print('contingency matrix: ')
    print(confusion_matrix(target, kmeans.labels_))
    print_five_measures(target, kmeans.labels_)

    # 2nd bullet
    print('applying log transformation after NMF here:')

    # log1p handles the zero entries in the NMF output; log10, log2, and
    # log would map them to -inf and break KMeans.
    logTransform = FunctionTransformer(np.log1p)
    nmf_log = logTransform.transform(nmf_data)
    kmeans = KMeans(n_clusters=k, n_init=30)
    km = kmeans.fit_predict(nmf_log)
    plt.scatter(nmf_log[:, 0], nmf_log[:, 1], c=km)
    plt.show()
    print('contingency matrix: ')
    print(confusion_matrix(target, kmeans.labels_))
    print_five_measures(target, kmeans.labels_)

    # 3rd bullet
    print('log then norm')

    nmf_log = logTransform.transform(nmf_data)
    nmf_log_norm = normalize(nmf_log)
    kmeans = KMeans(n_clusters=k, n_init=30)
    km = kmeans.fit_predict(nmf_log_norm)
    plt.scatter(nmf_log_norm[:, 0], nmf_log_norm[:, 1], c=km)
    plt.show()
    print('contingency matrix: ')
    print(confusion_matrix(target, kmeans.labels_))
    print_five_measures(target, kmeans.labels_)

    print('norm then log')

    nmf_norm = normalize(nmf_data)
    nmf_norm_log = logTransform.transform(nmf_norm)
    kmeans = KMeans(n_clusters=k, n_init=30)
    km = kmeans.fit_predict(nmf_norm_log)
    plt.scatter(nmf_norm_log[:, 0], nmf_norm_log[:, 1], c=km)
    plt.show()
    print('contingency matrix: ')
    print(confusion_matrix(target, kmeans.labels_))
    print_five_measures(target, kmeans.labels_)
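The comment about log1p in the function above reflects a real constraint: NMF factors are nonnegative and typically contain zeros, so np.log, np.log2, and np.log10 map those entries to -inf, while np.log1p maps 0 to 0 and keeps every value finite. A minimal sketch illustrating this, with a made-up array standing in for nmf_data:

import numpy as np

toy = np.array([[0.0, 0.5], [1.2, 0.0]])  # stand-in for sparse NMF output
print(np.log(toy))    # zeros become -inf (with a divide-by-zero warning)
print(np.log1p(toy))  # log(1 + x): zeros stay 0, all values stay finite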
Example #4
    ax.scatter(xx, yy)
    ax.grid()
    ax.set_title('Original data')
    ax.set_xlabel('first feature')
    ax.set_ylabel('second feature')

U, S, VT = svd(X)
print("Left singular vectors:")  # each column is a vector
print(U)
print("Matrix of singular values:")  # singular values lie on the diagonal
print(np.diag(S))
print("Right singular vectors:")  # each row is a vector
print(VT)

svd = TruncatedSVD(n_components=dec_feat)
X_transf = svd.fit_transform(X)  # keeps the most informative components
print('Percentage of variance explained by each component')
print(svd.explained_variance_ratio_)
print('Percentage of variance explained by the reduced matrix')
print(svd.explained_variance_ratio_.sum())

print("Transformed matrix after SVD:")
print(X_transf)

if dec_feat == 3:
    xx = X_transf[:, 0]
    yy = X_transf[:, 1]
    zz = X_transf[:, 2]
    fig = plt.figure(3, figsize=(6, 5))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(xx, yy, zz)
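As a sanity check on the full decomposition above, the three factors should multiply back to X. A minimal sketch calling numpy's SVD directly (the snippet's own svd import is not shown, and the name svd is later rebound to the TruncatedSVD instance), assuming X is the same 2-D array:

import numpy as np

# full_matrices=False gives U: (m, k), S: (k,), VT: (k, n) with k = min(m, n),
# so the product below has the same shape as X.
U, S, VT = np.linalg.svd(X, full_matrices=False)
X_rebuilt = U @ np.diag(S) @ VT
print(np.allclose(X, X_rebuilt))  # True up to floating-point error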