import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import NMF, PCA, TruncatedSVD
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import FunctionTransformer, normalize


def plot_measures_with_r_svd(tfidf_transformer, all_target_group):
    """Cluster the SVD-reduced tf-idf matrix with k-means for several values
    of r and plot the five clustering measures against r."""
    hom, com, vmeas, rand, mut = [], [], [], [], []
    r_arr = [1, 2, 3, 5, 10, 20, 50, 100, 300]
    # Fit a single 1000-component SVD once and slice the first r columns
    # per run instead of refitting for every r.
    svd = TruncatedSVD(n_components=1000)
    svd_data = svd.fit_transform(tfidf_transformer)
    for r in r_arr:
        km = perform_kmeans(svd_data[:, :r], 2)
        print('confusion matrix for r =', r)
        print(confusion_matrix(all_target_group, km.labels_))
        append_measures(all_target_group, km.labels_, hom, com, vmeas, rand, mut)
    plt.plot(r_arr, hom, 'r', label='homogeneity')
    plt.plot(r_arr, com, 'g', label='completeness')
    plt.plot(r_arr, vmeas, 'b', label='v_measure')
    plt.plot(r_arr, rand, 'y', label='adjusted_rand')
    plt.plot(r_arr, mut, 'k', label='adjusted_mutual_info')
    plt.legend()
    plt.xlabel('principal component r with SVD')
    plt.ylabel('measures')
    plt.title('SVD')
    plt.show()
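
# The two helpers used above are defined elsewhere in this project; the
# sketches below are plausible stand-ins assembled from how the callers use
# them, not the original definitions.
def perform_kmeans(data, n_clusters):
    # Fit k-means with several restarts and return the fitted estimator,
    # since the caller reads km.labels_.
    return KMeans(n_clusters=n_clusters, n_init=30, random_state=42).fit(data)


def append_measures(target, labels, hom, com, vmeas, rand, mut):
    # Append the five clustering scores this section plots.
    from sklearn import metrics
    hom.append(metrics.homogeneity_score(target, labels))
    com.append(metrics.completeness_score(target, labels))
    vmeas.append(metrics.v_measure_score(target, labels))
    rand.append(metrics.adjusted_rand_score(target, labels))
    mut.append(metrics.adjusted_mutual_info_score(target, labels))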
def plot_percent_variance(tfidf_transformer):
    """Plot the cumulative fraction of variance explained by the first r
    SVD components, for r = 1..1000."""
    svd = TruncatedSVD(n_components=1000)
    svd.fit(tfidf_transformer)
    ratio = svd.explained_variance_ratio_
    # Running total of retained variance: sum_arr[r-1] is the fraction
    # explained by the first r components.
    sum_arr = np.cumsum(ratio)
    x_values = range(1, 1001)
    plt.plot(x_values, sum_arr)
    plt.xlabel('principal component r')
    plt.ylabel('percent variance')
    plt.show()
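
# A hedged companion helper (not in the original): picks the smallest r whose
# cumulative explained-variance ratio reaches a chosen threshold, using the
# same TruncatedSVD fit as plot_percent_variance above. The name and the
# default threshold are assumptions.
def smallest_r_for_variance(tfidf_transformer, threshold=0.9, max_components=1000):
    svd = TruncatedSVD(n_components=max_components)
    svd.fit(tfidf_transformer)
    cumulative = np.cumsum(svd.explained_variance_ratio_)
    # np.argmax on a boolean array returns the first index where the
    # condition holds; assumes the threshold is reachable within
    # max_components.
    return int(np.argmax(cumulative >= threshold)) + 1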
def visualizePerformance(tfidf_transformer, target, k, n):
    """Visualize k-means clusters (k clusters, n components) under SVD and
    NMF, then under normalization and log transformations of the NMF output."""
    print("SVD at its best score r =", n)
    svd = TruncatedSVD(n_components=n)  # best score observed at r = 2
    svd_data = svd.fit_transform(tfidf_transformer)
    km = KMeans(n_clusters=k, n_init=30, random_state=42).fit_predict(svd_data)
    pca = PCA(n_components=n).fit_transform(svd_data)
    plt.scatter(pca[:, 0], pca[:, 1], c=km)
    plt.show()

    print("NMF at its best score r =", n)
    # Best score observed at r = 3, judged from the contingency matrix.
    nmf = NMF(n_components=n, init='random', random_state=42)
    nmf_data = nmf.fit_transform(tfidf_transformer)
    km = KMeans(n_clusters=k, n_init=30, random_state=42).fit_predict(nmf_data)
    plt.scatter(nmf_data[:, 0], nmf_data[:, 1], c=km)
    plt.show()

    print('part b --------')
    # 1st bullet: unit-normalize the reduced features before clustering.
    print('SVD normalized')
    svd_norm = normalize(svd_data)
    kmeans = KMeans(n_clusters=k, n_init=30)
    km = kmeans.fit_predict(svd_norm)
    pca = PCA(n_components=n).fit_transform(svd_norm)
    plt.scatter(pca[:, 0], pca[:, 1], c=km)
    plt.show()
    print('contingency matrix:')
    print(confusion_matrix(target, kmeans.labels_))
    print_five_measures(target, kmeans.labels_)

    print('NMF normalized')
    nmf_norm = normalize(nmf_data)
    kmeans = KMeans(n_clusters=k, n_init=30)
    km = kmeans.fit_predict(nmf_norm)
    plt.scatter(nmf_norm[:, 0], nmf_norm[:, 1], c=km)
    plt.show()
    print('contingency matrix:')
    print(confusion_matrix(target, kmeans.labels_))
    print_five_measures(target, kmeans.labels_)

    # 2nd bullet: log transformation after NMF. log1p is used because the
    # NMF output contains zeros, where log10/log2/log would produce -inf.
    print('applying log transformation after NMF here:')
    logTransform = FunctionTransformer(np.log1p)
    nmf_log = logTransform.transform(nmf_data)
    kmeans = KMeans(n_clusters=k, n_init=30)
    km = kmeans.fit_predict(nmf_log)
    plt.scatter(nmf_log[:, 0], nmf_log[:, 1], c=km)
    plt.show()
    print('contingency matrix:')
    print(confusion_matrix(target, kmeans.labels_))
    print_five_measures(target, kmeans.labels_)

    # 3rd bullet: combine both transformations, in each order.
    print('log then norm')
    nmf_log_norm = normalize(logTransform.transform(nmf_data))
    kmeans = KMeans(n_clusters=k, n_init=30)
    km = kmeans.fit_predict(nmf_log_norm)
    plt.scatter(nmf_log_norm[:, 0], nmf_log_norm[:, 1], c=km)
    plt.show()
    print('contingency matrix:')
    print(confusion_matrix(target, kmeans.labels_))
    print_five_measures(target, kmeans.labels_)

    print('norm then log')
    nmf_norm_log = logTransform.transform(normalize(nmf_data))
    kmeans = KMeans(n_clusters=k, n_init=30)
    km = kmeans.fit_predict(nmf_norm_log)
    plt.scatter(nmf_norm_log[:, 0], nmf_norm_log[:, 1], c=km)
    plt.show()
    print('contingency matrix:')
    print(confusion_matrix(target, kmeans.labels_))
    print_five_measures(target, kmeans.labels_)
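
# print_five_measures is defined elsewhere in this project; the sketch below
# is a plausible stand-in built from the five scores this section tracks, not
# the original helper.
def print_five_measures(target, labels):
    from sklearn import metrics
    print('homogeneity:', metrics.homogeneity_score(target, labels))
    print('completeness:', metrics.completeness_score(target, labels))
    print('v_measure:', metrics.v_measure_score(target, labels))
    print('adjusted_rand:', metrics.adjusted_rand_score(target, labels))
    print('adjusted_mutual_info:', metrics.adjusted_mutual_info_score(target, labels))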
# --- SVD demo on a small feature matrix (X, xx, yy, dec_feat and the first
# --- axes are defined earlier in the script) ---
ax.scatter(xx, yy)
ax.grid()
ax.set_title('Original data')
ax.set_xlabel('first feature')
ax.set_ylabel('second feature')

# Full SVD; np.linalg.svd is used so the function does not collide with the
# TruncatedSVD instance bound to the name `svd` below.
U, S, VT = np.linalg.svd(X)
print("Left singular vectors:")  # each column is a vector
print(U)
print("Matrix of singular values:")  # singular values on the diagonal
print(np.diag(S))
print("Right singular vectors:")  # each row is a vector
print(VT)

svd = TruncatedSVD(n_components=dec_feat)
X_transf = svd.fit_transform(X)  # keeps the most informative components
print('Fraction of variance explained by each component')
print(svd.explained_variance_ratio_)
print('Fraction of variance explained by the reduced matrix')
print(svd.explained_variance_ratio_.sum())
print("Transformed matrix after SVD:")
print(X_transf)

if dec_feat == 3:
    xx = X_transf[:, 0]
    yy = X_transf[:, 1]
    zz = X_transf[:, 2]
    fig = plt.figure(3, figsize=(6, 5))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(xx, yy, zz)
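    # Hedged completion (the original fragment ends at the scatter call):
    # assumed axis labels and an explicit show() for the 3-D view.
    ax.set_title('Data after TruncatedSVD')
    ax.set_xlabel('component 1')
    ax.set_ylabel('component 2')
    ax.set_zlabel('component 3')
    plt.show()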