# Build the node-by-galaxy membership matrix from the shelved galaxy lists,
# then visualise and cluster it.
#
# Relies on names defined earlier in the file: `path`, `liste_galaxies`,
# `matrix`, `index`, `t`, plus the imported `TSNEVisualizer`, `TruncatedSVD`,
# `ts`, `Kmeans` and `plt`.

# The context manager guarantees the shelf is closed even if a lookup or an
# index access raises part-way through the loop.
with shelve.open(path + '/BDs/listeGalaxies') as dirGalaxies:
    for galaxie, galaxy_id in enumerate(liste_galaxies):
        # Read the entry once: every shelf access unpickles the value again.
        nodes = dirGalaxies[str(galaxy_id)]
        for node in nodes:
            matrix[index[node]][galaxie] += 1
        # Normalise the column by the galaxy's node count. An empty galaxy is
        # left as an all-zero column instead of dividing by zero.
        if len(nodes) > 0:
            matrix[:, galaxie] = matrix[:, galaxie] / len(nodes)

# One distinct integer label per element of `t`.
label = np.arange(len(t))

# t-SNE visualisation, with an SVD reduction to 15 components applied first.
tsne = TSNEVisualizer(decompose='svd', decompose_by=15)
tsne.fit(matrix, label)
print(tsne.transformer_)
# NOTE(review): recent yellowbrick releases rename `poof()` to `show()`;
# kept as-is for compatibility with the installed version — confirm.
tsne.poof()

# Same pipeline done by hand so that the 2-D embedding `y` can be clustered.
svd = TruncatedSVD(n_components=15)
svd_matrix = svd.fit_transform(matrix)
tsne = ts.TSNE()
y = tsne.fit_transform(svd_matrix)

# NOTE(review): `Kmeans` is the project's own class; the arguments are
# presumably (number of clusters, max iterations, tolerance) — confirm.
kmeans = Kmeans(5, 200, 0.1)
kmeans.fit(y)

# Print the indices of the points assigned to each cluster.
for i in range(kmeans.nb_cluster):
    print("Cluster ", i)
    print((np.where(kmeans.which_cluster == i))[0])
    print()

# Scatter plot of the embedding, coloured by cluster assignment.
plt.scatter(y[:, 0], y[:, 1], c=kmeans.which_cluster.reshape(-1, 1), s=50, cmap='viridis')
plt.title("Resultat du clustering")
plt.savefig("clustering")
plt.show()
# Notebook-style display expression: it has no effect when run as a script.
newdf_countvectorizer.shape

# Fetch the vocabulary once. `get_feature_names()` was removed in
# scikit-learn 1.2 in favour of `get_feature_names_out()`; fall back to the
# old method on releases that predate the new one. Converting to plain `str`
# keeps the printed output a list of strings in both cases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = [str(name) for name in vectorizer.get_feature_names_out()]
else:
    feature_names = [str(name) for name in vectorizer.get_feature_names()]
print(feature_names)
print(len(feature_names))

"""**Display TSNE**"""

from yellowbrick.text import TSNEVisualizer
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF encode the preprocessed text and project it with t-SNE, coloured by
# the `feedback` column.
data = newdf['newPreprocessed']
tfidf = TfidfVectorizer()
docs = tfidf.fit_transform(data)
labels = newdf['feedback']

tsne = TSNEVisualizer()
tsne.fit_transform(docs, labels)
# NOTE(review): recent yellowbrick releases rename `poof()` to `show()`;
# kept as-is for compatibility with the installed version — confirm.
tsne.poof()  # show the distribution of negative and positive reviews

# Drop the raw review text and turn the count-vectorizer matrix into a dense
# DataFrame.
newdf.drop(['reviews.text'], axis=1, inplace=True)
reviews = pd.DataFrame(newdf_countvectorizer.toarray())
# Notebook-style display expression: it has no effect when run as a script.
newdf.head(1)

"""**Set Feature X and Target Y**"""

# Reset to a 0..n-1 index first so the column-wise concat with `reviews`
# (which has a default index) aligns row by row.
newdf.reset_index(drop=True, inplace=True)
newdf = pd.concat([newdf, reviews], axis=1)

# Features: everything except the target and the intermediate text columns.
# The original list named 'preprocessedStr' twice; the duplicate is removed.
X = newdf.drop(
    [
        'reviews.rating',
        'feedback',
        'preprocessed',
        'preprocessedStr',
        'newPreprocessed',
        'keepAdj',
        'posTag',
    ],
    axis=1,
)
y = newdf['feedback']

"""*Split Test & Train Set*"""