예제 #1
0
    # Fill the node-by-galaxy matrix: the row comes from `index[node]`, the
    # column is the galaxy's position in `liste_galaxies`.
    # The context manager closes the shelf even if a lookup below raises.
    with shelve.open(path + '/BDs/listeGalaxies') as dirGalaxies:
        for galaxie, galaxy_id in enumerate(liste_galaxies):
            # Read the record once; each shelf access unpickles it again.
            nodes = dirGalaxies[str(galaxy_id)]
            for node in nodes:
                matrix[index[node]][galaxie] += 1

            # Divide the column by the number of nodes listed for this galaxy.
            # Kept as an assignment (not `/=`) so the result is cast to the
            # matrix dtype exactly as before.
            matrix[:, galaxie] = matrix[:, galaxie] / len(nodes)

    # One distinct integer label per element of `t`.
    label = np.arange(len(t))

    # Visual check: t-SNE plot with a 15-component SVD applied first
    # (TSNEVisualizer is presumably yellowbrick's -- import not visible here).
    tsne = TSNEVisualizer(decompose='svd', decompose_by=15)
    tsne.fit(matrix, label)
    print(tsne.transformer_)
    tsne.poof()

    # Same pipeline done by hand so the 2-D embedding can be clustered.
    svd = TruncatedSVD(n_components=15)
    svd_matrix = svd.fit_transform(matrix)
    tsne = ts.TSNE()
    y = tsne.fit_transform(svd_matrix)

    # NOTE(review): `Kmeans` is project-local; the arguments are presumably
    # (number of clusters, max iterations, tolerance) -- confirm.
    kmeans = Kmeans(5, 200, 0.1)
    kmeans.fit(y)

    # List the indices of the points assigned to each cluster.
    for i in range(kmeans.nb_cluster):
        print("Cluster ", i)
        print((np.where(kmeans.which_cluster == i))[0])
        print()

    # Scatter plot of the embedding coloured by cluster assignment; the figure
    # is written to disk before being shown.
    plt.scatter(y[:, 0], y[:, 1], c=kmeans.which_cluster.reshape(-1, 1), s=50, cmap='viridis')
    plt.title("Resultat du clustering")
    plt.savefig("clustering")
    plt.show()
예제 #2
0
# Bare expression left over from the notebook; it has no visible effect when
# the file is run as a script.
newdf_countvectorizer.shape

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` when present and convert it to a plain list
# so the printed output keeps the original format; fall back to the old method
# on older releases.
# NOTE(review): assumes `vectorizer` is a scikit-learn text vectorizer -- confirm.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

# Print the vocabulary, then its size (computed once instead of twice).
print(feature_names)
print(len(feature_names))

"""**Display TSNE**"""

from yellowbrick.text import TSNEVisualizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF document-term matrix from the `newPreprocessed` column and
# plot it with TSNEVisualizer, using the `feedback` column as point labels.
data = newdf['newPreprocessed']
tfidf = TfidfVectorizer()
docs = tfidf.fit_transform(data)
labels = newdf['feedback']

tsne = TSNEVisualizer()
tsne.fit_transform(docs, labels)
# NOTE(review): yellowbrick 1.0 renamed `poof()` to `show()`; confirm the
# installed version before switching.
tsne.poof() 
# Shows the distribution of negative and positive reviews
# (presumably what `feedback` encodes -- confirm).

# Remove the raw review text in place.
newdf.drop(columns=['reviews.text'], inplace=True)
# Turn the vectorizer output into a DataFrame (default RangeIndex, integer
# column labels).
# NOTE(review): `toarray()` presumably densifies a sparse matrix, which can be
# memory-hungry for a large vocabulary -- confirm the size is acceptable.
reviews = pd.DataFrame(newdf_countvectorizer.toarray())
# Bare expression left over from the notebook; no effect when run as a script.
newdf.head(1)

"""**Set Feature X and Target Y**"""

# Reset to a 0..n-1 index first: the column-wise concat aligns rows on index
# labels, and `reviews` carries a fresh RangeIndex.
newdf.reset_index(drop=True, inplace=True)
newdf = pd.concat([newdf, reviews], axis=1)

# Features: everything except the target, the rating and the intermediate
# text-processing columns. The list previously named 'preprocessedStr' twice.
# NOTE(review): if any string-named column remains in X, it will sit next to
# the integer-named count columns; recent scikit-learn releases reject mixed
# column-name types -- confirm before fitting.
X = newdf.drop(
    columns=[
        'reviews.rating',
        'feedback',
        'preprocessed',
        'preprocessedStr',
        'newPreprocessed',
        'keepAdj',
        'posTag',
    ]
)
# Target: the `feedback` column.
y = newdf['feedback']

"""*Split Test & Train Set*"""