def tsne():
    corpus = load_hobbies()
    docs = TfidfVectorizer().fit_transform(corpus.data)
    oz = TSNEVisualizer(ax=newfig())
    oz.fit(docs, corpus.target)
    savefig(oz, "corpus_tsne")
def perform_tsne(X, Y, vec=None, outpath="", clusterLabels=False, savePlot=False):
    if vec is None:
        vec = TfidfVectorizer(preprocessor=identity, tokenizer=identity)
    docs = vec.fit_transform(X)
    labels = Y

    tsne = TSNEVisualizer()
    if clusterLabels:
        # where Y = clusters.labels_
        tsne.fit(docs, ["c{}".format(c) for c in Y])
    else:
        tsne.fit(docs, labels)

    if savePlot:
        tsne.poof(outpath=outpath)
    else:
        tsne.poof()
def clusters_tsne(self, labels: pd.Series, title: str = 'title'):
    tsne = TSNEVisualizer(random_state=42)
    tsne.fit(self.vectors, labels)
    f = tsne.show().figure
    f.set_figheight(15)
    f.set_figwidth(15)
    f.suptitle(title)
    return f
def tsne_visualization(dataset_object, num_examples):
    """
    Produce and save a t-SNE visualization of feature vectors for the given dataset.

    Parameters
    ----------
    dataset_object: tv_net.dataset.Dataset
        dataset object containing feature vectors and class names
    num_examples: int
        number of examples to plot
    """
    dataset_object.shuffle_examples()  # shuffle so that we don't get all one class
    feature_vectors = np.array(
        [item.feature_vector for item in dataset_object.items[:num_examples]])
    label_list = [item.class_name for item in dataset_object.items[:num_examples]]

    title = ('T-SNE of feature vectors extracted from baseline classifier - '
             'using random sample of {} images'.format(num_examples))
    tsne = TSNEVisualizer(colormap='rainbow', title=title)
    tsne.fit(feature_vectors, label_list)

    output_path = os.path.join(dataset_object.config.OUTPUT_DIR,
                               'visualizations', 'feature_vector_tsne.png')
    tsne.show(outpath=output_path)
    tsne.show()  # have to repeat to show and save
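# A minimal, self-contained sketch of the same pattern as tsne_visualization above:
# TSNEVisualizer accepts any (n_samples, n_features) array, so dense feature vectors
# work just as well as tf-idf matrices. The random vectors, class names, and output
# path below are illustrative stand-ins, not part of the original project.
import numpy as np
from yellowbrick.text import TSNEVisualizer

rng = np.random.RandomState(0)
feature_vectors = rng.rand(200, 64)            # stand-in for extracted feature vectors
label_list = rng.choice(["cat", "dog"], 200)   # stand-in for class names

viz = TSNEVisualizer(colormap='rainbow', title='t-SNE of example feature vectors')
viz.fit(feature_vectors, label_list)
viz.show(outpath='feature_vector_tsne.png')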
def text_cluster_tsne(text_vector,
                      TextVectorizer=TfidfVectorizer,
                      text_kwargs=text_kwargs,
                      n_clusters=10,
                      labels=None):
    '''Uses a TextVectorizer to transform the text contained (at the sentence or
    paragraph level) in the text_vector arg and produce a t-SNE visualization.
    If labels are not passed, the plot is labeled with clusters produced by KMeans.

    ARGS:
        text_vector <np.array>: Vector of text units. Must be type str.
    KWARGS:
        TextVectorizer <sklearn.feature_extraction.text>: Transformer.
        text_kwargs <dict>: kwargs to pass to TextVectorizer
        n_clusters <int>: If not using labels, number of clusters in KMeans
        labels <np.array>: True categorical labels. Discrete.
    RETURNS:
        None, prints visualizations to the console.
    '''
    txt_vctzr = TextVectorizer(**text_kwargs)
    docs = txt_vctzr.fit_transform(text_vector)

    tsne = TSNEVisualizer()
    if labels is None:
        # derive clusters if labels are not provided
        clusters = KMeans(n_clusters=n_clusters)
        clusters.fit(docs)
        tsne.fit(docs, ["cluster_{}".format(c) for c in clusters.labels_])
    else:
        # otherwise use the true labels
        tsne.fit(docs, labels)
    sns.despine()
    tsne.poof()
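# A self-contained sketch of the pattern text_cluster_tsne implements: vectorize,
# cluster with KMeans, then label the t-SNE projection by cluster id. The corpus,
# category choice, and cluster count here are illustrative, not from the original code.
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from yellowbrick.text import TSNEVisualizer

example_corpus = fetch_20newsgroups(subset='train',
                                    categories=['sci.space', 'rec.autos'],
                                    remove=('headers', 'footers', 'quotes')).data
example_docs = TfidfVectorizer().fit_transform(example_corpus)
example_clusters = KMeans(n_clusters=5, random_state=0).fit(example_docs)

viz = TSNEVisualizer()
viz.fit(example_docs, ["cluster_{}".format(c) for c in example_clusters.labels_])
viz.show()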
def visualize_yellowbrick(dim_reduction, encoding, corpus_data, corpus_target,
                          labels=True, alpha=0.7, metric=None):
    # https://pypi.org/project/yellowbrick/
    # https://github.com/DistrictDataLabs/yellowbrick/tree/develop/examples
    # https://medium.com/@sangarshananveera/rapid-text-visualization-with-yellowbrick-51d3499c9333
    if 'tfidf' in encoding.lower():
        encode = TfidfVectorizer()
    if 'count' in encoding.lower():
        encode = CountVectorizer()
    docs = encode.fit_transform(corpus_data)

    if labels is True:
        labels = corpus_target
    else:
        labels = None

    if 'umap' in dim_reduction.lower():
        if metric is None:
            viz = UMAPVisualizer()
        else:
            viz = UMAPVisualizer(metric=metric)
    if 't-sne' in dim_reduction.lower():
        viz = TSNEVisualizer(alpha=alpha)

    viz.fit(docs, labels)
    return viz.poof()
def result(self):
    data_df = self.clean_data.data()
    all_data_df = self.clean_data.getSpambase_data()
    target_df = self.clean_data.target()

    # Defining the model
    model = TSNE(learning_rate=100)
    # Fitting the model
    transformed = model.fit_transform(all_data_df)

    # Plotting 2D t-SNE
    x_axis = transformed[:, 0]
    pprint.pprint(x_axis)
    y_axis = transformed[:, 1]
    pprint.pprint(y_axis)
    plt.scatter(x_axis, y_axis, c=target_df)
    # plt.show()
    plt.savefig(self.file_png)

    # Create the visualizer and draw the vectors
    tfidf = TfidfVectorizer()
    docs = tfidf.fit_transform(data_df)
    tsne = TSNEVisualizer()
    tsne.fit(docs, target_df)
    tsne.poof()
def tsne_kmeans_clusters(tfidf, num_clusters=[3, 5, 7, 9, 11]):
    '''
    Vectorizer results are normalized, which makes KMeans behave as spherical
    k-means for better results. Since LSA/SVD results are not normalized, we
    have to redo the normalization.
    '''
    print('\nUse sklearn tSNE to visualize viability of cluster estimates to '
          'inform n topic choices: {}'.format(num_clusters))
    for k in num_clusters:
        start = datetime.now()
        svd = TruncatedSVD(n_components=50, n_iter=10, random_state=0)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        reduced = lsa.fit_transform(tfidf)

        # next, apply kmeans to the reduced corpus to get labels
        clusters = KMeans(n_clusters=k, init='k-means++')
        clusters.fit(reduced)

        tsne = TSNEVisualizer(decompose=None)
        tsne.fit(reduced, ["cluster {}".format(c) for c in clusters.labels_])
        tsne.finalize()

        filename = (r'images/tsne_projections/tSNE_wKMeans_SVD_' + str(k) +
                    '_clusters_' + str(tfidf.shape[0]) + '_docs.png')
        plt.savefig(filename)
        plt.close()

        end = datetime.now()
        print(' ' + filename)
        print("  Time taken: {}".format(end - start))
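# Note on the TSNEVisualizer(decompose=None) call above: by default the visualizer
# applies a 50-component SVD before running t-SNE; decompose=None is passed here
# because the matrix has already been reduced by the TruncatedSVD/Normalizer (LSA)
# pipeline. A hedged sketch of the two setups:
#
#   TSNEVisualizer().fit(tfidf_docs, labels)               # internal SVD, then t-SNE
#   TSNEVisualizer(decompose=None).fit(lsa_docs, labels)   # data reduced beforehand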
def update_justifications_data(text, assignment_name, rating_type):
    if assignment_name is None:
        return None
    global db
    db.close()
    db.connect()

    if text == 'Decision Point':
        all_justifications, title, custom_stopwords = get_dp_justifications(assignment_name)
    elif text == 'Innovation Ratings':
        if rating_type is None:
            return None
        assignment = assignment_name.split()[0]
        pre_post = assignment_name.split()[1].split('-')[0]
        all_justifications, title, custom_stopwords = get_ratings_justifications(
            assignment, pre_post, rating_type)

    custom_stopwords.extend(['wa', 'could', 'also', 'would', 'ha', 'i', 'p', 'g', 'this', 'the'])
    all_justifications = all_justifications[
        [len(x) > 2 for x in all_justifications['justification']]]
    all_justifications['justification'].replace('', np.nan, inplace=True)
    all_justifications.dropna(inplace=True)
    all_justifications['original'] = all_justifications['justification']
    all_justifications['justification'] = all_justifications['justification'].apply(lambda x: lemmatize(x))
    all_justifications['sentiment'] = all_justifications['justification'].apply(lambda x: calcParagraphSentiment(x))

    tfidf = TfidfVectorizer(stop_words=custom_stopwords)
    docs = tfidf.fit_transform(list(all_justifications['justification']))

    tsne = TSNEVisualizer(random_state=14)
    transformer = tsne.make_transformer()
    data = transformer.fit_transform(docs)
    all_justifications['x'] = data[:, 0]
    all_justifications['y'] = data[:, 1]

    return {'data': all_justifications.to_dict(),
            'title': title,
            'custom_stopwords': custom_stopwords}
def tsne_pack(c, l):
    my_title = "t-SNE Plot of " + c + " feature"
    data = df.filter(like=c)
    tfidf = TfidfVectorizer()
    new_values = tfidf.fit_transform(corpus)
    tsne = TSNEVisualizer(title=my_title)
    tsne.fit(data, l)
    tsne.poof()
def tsne(docs, target, outpath, **kwargs):
    # Create a new figure and axes
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # Visualize the t-SNE projection of the documents
    visualizer = TSNEVisualizer(ax=ax, **kwargs)
    visualizer.fit(docs, target)
    visualizer.poof(outpath=outpath)
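# A hedged usage sketch for the tsne() helper above. Any (n_samples, n_features)
# matrix with matching labels works; the digits dataset and output path here are
# illustrative only, not part of the original project.
from sklearn.datasets import load_digits

digits = load_digits()
tsne(digits.data, digits.target, outpath="digits_tsne.png")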
def plot_tsne_clusters(corpus, fileids=None, labels=None):
    from yellowbrick.text import TSNEVisualizer
    from sklearn.feature_extraction.text import TfidfVectorizer

    words = corpus.title_tagged(fileids=fileids)
    normalizer = Corpus_Vectorizer.TextNormalizer()
    normed = (sent for title in normalizer.transform(words) for sent in title)
    # normed = (dd for dd in normalizer.transform(docs))

    tfidf = TfidfVectorizer()
    procd = tfidf.fit_transform(normed)

    tsne = TSNEVisualizer()
    if labels is None:
        tsne.fit(procd)
    else:
        tsne.fit(procd, ["c{}".format(c) for c in labels])
    tsne.poof()
def analyse_2_step_model():
    X_test = np.load("test_prepared.npy").item()  # this is our single point of truth
    # test_silhouette(30, X_test)
    test = X_test[0:1000]
    prediction = test_entire_model()[0:1000]

    vis_shilouette(test, prediction)
    plt.savefig("silhouette.png")

    tsne = TSNEVisualizer(colormap=cm.get_cmap('jet', len(set(prediction))))
    tsne.fit(test[0:1000], ["c{}".format(c) for c in prediction])
    tsne.poof(outpath="tsne.png")
def tsne(c, l):
    my_title = "t-SNE Plot of final model"
    data = c
    tfidf = TfidfVectorizer()
    new_values = tfidf.fit_transform(corpus)
    tsne = TSNEVisualizer(title=my_title)
    tsne.fit(data, l)
    tsne.poof()


# %%time
figure(figsize=(20, 10))
tsne(final, label_bias3)

# %%time
figure(figsize=(20, 10))
tsne(final, label_fact)
def tsne(ax, classes=True):
    from sklearn.feature_extraction.text import TfidfVectorizer
    from yellowbrick.text import TSNEVisualizer

    X, y = load_data("hobbies", text=True)
    if not classes:
        y = None

    freq = TfidfVectorizer(input='filename', stop_words='english')
    X = freq.fit_transform(X)

    visualizer = TSNEVisualizer(ax=ax)
    visualizer.title = "t-SNE Projection of the Hobbies Corpus"
    if not classes:
        visualizer.title = "Unlabeled " + visualizer.title

    visualizer.fit(X, y)
    return visualizer
def generate_tsne(title, X, labels):
    fig, (ax1) = plt.subplots(1, 1, figsize=(4, 2))
    title_dic = {'fontsize': 7, 'fontweight': 'bold'}

    colors = resolve_colors(11, 'Spectral_r')
    colors2 = resolve_colors(10, 'BrBG_r')
    tsne = TSNEVisualizer(ax1, colors=colors + colors2, decompose=None)
    tsne.fit(X, labels)
    tsne.finalize()

    ax1 = tsne.ax
    ax1.set_title(title, title_dic)

    path = os.path.join(OUTPUT)
    filename = title
    filename = os.path.join(path, filename)
    plt.savefig(filename)
def cluster(corpus, k):
    y = [i[0] for i in corpus]
    corpus = [i[1] for i in corpus]

    eng = list(set(stopwords.words('english')))
    trump = ['wall', 'president', 'trump', 'loss', 'yes', 'sorry', 'mr',
             'build', 'thank', 'people']
    s_w = eng + trump

    vectorizer = TfidfVectorizer(stop_words=s_w)
    vectorizer.fit(corpus)
    features = vectorizer.transform(corpus)

    tsne = TSNEVisualizer()
    tsne.fit(features, y)
    tsne.show()
def analyse_results():
    rerun = False
    if "rerun" in sys.argv:
        print("Redo everything")
        rerun = True

    X_test = np.load("test_prepared.npy").item()
    results = []
    names = []
    for filename in os.listdir("results"):
        if filename.endswith(".npy"):
            if filename[:-4] + "tsne.png" in os.listdir("results") and not rerun:
                continue
            results.append(np.load("results/" + filename))
            names.append(filename[:-4])

    for i in range(len(results)):
        print("iteration " + str(i + 1) + " of " + str(len(results)) + " : " + names[i])
        vis_shilouette(X_test, results[i])
        plt.savefig("results/" + names[i] + "silhouette.png")
        plt.close()
        plt.figure()
        tsne = TSNEVisualizer(
            colormap=cm.get_cmap('jet', len(set(results[i][0:5000]))),
            alpha=0.5,
            random_state=45)  # make it deterministic
        tsne.fit(X_test[0:5000], ["c{}".format(c) for c in results[i][0:5000]])
        tsne.poof(outpath="results/" + names[i] + "tsne.png", clear_figure=True)
def tsne_plot(self, outpath, sample_size=1000, tfidf=True):
    """
    Creates a png file at `outpath` with a t-SNE visualization.
    `sample_size` determines the size of the random sample from each label.
    Uses TfidfVectorizer by default; if `tfidf` is set to False,
    CountVectorizer is used.
    -----------------------------------------------------------------------
    More info:
    https://www.scikit-yb.org/en/latest/api/text/tsne.html
    https://lvdmaaten.github.io/tsne/
    """
    if self.tokenizer is None:
        print('No tokenizer was loaded.')
        return None

    # draw an equal-sized random sample for each label
    df = pd.DataFrame(columns=self.data.columns)
    for label in self.labels:
        samp_df = self.data \
            .query("Label == @label") \
            .sample(sample_size, random_state=19)
        df = df.append(samp_df, ignore_index=True)

    # vectorize
    if tfidf:
        vectorizer = TfidfVectorizer(tokenizer=self.tokenizer.tokenize)
    else:
        vectorizer = CountVectorizer(tokenizer=self.tokenizer.tokenize)
    X = vectorizer.fit_transform(df.Text)
    y = df.Label

    # create the visualizer and draw the vectors
    tsne = TSNEVisualizer()
    tsne.fit(X, y)
    tsne.show(outpath=outpath)
    return None
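# Hypothetical usage of tsne_plot above, assuming `prep` is an instance of the
# enclosing class with `data`, `labels`, and `tokenizer` already loaded (the attribute
# names follow the method body; the output filename is illustrative):
#
#   prep.tsne_plot("label_tsne.png", sample_size=500, tfidf=True)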
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(tokenizer=lambda x: [i.strip() for i in x.split(',')],
                       lowercase=False)
dummies = vect.fit_transform(df['ingredients'].apply(','.join))
df = pd.DataFrame(dummies.todense(), columns=vect.get_feature_names())
print("Vocab Length: ", len(vect.get_feature_names()))
print("All Data Shape: ", df.shape)
df.index = df_index
print("Number of Predictors: ", df.shape[0])
df.head()

# Create the visualizer and draw the vectors
plt.figure(figsize=[15, 9])
tsne = TSNEVisualizer()
tsne.fit(df.loc[traindex, :][:7000], y[:7000])
tsne.poof()

X = df.loc[traindex, :]
print("Number of Cuisine Types: ", y.nunique())
print("X Shape: ", X.shape)
test_df = df.loc[testdex, :]
print("Test DF Shape: ", test_df.shape)
del df
gc.collect()

LogisticRegression().get_params().keys()
model = LogisticRegression(multi_class='ovr')
score = cross_validate(model, X, y, return_train_score=False)
score["test_score"].mean()
# freq_dist_viz(vectorizer, df_train['Lyrics'], "images/tfid_stopwords_train.png")
# freq_dist_viz(vectorizer, df_test['Lyrics'], "images/tfid_stopwords_test.png")


def get_sentence_embedding(w2v_model, sentence):
    embedding = np.zeros(3000)
    for word in sentence.split():
        try:
            vector = w2v_model.wv.get_vector(word)
        except KeyError:
            vector = np.zeros(3000)
        embedding += vector
    return embedding / len(sentence.split())


w2v_model = Word2Vec.load("word2vec_models/word2vec.model")
docs = np.array([
    get_sentence_embedding(w2v_model, sentence)
    for sentence in df_train['Lyrics']
])
# tfidf = TfidfVectorizer()
# docs = tfidf.fit_transform(X)
labels = df_train['Genre']

tsne = TSNEVisualizer()
tsne.fit(docs, labels)
tsne.poof("images/w2v_tsne.png")
#!/usr/bin/env python3
import pickle

from yellowbrick.text import TSNEVisualizer

with open('data/agorb.csv', 'rb') as file:
    agora = pickle.load(file)
with open('data/tno/tfidf_vectors_webiq.pkl', 'rb') as file:
    X = pickle.load(file)
with open('data/tno/categorieen.pkl', 'rb') as file:
    c = pickle.load(file)

tsne = TSNEVisualizer()
tsne.fit(X, c)
tsne.show()
def main(X_train_smart, X_test_smart, y_train_smart, y_test_smart,
         X_train_bank, X_test_bank, y_train_bank, y_test_bank, args):
    # em = KMeans(n_clusters=4, random_state=27)
    # em.fit(X_train_smart)
    # prediction = em.predict(X_train_smart)
    # viz = RadViz()
    # viz.fit_transform(X_train_smart, prediction)
    # viz.show()
    # umap = UMAPVisualizer()
    # umap.fit(X_train_smart, ["c{}".format(c) for c in prediction])
    # umap.show()
    # tsne = TSNEVisualizer(decompose_by=4)
    # tsne.fit(X_train_smart, ["c{}".format(c) for c in prediction])
    # tsne.show()
    # exit()

    sil_score_list_smart = []
    cal_har_score_list_smart = []
    davies_bouldin_score_list_smart = []
    sil_score_list_bank = []
    cal_har_score_list_bank = []
    davies_bouldin_score_list_bank = []

    num_clusters_list = np.arange(2, 25)
    for num_clusters in num_clusters_list:
        k_means = KMeans(n_clusters=num_clusters, random_state=27)
        k_means.fit(X_train_smart)
        prediction = k_means.predict(X_train_smart)
        sil_score_list_smart.append(silhouette_score(X_train_smart, prediction))
        cal_har_score_list_smart.append(calinski_harabasz_score(X_train_smart, prediction))
        davies_bouldin_score_list_smart.append(davies_bouldin_score(X_train_smart, prediction))

    for num_clusters in num_clusters_list:
        k_means = KMeans(n_clusters=num_clusters, random_state=27)
        k_means.fit(X_train_bank)
        prediction = k_means.predict(X_train_bank)
        sil_score_list_bank.append(silhouette_score(X_train_bank, prediction))
        cal_har_score_list_bank.append(calinski_harabasz_score(X_train_bank, prediction))
        davies_bouldin_score_list_bank.append(davies_bouldin_score(X_train_bank, prediction))

    with open('experiment_best.json') as f:
        params = json.load(f)
    if args.dimensionality is None:
        num_clusters_smart = params['k_means']['smart']
        num_clusters_bank = params['k_means']['bank']
    else:
        num_clusters_smart = params[args.dimensionality[0]]['k_means']['smart']
        num_clusters_bank = params[args.dimensionality[0]]['k_means']['bank']

    # Scale these for plotting
    cal_har_score_list_smart = [x / 500 for x in cal_har_score_list_smart]
    cal_har_score_list_bank = [x / 500 for x in cal_har_score_list_bank]
    davies_bouldin_score_list_smart = [x / 5 for x in davies_bouldin_score_list_smart]
    davies_bouldin_score_list_bank = [x / 5 for x in davies_bouldin_score_list_bank]

    plt.rc("font", size=8)
    plt.rc("axes", titlesize=12)
    plt.rc("axes", labelsize=10)
    plt.rc("xtick", labelsize=8)
    plt.rc("ytick", labelsize=8)
    plt.rc("legend", fontsize=8)
    plt.rc("figure", titlesize=11)

    fig, ax = plt.subplots(1, 4, figsize=(15, 4))
    fig.suptitle('K-Means Clusters - # of clusters Analysis '
                 '(Left: Smart Grid, Right: Bank Loan)', fontsize=14)

    ax[0].plot(num_clusters_list, sil_score_list_smart, 'b-',
               label='Silhouette', linewidth=1)
    ax[0].plot(num_clusters_list, cal_har_score_list_smart, 'r--',
               label='Calinski-Harabasz / 500', linewidth=1)
    ax[0].plot(num_clusters_list, davies_bouldin_score_list_smart, 'g-.',
               label='Davies-Bouldin / 5', linewidth=1)
    ax[0].set(xlabel='K (# of clusters)', ylabel='Scores')
    ax[0].set_title('Clustering Scores')
    ax[0].legend()

    k_means = KMeans(n_clusters=num_clusters_smart, random_state=27)
    k_means.fit(X_train_smart)
    prediction_smart = k_means.predict(X_train_smart)
    tsne = TSNEVisualizer(decompose_by=X_train_smart.shape[1] - 1, ax=ax[1],
                          random_state=27)
    tsne.fit(X_train_smart, ["c{}".format(c) for c in prediction_smart])
    ax[1].set_title('tSNE Projection (clusters = {0})'.format(num_clusters_smart))
    ax[1].set_xticklabels([])
    ax[1].set_yticklabels([])

    ax[2].plot(num_clusters_list, sil_score_list_bank, 'b-',
               label='Silhouette', linewidth=1)
    ax[2].plot(num_clusters_list, cal_har_score_list_bank, 'r--',
               label='Calinski-Harabasz / 500', linewidth=1)
    ax[2].plot(num_clusters_list, davies_bouldin_score_list_bank, 'g-.',
               label='Davies-Bouldin / 5', linewidth=1)
    ax[2].set(xlabel='K (# of clusters)', ylabel='Scores')
    ax[2].set_title('Clustering Scores')
    ax[2].legend()

    k_means = KMeans(n_clusters=num_clusters_bank, random_state=27)
    k_means.fit(X_train_bank)
    prediction_bank = k_means.predict(X_train_bank)
    tsne_bank = TSNEVisualizer(decompose_by=X_train_bank.shape[1] - 1, ax=ax[3],
                               random_state=27)
    tsne_bank.fit(X_train_bank, ["c{}".format(c) for c in prediction_bank])
    ax[3].set_title('tSNE Projection (clusters = {0})'.format(num_clusters_bank))
    ax[3].set_xticklabels([])
    ax[3].set_yticklabels([])

    plt.show()

    # Boosting validation
    # Smart grid
    boosting_learner = AdaBoostClassifier(learning_rate=1, n_estimators=100)
    boost_fit_t = time()
    boosting_learner.fit(X_train_smart, y_train_smart)
    boost_fit_time = time() - boost_fit_t
    print('Boosting baseline fit time (smart): ' + str(boost_fit_time))
    boost_pred_t = time()
    boost_pred = boosting_learner.predict(X_test_smart)
    boost_pred_time = time() - boost_pred_t
    print('Boosting baseline predict time (smart): ' + str(boost_pred_time))
    boost_score = cross_val_score(boosting_learner, X_train_smart, y_train_smart, cv=10)
    print('Boosting baseline cross validation score (smart): ' + str(np.mean(boost_score)))

    boosting_learner = AdaBoostClassifier(learning_rate=1, n_estimators=100)
    boost_fit_t = time()
    boosting_learner.fit(X_train_smart, prediction_smart)
    boost_fit_time = time() - boost_fit_t
    print('Boosting DR + cluster fit time (smart): ' + str(boost_fit_time))
    boost_pred_t = time()
    boost_pred = boosting_learner.predict(X_test_smart)
    boost_pred_time = time() - boost_pred_t
    print('Boosting DR + cluster predict time (smart): ' + str(boost_pred_time))
    boost_score = cross_val_score(boosting_learner, X_train_smart, prediction_smart, cv=10)
    print('Boosting DR + cluster cross validation score (smart): ' + str(np.mean(boost_score)))

    # Bank loan
    boosting_learner = AdaBoostClassifier(learning_rate=1, n_estimators=100)
    boost_fit_t = time()
    boosting_learner.fit(X_train_bank, y_train_bank)
    boost_fit_time = time() - boost_fit_t
    print('Boosting baseline fit time (bank): ' + str(boost_fit_time))
    boost_pred_t = time()
    boost_pred = boosting_learner.predict(X_test_bank)
    boost_pred_time = time() - boost_pred_t
    print('Boosting baseline predict time (bank): ' + str(boost_pred_time))
    boost_score = cross_val_score(boosting_learner, X_train_bank, y_train_bank, cv=10)
    print('Boosting baseline cross validation score (bank): ' + str(np.mean(boost_score)))

    boosting_learner = AdaBoostClassifier(learning_rate=1, n_estimators=100)
    boost_fit_t = time()
    boosting_learner.fit(X_train_bank, prediction_bank)
    boost_fit_time = time() - boost_fit_t
    print('Boosting DR + cluster fit time (bank): ' + str(boost_fit_time))
    boost_pred_t = time()
    boost_pred = boosting_learner.predict(X_test_bank)
    boost_pred_time = time() - boost_pred_t
    print('Boosting DR + cluster predict time (bank): ' + str(boost_pred_time))
    boost_score = cross_val_score(boosting_learner, X_train_bank, prediction_bank, cv=10)
    print('Boosting DR + cluster cross validation score (bank): ' + str(np.mean(boost_score)))

    return
def load_corpus():
    c = Corpus("all_posts01.txt")
    return c


corpus = load_corpus()

# tfidf = TfidfVectorizer(stop_words='english')
from sklearn.cluster import KMeans

vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000, min_df=2, use_idf=True)
# transformer = TfidfTransformer()
# tfidf = make_pipeline(hasher, transformer)
docs = vectorizer.fit_transform(corpus.documents)
print(docs)

true_k = 500
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(docs)

print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

tsne = TSNEVisualizer(labels=["documents"])
tsne.fit(docs)
tsne.poof()
            cbar=False, fmt='g')

#################### Visualising clusters

########### Dendrogram for TF-IDF features
from scipy.cluster.hierarchy import dendrogram, linkage

np.set_printoptions(precision=6, suppress=True)
H_cluster = linkage(tfidf_matrix, 'ward')

plt.title('Dendrogram')
plt.xlabel('Data')
plt.ylabel('Distance between data points')
dendrogram(
    H_cluster,
    truncate_mode='lastp',  # show only the last p merged clusters
    p=13,                   # show only the last p merged clusters
    leaf_rotation=90.,
    leaf_font_size=12.,
    show_contracted=True,   # to get a distribution impression in truncated branches
)
plt.show()

######### Scatter plot to visualise k-means clusters
from yellowbrick.text import TSNEVisualizer

tsne = TSNEVisualizer()
tsne.fit(tfidf_matrix, ["c{}".format(c) for c in labels])
tsne.poof()
color = "#000000" colormap.append(color) for label in labels: big_colormap.append(mycolormap[label]) t6 = time.time() tsne = TSNEVisualizer(colormap='RdYlGn') tsne.fit(tfidf_matrix, labels) tsne.poof() t7 = time.time() print("time for TSNE and vis: " + str(t7-t6)) tsne.poof()
        categories=categories,
        files=files,
        data=data,
        target=target,
    )


# Load the data and create document vectors
corpus = load_corpus('hobbies')
tfidf = TfidfVectorizer()
docs = tfidf.fit_transform(corpus.data)
labels = corpus.target

# Create a visualizer to simply see the vectors plotted in 2D
tsne = TSNEVisualizer()
tsne.fit(docs)
tsne.poof()

# Create a visualizer to see how k-means clustering grouped the docs
from sklearn.cluster import KMeans

clusters = KMeans(n_clusters=5)
clusters.fit(docs)

tsne = TSNEVisualizer()
tsne.fit(docs, ["c{}".format(c) for c in clusters.labels_])
tsne.poof()
liste_galaxies = get_list_galaxie(path)
matrix = np.zeros([len(t), len(liste_galaxies)])

dirGalaxies = shelve.open(path + '/BDs/listeGalaxies')
for galaxie in range(len(liste_galaxies)):
    for node in dirGalaxies[str(liste_galaxies[galaxie])]:
        matrix[index[node]][galaxie] += 1
    matrix[:, galaxie] = matrix[:, galaxie] / len(dirGalaxies[str(liste_galaxies[galaxie])])
dirGalaxies.close()

label = np.array([i for i in range(len(t))])

tsne = TSNEVisualizer(decompose='svd', decompose_by=15)
tsne.fit(matrix, label)
print(tsne.transformer_)
tsne.poof()

svd = TruncatedSVD(n_components=15)
svd_matrix = svd.fit_transform(matrix)
tsne = ts.TSNE()
y = tsne.fit_transform(svd_matrix)

kmeans = Kmeans(5, 200, 0.1)
kmeans.fit(y)
for i in range(kmeans.nb_cluster):
    print("Cluster ", i)
    print((np.where(kmeans.which_cluster == i))[0])
    print()

plt.scatter(y[:, 0], y[:, 1], c=kmeans.which_cluster.reshape(-1, 1), s=50, cmap='viridis')
def initialization_layers_train(self, train_path, test_path, dataset, no_of_layers, filter_sizes):
    global n1, n2
    x, x_image, y_true, y_true_cls = self.load_main(train_path, dataset, 0)
    self.load_main(test_path, dataset, 1)

    # First convolutional block
    if dataset == "Fashion-MNIST":
        layer_conv1, weights_conv1 = self.new_conv_layer(
            input=x_image, num_input_channels=1,
            filter_size=filter_sizes[0], num_filters=64, name="conv1")
    elif dataset == "CIFAR-10":
        layer_conv1, weights_conv1 = self.new_conv_layer(
            input=x_image, num_input_channels=3,
            filter_size=filter_sizes[0], num_filters=64, name="conv1")

    layer_pool1 = self.new_pool_layer(layer_conv1, name="pool1")
    layer_pool1 = tf.nn.local_response_normalization(layer_pool1)
    layer_relu1 = self.new_relu_layer(layer_pool1, name="relu1")

    layer_pools = []
    layer_relus = []
    layer_convs = []
    weight_convs = []
    layer_convs.append(layer_conv1)
    layer_pools.append(layer_pool1)
    layer_relus.append(layer_relu1)
    weight_convs.append(weights_conv1)

    n2 = 5
    for k1 in range(1, no_of_layers):
        namee = "conv" + str(k1 + 1)
        layer_conv1, weights_conv1 = self.new_conv_layer(
            input=layer_relus[k1 - 1], num_input_channels=64,
            filter_size=filter_sizes[k1], num_filters=64, name=namee)
        name2 = "pool" + str(k1 + 1)
        name1 = "relu" + str(k1 + 1)
        layer_pool1 = self.new_pool_layer(layer_conv1, name=name2)
        layer_pool1 = tf.nn.local_response_normalization(layer_pool1)
        layer_relu1 = self.new_relu_layer(layer_pool1, name=name1)
        layer_convs.append(layer_conv1)
        layer_pools.append(layer_pool1)
        layer_relus.append(layer_relu1)
        weight_convs.append(weights_conv1)
    n2 = 10

    # Fully connected head
    num_features = layer_relu1.get_shape()[1:4].num_elements()
    layer_flat = tf.reshape(layer_relu1, [-1, num_features])
    layer_fc1 = self.new_fc_layer(layer_flat, num_inputs=num_features, num_outputs=512, name="fc1")
    layer_relu4 = self.new_relu_layer(layer_fc1, name="relu" + str(no_of_layers + 1))
    layer_fc3 = self.new_fc_layer(layer_relu4, num_inputs=512, num_outputs=192, name="fc3")
    layer_relu3 = self.new_relu_layer(layer_fc3, name="relu" + str(no_of_layers + 2))
    layer_fc2 = self.new_fc_layer(input=layer_relu3, num_inputs=192, num_outputs=10, name="fc2")

    with tf.variable_scope("Softmax"):
        y_pred = tf.nn.softmax(layer_fc2)
        y_pred_cls = tf.argmax(y_pred, dimension=1)

    with tf.name_scope("cross_ent"):
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=layer_fc2, labels=y_true)
        cost = tf.reduce_mean(cross_entropy)

    with tf.name_scope("optimizer"):
        optimizer = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(cost)

    with tf.variable_scope("accuracy"):
        correct_prediction = tf.equal(y_pred_cls, y_true_cls)
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    num_epochs = 20
    batch_size = 100
    trainset = self.trainset
    testset = self.testset
    trainlabel = self.trainlabel
    testlabel = self.testlabel

    f = open("CF_allembeddings.txt", "w")
    tsne = TSNEVisualizer()

    with tf.Session() as sess:
        global train_acc
        global f1_mi
        global f1_ma
        train_acc = []
        f1_mi = []
        f1_ma = []
        sess.run(tf.global_variables_initializer())

        trainset1 = self.trainset
        trainlabels11 = self.train_labels
        trainlabel1 = self.trainlabel

        for i in range(0, 4):
            f.write("\n\n Percentage of traindata ::: ")
            f.write(str((1 + i) * 10))
            f.write("\n")

            # Use the first (i + 1) * 10 percent of the data for training,
            # the remainder for evaluation
            n = len(trainset1)
            n = int(n * (i + 1) / 10)
            trainset, trainlabel = trainset1, trainlabel1
            trainlabels1 = trainlabels11
            testset, testlabel = trainset[n:, :], trainlabel[n:, :]
            testlabels1 = trainlabels11[n:, :]
            trainset, trainlabel = trainset[0:n, :], trainlabel[0:n, :]
            trainlabels1 = trainlabels1[0:n, :]
            print("----------------------------------------------")

            for epoch in range(num_epochs):
                start_time = time.time()
                train_accuracy = 0
                k = 0
                f1_macro = 0
                f1_micro = 0
                a1 = 0
                for batch in range(0, int(len(trainlabel) / batch_size)):
                    x_batch = trainset[k:k + batch_size]
                    y_true_batch = trainlabel[k:k + batch_size]
                    k = k + batch_size
                    feed_dict_train = {x: x_batch, y_true: y_true_batch}
                    sess.run(optimizer, feed_dict=feed_dict_train)
                    a, y_p, y, t = sess.run(
                        [accuracy, y_pred_cls, y_true_cls, layer_relu3],
                        feed_dict=feed_dict_train)
                    a1 += a
                print("acc :", str(epoch), "---",
                      str(a1 / int(len(trainlabel) / batch_size)))

            # Extract embeddings from the penultimate layer on the held-out split
            vali_accuracy, l = sess.run([accuracy, layer_relu3],
                                        feed_dict={x: testset, y_true: testlabel})

            kmeans = KMeans(n_clusters=10, random_state=0).fit(np.array(l))
            labels = kmeans.labels_

            f.write("\n\nEMBEDDINGS ARE : \n\n")
            f.write(str(np.array(l)))
            f.write("\n\n")

            mydict = {
                i: np.where(kmeans.labels_ == i)[0]
                for i in range(kmeans.n_clusters)
            }
            dictlist = []
            mydict1 = {}
            for key in mydict:
                mydict1[key] = list(mydict[key])

            # Greedily assign each cluster the most frequent unused true label and
            # count the matching samples as correctly clustered
            cluster_label = {}
            acc = 0.0
            lab = [i for i in range(10)]
            while len(lab) != 0:
                # iidx is the cluster number picked next
                iidx = max(mydict1, key=mydict1.get)
                cluster_label[iidx] = 0
                keys1 = mydict1[iidx]
                dict1 = {}
                print(keys1)
                for i in keys1:
                    if testlabels1[i][0] not in dict1:
                        dict1[testlabels1[i][0]] = 0
                    dict1[testlabels1[i][0]] += 1
                # idx1 is the label occurring most often in cluster iidx
                idx1 = max(dict1.items(), key=operator.itemgetter(1))[0]
                while idx1 not in lab:
                    del dict1[idx1]
                    idx1 = max(dict1.items(), key=operator.itemgetter(1))[0]
                acc += dict1[idx1]
                cluster_label[iidx] = idx1
                lab.remove(idx1)
                del mydict1[iidx]

            print("Acc: ")
            print(acc)
            print(acc / len(testset))
            f.write("\nACCURACY AFTER CLUSTERING IS: ")
            f.write(str(acc))
            f.write("\n\n")
# The text has to be cleaned first.
newdf_countvectorizer = vectorizer.fit_transform(newdf['newPreprocessed'])
newdf_countvectorizer.shape
print(vectorizer.get_feature_names())
print(len(vectorizer.get_feature_names()))

"""**Display TSNE**"""

from yellowbrick.text import TSNEVisualizer
from sklearn.feature_extraction.text import TfidfVectorizer

data = newdf['newPreprocessed']
tfidf = TfidfVectorizer()
docs = tfidf.fit_transform(data)
labels = newdf['feedback']

tsne = TSNEVisualizer()
tsne.fit(docs, labels)
tsne.poof()

# show the distribution of negative and positive reviews
newdf.drop(['reviews.text'], axis=1, inplace=True)
reviews = pd.DataFrame(newdf_countvectorizer.toarray())
newdf.head(1)

"""**Set Feature X and Target Y**"""

newdf.reset_index(drop=True, inplace=True)
newdf = pd.concat([newdf, reviews], axis=1)
X = newdf.drop(['reviews.rating', 'feedback', 'preprocessed', 'preprocessedStr',
                'newPreprocessed', 'keepAdj', 'posTag'], axis=1)
y = newdf['feedback']
    # Load the data from the files in the corpus
    for cat in categories:
        for name in os.listdir(os.path.join(path, cat)):
            files.append(os.path.join(path, cat, name))
            target.append(cat)

            with open(os.path.join(path, cat, name), 'r') as f:
                data.append(f.read())

    # Return the data bunch for use similar to the newsgroups example
    return Bunch(
        categories=categories,
        files=files,
        data=data,
        target=target,
    )


# Load the data and create document vectors
corpus = load_corpus('hobbies')
tfidf = TfidfVectorizer()
docs = tfidf.fit_transform(corpus.data)
labels = corpus.target

# Create the visualizer and draw the vectors
tsne = TSNEVisualizer()
tsne.fit(docs, labels)
tsne.poof()