def mds(data, n_components=300):
    """Embed ``data`` with multidimensional scaling.

    Args:
        data: Input handed unchanged to ``MDS.fit_transform``.
        n_components: Number of dimensions of the embedding (default 300).

    Returns:
        The value returned by ``MDS.fit_transform`` for ``data``.
    """
    return MDS(n_components=n_components).fit_transform(data)
from applied_machine_learning.fundamentals_of_machine_learning.adspy_shared_utilities import plot_labelled_scatter
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import MDS
import pandas as pd
import matplotlib.pyplot as plt

# Load the fruit table and pick the numeric feature columns and the label.
fruits = pd.read_table('../resources/fruit_data_with_colors.txt')
feature_names_fruits = ['height', 'width', 'mass', 'color_score']
target_fruit_names = ['apple', 'mandarin', 'orange', 'lemon']
X_fruits = fruits[feature_names_fruits]
y_fruits = fruits['fruit_label']

# each feature should be centered (zero mean) and with unit variance
X_fruits_normalized = StandardScaler().fit_transform(X_fruits)

# Two-dimensional MDS embedding of the standardised features.
mds = MDS(n_components=2)
X_fruits_mds = mds.fit_transform(X_fruits_normalized)

# A fresh list is passed so the helper never sees the module-level list object.
plot_labelled_scatter(X_fruits_mds, y_fruits, list(target_fruit_names))
plt.xlabel('First MDS feature')
plt.ylabel('Second MDS feature')
plt.title('Fruit sample dataset MDS')
def mds(k, X, out_path="MDS_out.csv"):
    """Embed ``X`` in ``k`` dimensions with MDS and write the result as CSV.

    Args:
        k: Number of embedding dimensions passed to ``MDS``.
        X: Input handed unchanged to ``MDS.fit_transform``.
        out_path: Destination of the comma-separated output. Defaults to
            ``"MDS_out.csv"``, the file name this function always used.

    Returns:
        None. The embedding is only written to ``out_path``.
    """
    embedding = MDS(n_components=k).fit_transform(X)
    np.savetxt(out_path, embedding, delimiter=",")
    return None
def vizualize2d(self, n_frac=0.01, b_annotations=False):
    """Project a sample of token features to two dimensions and plot them.

    A fraction ``n_frac`` of the rows returned by ``self.tokenz()`` is
    sampled, the non-feature columns are dropped, and the remaining values
    are embedded in 2-D (MDS by default; PCA when ``b_pca`` is switched on in
    the body). The points are drawn coloured by their ``gram`` value, the
    figure is shown, and a PNG is exported to a hard-coded path.

    Args:
        n_frac: Fraction of rows to sample, passed to ``DataFrame.sample``.
        b_annotations: Not used by this method; kept so existing callers
            keep working.

    Returns:
        Always ``0``.
    """
    n_components = 2
    env = Environment()
    # The grammeme table is not used below; the calls are kept in case they
    # have side effects (e.g. warming a cache) -- TODO confirm and remove.
    corpus = OpenCorpus()
    corpus.grammemes(mode=1)

    data = self.tokenz().sample(frac=n_frac)
    data = data.fillna(0)

    # Frame that accumulates everything needed for plotting.
    tdf = pd.DataFrame(index=data.index)
    tdf['idgram'] = data['idgram']
    tdf['gram'] = data['gram']
    tdf['word'] = data['word']

    drop_columns = [
        'word', 'gram', 's_suffix2', 's_suffix3', 's_prefix2', 's_prefix3',
        'n_token'
    ]
    env.debug(
        1, ['POStagger', 'visualize2D', 'Drop colums: %s' % (drop_columns)])
    data = data.drop(columns=drop_columns)

    # First remaining column is treated as the target, the rest as features.
    values = data.values
    X = values[:, 1:]
    y = values[:, 0]

    # Hand-toggled switches choosing the projection; when both are set the
    # MDS model wins because it is assigned last.
    b_pca = False
    b_sne = True
    if b_pca:
        model = PCA(n_components=n_components)
    if b_sne:
        model = MDS(n_components=n_components)
    X_new = model.fit_transform(X, y)
    if b_pca:
        print('PCA ratio', n_components, 'components',
              model.explained_variance_ratio_)
        X_new = preprocessing.MaxAbsScaler().fit_transform(X_new)

    tdf['PC1'] = X_new[:, 0]
    tdf['PC2'] = X_new[:, 1]

    # Number of sampled rows sharing each idgram, attached to every row.
    # A vectorised lookup replaces a per-row loop that stored a one-element
    # Series into a scalar cell.
    df_groups = tdf.groupby('idgram').count()
    tdf['counts'] = tdf['idgram'].map(df_groups['gram'])
    tdf = tdf.sort_values(by=['counts'], ascending=False)

    s_title = ''
    if b_pca:
        s_title = '2 component PCA. Точность %s' % (round(
            sum(float(i) for i in model.explained_variance_ratio_), 2))
    if b_sne:
        # NOTE(review): the title says t-SNE but the model above is MDS --
        # confirm which label is intended.
        s_title = 't-SNE'

    # One colour per distinct gram. NOTE(review): the Category20 palette is
    # indexed by the number of distinct grams, so this presumably needs
    # between 3 and 20 of them -- verify.
    grams = tdf['gram'].unique()
    palette = d3['Category20'][len(grams)]
    color_map = CategoricalColorMapper(factors=grams, palette=palette)
    fig = figure(title=s_title, toolbar_location=None)
    source = ColumnDataSource(tdf[['gram', 'PC1', 'PC2']])
    fig.scatter(x='PC1',
                y='PC2',
                size=12,
                color={
                    'field': 'gram',
                    'transform': color_map
                },
                legend='gram',
                source=source)
    show(fig)
    export_png(fig, filename="c:/prj/mlivos_data/temp/PCA.png")
    return 0
def mds(data, dim=3):
    """Return the MDS embedding of ``data`` in ``dim`` dimensions.

    Args:
        data: Input handed unchanged to ``MDS.fit_transform``.
        dim: Number of embedding dimensions (default 3).

    Returns:
        The value returned by ``MDS.fit_transform`` for ``data``.
    """
    return MDS(n_components=dim).fit_transform(data)
data = data.drop('Legendary', axis=1) accuracy = [] X = data.iloc[:, :] y = labels X = np.array(X) print("X shape: ", X.shape) transformer = FactorAnalysis(n_components=2, random_state=0) X_fa = transformer.fit_transform(X) fig = plt.figure(figsize=(10, 10)) plt.scatter(X_fa[:, 0], X_fa[:, 1]) plt.show() embedding = MDS(n_components=2) X_mds = embedding.fit_transform(X) fig = plt.figure(figsize=(10, 10)) plt.scatter(X_mds[:, 0], X_mds[:, 1]) plt.show() x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=5) x_train_fa, x_test_fa, y_train_fa, y_test_fa = train_test_split(X_fa, y, test_size=0.33, random_state=5) x_train_mds, x_test_mds, y_train_mds, y_test_mds = train_test_split(
def rank_graph():
    """Plot the documents in 2-D via MDS, coloured by their rank group.

    Reads the module-level names ``dist`` (a precomputed distance matrix),
    ``titles`` and ``ranks``; takes no arguments and returns nothing. The
    figure is displayed with ``plt.show()``.

    Raises:
        KeyError: If a value in ``ranks`` is not one of the keys 0-4 of the
            colour/name tables below.
    """
    # Two components because the points are drawn in a plane; "precomputed"
    # because a distance matrix is supplied; fixed `random_state` so the
    # layout is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(dist)  # shape (n_samples, n_components)
    xs, ys = pos[:, 0], pos[:, 1]
    print()
    print()

    # Colour and legend label per rank group.
    cluster_colors = {
        0: '#1b9e77',
        1: '#d95f02',
        2: '#7570b3',
        3: '#e7298a',
        4: '#66a61e'
    }
    cluster_names = {0: 'Top', 1: 'Bottom', 2: '3', 3: '4', 4: '5'}

    # Inline plotting only makes sense under IPython; outside it the name
    # `get_ipython` does not exist, so the magic is simply skipped.
    try:
        get_ipython().run_line_magic('matplotlib', 'inline')
    except NameError:
        pass

    # MDS coordinates together with each document's title and rank.
    df = pd.DataFrame(dict(x=xs, y=ys, title=titles, rank=ranks))
    groups = df.groupby('rank')

    fig, ax = plt.subplots(figsize=(17, 9))
    ax.margins(0.05)  # 5% padding around the autoscaled data

    # One layer per rank group so each gets its own colour and legend entry.
    for name, group in groups:
        ax.plot(group.x,
                group.y,
                marker='o',
                linestyle='',
                ms=12,
                label=cluster_names[name],
                color=cluster_colors[name],
                mec='none')

    ax.set_aspect('auto')
    # Hide ticks and tick labels. tick_params expects booleans; the string
    # 'off' is truthy and would leave the ticks visible.
    ax.tick_params(axis='x',
                   which='both',
                   bottom=False,
                   top=False,
                   labelbottom=False)
    ax.tick_params(axis='y',
                   which='both',
                   left=False,
                   right=False,
                   labelleft=False)
    ax.legend(numpoints=1)  # show legend with only 1 point

    # Label every point with its title; iterating rows avoids depending on
    # the frame's index labels.
    for row in df.itertuples(index=False):
        ax.text(row.x, row.y, row.title, size=8)
    plt.show()
for i in range(np.sum(nbpc[:c1]), np.sum(nbpc[:c1 + 1])): for j in range(np.sum(nbpc[:c2]), np.sum(nbpc[:c2 + 1])): if rng.rand() <= P[c1, c2]: C[i, j] = 1 return C + C.T n = 100 nc = 3 ratio = np.array([.5, .3, .2]) P = np.array(0.6 * np.eye(3) + 0.05 * np.ones((3, 3))) C1 = get_sbm(n, nc, ratio, P) # get 2d position for nodes x1 = MDS(dissimilarity='precomputed', random_state=0).fit_transform(1 - C1) def plot_graph(x, C, color='C0', s=None): for j in range(C.shape[0]): for i in range(j): if C[i, j] > 0: pl.plot([x[i, 0], x[j, 0]], [x[i, 1], x[j, 1]], alpha=0.2, color='k') pl.scatter(x[:, 0], x[:, 1], c=color, s=s, zorder=10, edgecolors='k',
# JSD matrix for the data without the shock
ddmat = jsd_matrix(odat, "dayhour")

# JSD matrix for the data with the shock
dmat = jsd_matrix(dis, "dayhour")

# Inspect the shock matrix and persist it
print(dmat)
np.save(
    '/home/server/pi/homes/woodilla/Projects/Anomalous-Detection-Browning-Simulation/data/jsd_mat.npy',
    dmat)

#------------------------------------------------------------------------
# Metric MDS on the no-shock matrix. Five dimensions are fitted, but only the
# first two coordinates are used below.
nmds = MDS(n_components=5, metric=True, dissimilarity='precomputed')
nmds_dat = nmds.fit_transform(ddmat)

ndat = pd.DataFrame({"x": nmds_dat[:, 0], "y": nmds_dat[:, 1]})

# Coordinates of the following row, so each row can be compared with its
# successor (the last row gets NaN).
ndat['x2'], ndat['y2'] = ndat['x'].shift(-1), ndat['y'].shift(-1)
ndat['distance'] = np.sqrt(
    (ndat['x2'] - ndat['x'])**2 + (ndat['y2'] - ndat['y'])**2)

# Speed is distance per unit step
ndat['speed'] = ndat['distance'] / 1

# Plot speed against row position.
# NOTE(review): this rebinds the name `plt` to the object returned by seaborn.
plt = sns.scatterplot(x=range(len(ndat)),
                      y=ndat['speed'],
                      edgecolor="black")
# Finish and save the figure that is current at this point.
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.xticks(np.arange(-4, 6, 1))
plt.savefig("static/images/ScatterPlot_Stratified.png", dpi=300)

# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the print statement is a syntax error on Python 3.
print("Plotting PCA Scatter Plot Random")
plt.figure(3, figsize=(12, 6))
ax = plt.subplot(111, facecolor='lightgray')
ax.plot(x2, y2, 'o', markersize=8, color='blue', alpha=0.5)
plt.title('PCA Scatter Plot Random')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.xticks(np.arange(-4, 6, 1))
plt.savefig("static/images/ScatterPlot_Random.png", dpi=300)

# The same Euclidean MDS estimator is refitted for each data set.
mds = MDS(n_components=2, dissimilarity="euclidean")

results = mds.fit(X)
coordsX = results.embedding_
print("Plotting MDS Scatter Plot Euclidean - Stratified")
plt.figure(4, figsize=(12, 6))
ax = plt.subplot(111, facecolor='lightgray')
plt.scatter(coordsX[:, 0], coordsX[:, 1], marker='o', color='blue')
plt.title('MDS Scatter Plot Euclidean - Stratified')
plt.savefig("static/images/MDS_Stratified_Euclidean.png", dpi=300)

results = mds.fit(Y)
coordsY = results.embedding_
print("Plotting MDS Scatter Plot Euclidean - Random")
plt.figure(5, figsize=(12, 6))
def NMDS_Plot(df):
    """Run non-metric MDS on a distance table and save a per-treatment plot.

    The treatment of a sample is the first two characters of its name. The
    embedding is written to ``<argv1>_NMDS_DataFrame.tsv`` and the scatter
    plot to ``<argv1>.NMDS.pdf``, where ``<argv1>`` is ``sys.argv[1]``.

    Args:
        df: DataFrame with a ``Samples`` column; the remaining columns are
            passed to MDS as a precomputed dissimilarity matrix. The frame is
            modified in place: ``Samples`` is removed from it.

    Returns:
        None.
    """
    outFile = sys.argv[1] + '.NMDS.pdf'
    Samples = list(df.Samples)
    Treatment = [s[:2] for s in Samples]
    df['Treatment'] = Treatment

    # Labels in order of first appearance. The counts use sort=False so they
    # come back in that same order; the default sorted groupby could pair a
    # label with another treatment's count.
    t = list(df.Treatment.unique())
    tcount = list(df.groupby('Treatment', sort=False).count().Samples)
    print(t)
    print(tcount)

    # Only the distance columns may reach MDS.
    del df['Samples']
    del df['Treatment']

    m = ['o', 'v', '^', 's', 'p', 'P', '*', 'X', 'd', 'x']
    c = [
        '#a6cee3', '#1f78b4', '#b2df8a', '#33a02c', '#fb9a99', '#e31a1c',
        '#fdbf6f', '#ff7f00', '#cab2d6', '#6a3d9a'
    ]

    seed = np.random.RandomState(seed=3)
    nmds = MDS(n_components=2,
               metric=False,
               max_iter=10000,
               eps=1e-12,
               dissimilarity="precomputed",
               random_state=seed,
               n_jobs=1,
               n_init=10000)
    npos = nmds.fit_transform(df)

    # Distances between consecutive samples in the embedding.
    # NOTE(review): the stress is normalised by these consecutive distances
    # only, not by all pairwise distances -- confirm this is intended.
    steps = np.sqrt(np.diff(npos[:, 0])**2 + np.diff(npos[:, 1])**2)
    stress = np.sqrt(nmds.stress_ / steps.sum()**2)
    stext = 'Stress = %f' % (stress)

    df2 = pd.DataFrame({'Sample': Samples, 'X': npos[:, 0], 'Y': npos[:, 1]})
    df_file = sys.argv[1] + '_NMDS_DataFrame.tsv'
    df2.to_csv(df_file, sep='\t')

    # One scatter layer per treatment. Rows are selected by label, so the
    # samples of a treatment need not be contiguous, and the style lists
    # cycle when there are more treatments than entries.
    labels = np.asarray(Treatment)
    for i, name in enumerate(t):
        rows = labels == name
        plt.scatter(npos[rows, 0],
                    npos[rows, 1],
                    c=c[i % len(c)],
                    marker=m[i % len(m)],
                    label=name)

    plt.subplots_adjust(right=0.7)  # leave room for the outside legend
    ax = plt.gca()
    plt.axis('equal')
    plt.title('NMDS plot of Mash Distance')
    plt.legend(frameon=False, bbox_to_anchor=(1.04, 1.05), loc="upper left")
    plt.text(1,
             .010,
             stext,
             fontsize=10,
             color='#737373',
             horizontalalignment='right',
             transform=ax.transAxes)
    plt.savefig(outFile)
    plt.close()
def plot_clusters(num_clusters,
                  feature_matrix,
                  cluster_data,
                  book_data,
                  plot_size=(16, 8)):
    """Draw the clustered items in 2-D using MDS on cosine distances.

    Args:
        num_clusters: Not used by this function; kept for compatibility.
        feature_matrix: Feature matrix given to ``cosine_similarity``.
        cluster_data: Mapping from cluster number to a record whose
            ``'key_features'`` entry is a sequence of strings; the first five
            become the legend label.
        book_data: Frame with ``'Cluster'`` and ``'title'`` columns, one row
            per row of ``feature_matrix``.
        plot_size: Figure size passed to ``plt.subplots``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """

    def generate_random_color():
        """Return a random colour as a ``#rrggbb`` string."""
        return '#%06x' % random.randint(0, 0xFFFFFF)

    # Marker per cluster number; clusters beyond the list get a random one.
    markers = ['o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd']

    # Cosine distance matrix, reduced to two dimensions with MDS.
    cosine_distance = 1 - cosine_similarity(feature_matrix)
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    plot_positions = mds.fit_transform(cosine_distance)
    x_pos, y_pos = plot_positions[:, 0], plot_positions[:, 1]

    # Colour and legend label for every cluster. Iterating the mapping
    # directly also works for a plain dict, which cannot be sliced.
    cluster_color_map = {}
    cluster_name_map = {}
    for cluster_num, cluster_details in cluster_data.items():
        cluster_color_map[cluster_num] = generate_random_color()
        cluster_name_map[cluster_num] = ', '.join(
            cluster_details['key_features'][:5]).strip()

    # Coordinates joined with each item's cluster label and title.
    cluster_plot_frame = pd.DataFrame({
        'x': x_pos,
        'y': y_pos,
        'label': book_data['Cluster'].values.tolist(),
        'title': book_data['title'].values.tolist()
    })
    grouped_plot_frame = cluster_plot_frame.groupby('label')

    fig, ax = plt.subplots(figsize=plot_size)
    ax.margins(0.05)

    # One layer per cluster.
    for cluster_num, cluster_frame in grouped_plot_frame:
        if cluster_num < len(markers):
            marker = markers[cluster_num]
        else:
            marker = np.random.choice(markers, size=1)[0]
        ax.plot(cluster_frame['x'],
                cluster_frame['y'],
                marker=marker,
                linestyle='',
                ms=12,
                label=cluster_name_map[cluster_num],
                color=cluster_color_map[cluster_num],
                mec='none')

    ax.set_aspect('auto')
    # tick_params expects booleans; the string 'off' is truthy and would
    # leave the ticks visible.
    ax.tick_params(axis='x',
                   which='both',
                   bottom=False,
                   top=False,
                   labelbottom=False)
    ax.tick_params(axis='y',
                   which='both',
                   left=False,
                   right=False,
                   labelleft=False)

    fontP = FontProperties()
    fontP.set_size('small')
    ax.legend(loc='upper center',
              bbox_to_anchor=(0.5, -0.01),
              fancybox=True,
              shadow=True,
              ncol=5,
              numpoints=1,
              prop=fontP)

    # Label every point with its title. Row iteration replaces the `.ix`
    # indexer, which no longer exists in current pandas.
    for row in cluster_plot_frame.itertuples(index=False):
        ax.text(row.x, row.y, row.title, size=8)

    # show the plot
    plt.show()
for member in temp: member.append(1) reslist.extend(temp) return reslist # 警告! # 该脚本用于生成操作向量的嵌入 因为操作是一个[10,11]的向量 分别表示10帧 11个按键控制(1和0分别表示按下或者没按下) # 采用sklearn的多维度缩放来将[10,15]转换成[10,1000] 相当于操作向量嵌入作为outputEmbedding # 因为MDS具有随机性 所以如果不想从头训练 不要运行这个脚本 if __name__ == "__main__": opVec = genOnehot(11) opDict = {} for i in range(len(opVec)): opDict[i] = list(opVec[i]) print(opDict) embedding = MDS(n_components=1000) opVec_embedding = embedding.fit_transform(opVec) op_embedding_Dict = {} for i in range(len(opVec_embedding)): op_embedding_Dict[i] = list(opVec_embedding[i]) print(op_embedding_Dict) op_embedding_Dict_js = json.dumps(op_embedding_Dict, indent=1) opDict_js = json.dumps(opDict, indent=1) f = open('../config/embedding.txt', 'w') f.write(op_embedding_Dict_js) f.close() f = open('../config/opDict.txt', 'w') f.write(opDict_js) f.close()
drawer.FinishDrawing() svg = drawer.GetDrawingText() display(SVG(svg.replace("svg:", ""))) # ## Different lenses # Here I am going to try to do a multidimensional scaling analysis of the distance data to utal the most important dimesions. This requires a bit of thinking on what the MDS actually outputs, and how it relates to the physical features of chemical space. # # We must use a nonlinear MDS because $ 1 - T_C $ is not necessarily positive semidefinite. # In[22]: from sklearn.manifold import MDS print(sq_distance_matrix.shape) transformed_data = MDS(n_components=2, dissimilarity="precomputed", metric=False).fit_transform(sq_distance_matrix) transformed_data # In[25]: plt.scatter(transformed_data[:, 0], transformed_data[:, 1]) # In[24]: get_ipython().run_line_magic('matplotlib', 'inline') import matplotlib.pyplot as plt plt.imshow(sq_distance_matrix, zorder=2, cmap='Blues', interpolation='nearest')
print(Dmat)

################################################
# real Dmat: pairwise L1 distance between the w-projected entries of S.
# The computation is timed, so it is deliberately left as a plain double loop.
t2 = timeit.default_timer()
Dmat_real = np.zeros((len(S), len(S)))
for i in range(k):
    for j in range(i + 1, k):
        Dmat_real[i, j] = np.linalg.norm(
            np.matmul(w, S[i]) - np.matmul(w, S[j]), ord=1)
        Dmat_real[j, i] = Dmat_real[i, j]
t3 = timeit.default_timer()

################################################
# MDS plots of the estimated and the real distance matrices
import matplotlib.pyplot as plt
from sklearn.manifold import MDS

# Estimated distances
mds = MDS(n_components=2, dissimilarity='precomputed')
pos = mds.fit_transform(Dmat)
plt.figure()
plt.scatter(pos[:, 0], pos[:, 1], color='black', s=10, lw=0)
plt.title("Estimated, time=%s" % (t1 - t0))
plt.show()

# Real distances
mds = MDS(n_components=2, dissimilarity='precomputed')
pos = mds.fit_transform(Dmat_real)
plt.figure()
plt.scatter(pos[:, 0], pos[:, 1], color='black', s=10, lw=0)
plt.title("True, time=%s" % (t3 - t2))
plt.show()
def main_function(num_clusters, retokenize, recluster, corpusdir,
                  dataset_path, n_words, minibatch, num_processes):
    """Cluster a corpus, write a text report, and save a 3-D MDS scatter.

    Args:
        num_clusters: Number of clusters; also part of output file names.
        retokenize: Flag forwarded to ``to_retokenize``/``to_recluster``;
            the string ``"1"`` additionally forces the MDS fit to be redone.
        recluster: Flag forwarded to ``to_recluster``.
        corpusdir: Corpus directory, forwarded and written into the report.
        dataset_path: Dataset location passed to the helper functions.
        n_words: Number of top terms reported per cluster.
        minibatch: Flag forwarded to ``to_recluster``.
        num_processes: Forwarded to ``to_retokenize``.

    Returns:
        Tuple ``(frame, all_cluster_words, distinct_cluster_labels)``: the
        filename/cluster frame, a dict from cluster label to its top words,
        and the sorted list of cluster labels actually present.
    """
    # nltk.data.find raises LookupError for a missing resource. The
    # stopwords live under corpora/, so that is the path to probe.
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')

    dataset_name, file_place = initialize_output_location(dataset_path)
    trailer_text = dataset_name + "_k=" + str(num_clusters)
    print("\nAll outputs generated will be in \"~\\cluster-datalake-outputs\\"
          + dataset_name + "--output\"")

    #=========1=========2=========3=========4=========5=========6=======
    # tokenize and cluster
    fnames, dataset = to_retokenize(retokenize, corpusdir, dataset_path,
                                    num_processes)

    # The matrix is stored as a pickled object inside the .npy file, which
    # NumPy refuses to load unless allow_pickle is set. Only this program's
    # own output should ever be loaded here.
    tfidf_matrix = np.load(os.path.join(
        file_place, "tfidf_matrix_" + dataset_name + ".npy"),
                           allow_pickle=True).item()

    to_recluster(num_clusters, retokenize, recluster, tfidf_matrix,
                 dataset_path, minibatch)

    # load in existing saved files
    km = joblib.load(
        os.path.join(file_place, 'doc_cluster_' + trailer_text + '.pkl'))
    vocab_frame = pd.read_pickle(
        os.path.join(file_place, "vocab_frame_" + dataset_name + ".pkl"))
    terms = np.load(
        os.path.join(file_place, "terms_" + dataset_name + ".npy")).tolist()
    dist = np.load(
        os.path.join(file_place, "distance_matrix_" + dataset_name + ".npy"))
    print("\nLoaded in existing dependencies...\n")

    clusters = km.labels_.tolist()
    # Labels that actually occur, in ascending order.
    distinct_cluster_labels = sorted(set(clusters))

    # Frame of filenames and clusters, indexed by cluster label.
    db = {'filename': fnames, 'content': dataset, 'cluster': clusters}
    frame = pd.DataFrame(db, index=[clusters], columns=['filename', 'cluster'])

    #=========1=========2=========3=========4=========5=========6=======
    # Text report. For every cluster the feature indices are ordered by
    # decreasing centroid weight; terms[ind] is the stemmed feature name and
    # vocab_frame maps a stem back to a token.
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    all_cluster_words = {}
    report_path = os.path.join(file_place,
                               "doc_clusters_" + trailer_text + ".txt")
    with open(report_path, "w") as fwriter:
        fwriter.write("Clusters from text files in: " + corpusdir)
        fwriter.write("\nTop terms per cluster: \n\n")
        print("Top terms per cluster: \n")

        for i in distinct_cluster_labels:
            fwriter.write("Cluster " + str(i) + " words: ")
            print("Cluster %d words:" % i, end='')
            cluster_words = []

            # the first "n_words" words of the cluster
            for ind in order_centroids[i, :n_words]:
                # Looked up once per term (label-based, replacing `.ix`).
                word = vocab_frame.loc[terms[ind].split(
                    ' ')].values.tolist()[0][0]
                print(' %s' % word, end=",")
                fwriter.write(word.rstrip('\n') + ", ")
                cluster_words.append(word)
            print()
            fwriter.write("\n")
            all_cluster_words.update({i: cluster_words})

            # the filenames in the cluster
            print("Cluster %d filenames:" % i, end='')
            fwriter.write("Cluster " + str(i) + " filenames: ")
            for filename in frame.loc[i]['filename'].values.tolist():
                print(' %s,' % filename, end='')
                fwriter.write(filename.rstrip('\n') + ", ")
            print("\n")
            fwriter.write("\n\n")
    print("Output written to \"doc_clusters_" + trailer_text + ".txt\"")

    #=========1=========2=========3=========4=========5=========6========
    # The MDS fit is cached on disk; it is recomputed when the cache file is
    # missing or when retokenizing was requested.
    mds_path = os.path.join(file_place, "mds_pos_" + trailer_text + ".npy")
    if not os.path.isfile(mds_path):
        retokenize = "1"
    if retokenize == "1":
        # multidimensional scaling: convert distance matrix into 3-dimensions
        mds = MDS(n_components=3, dissimilarity="precomputed", random_state=1)
        print("\nFitting the distance matrix into 3 dimensions...")
        pos_save = mds.fit_transform(dist)  # shape (n_samples, n_components)
        np.save(mds_path, pos_save)
    pos = np.load(mds_path)
    print("Loaded existing MDS fit.")
    xs, ys, zs = pos[:, 0], pos[:, 1], pos[:, 2]

    # Ask the figure for a 3-D axes; constructing Axes3D(fig) directly no
    # longer attaches the axes to the figure on current Matplotlib.
    fig = plt.figure(figsize=(17, 9))
    ax = fig.add_subplot(111, projection='3d')

    # MDS coordinates with cluster numbers and filenames, grouped by cluster.
    df = pd.DataFrame(dict(x=xs, y=ys, z=zs, label=clusters, filename=fnames))
    groups = df.groupby('label')

    # One scatter call per cluster with a single random colour. Passing the
    # colour via `color=` avoids it being read as per-point values.
    print("\n\nPlotting scatterplot of cluster points...")
    for name, group in tqdm(groups):
        color = tuple(np.random.rand(3,))
        ax.scatter(group.x, group.y, group.z, color=color, marker='o')
    ax.set_aspect('auto')

    plt.savefig(os.path.join(file_place,
                             "3D_document_cluster_" + trailer_text + ".svg"),
                dpi=300)
    print("Scatter plot written to \"3D_document_cluster_" + trailer_text +
          ".svg\"")
    return frame, all_cluster_words, distinct_cluster_labels
# Kmeans++
km = KMeans(n_clusters=29,
            init='k-means++',
            max_iter=300,
            n_init=1,
            verbose=0,
            random_state=3425).fit(tfidf_matrix)
labels = km.labels_
clusters = labels.tolist()

# Distance measure derived from cosine similarity
distance = 1 - cosine_similarity(tfidf_matrix)

# Dimensionality reduction using multidimensional scaling (MDS)
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(distance)
xs, ys = pos[:, 0], pos[:, 1]

# Draw the embedded points one call at a time, so each point takes the next
# colour of the colour cycle.
for x, y in zip(xs, ys):
    plt.scatter(x, y)

# Reduced dimensions, cluster labels and text data, for plotting the KMeans
# output
result = pd.DataFrame({'label': clusters, 'data': text, 'x': xs, 'y': ys})
# NOTE(review): `topic` is not defined in this section; check whether `result`
# was meant to be written here.
topic.to_csv(os.path.join(outfile, 'kmeans_clustered_DFN.csv'), sep=';')

# Size of each cluster
listcluster = result.groupby('label').size()
# Multidimensional Scaling
from sklearn.manifold import MDS

# Parameter settings
n_components = 2
n_init = 12
max_iter = 1200
metric = True
n_jobs = 4
random_state = 2018

# Create the estimator
mds = MDS(
    n_components=n_components,
    n_init=n_init,
    max_iter=max_iter,
    metric=metric,
    n_jobs=n_jobs,
    random_state=random_state,
)

# Run MDS on the rows labelled 0 through 1000 (a label slice includes both
# ends)
X_train_mds = mds.fit_transform(X_train.loc[0:1000, :])

# Convert to a DataFrame
X_train_mds = pd.DataFrame(data=X_train_mds, index=train_index[:1001])

# Show the plot
scatterPlot(X_train_mds, y_train, "Multidimensional Scaling")

# 3.8 LLE (Locally Linear Embedding) ------------------------------------------------------------
import numpy as np from sklearn.manifold import MDS, TSNE from sklearn.decomposition import PCA from pathlib import Path from matplotlib import pyplot as plt, rcParams root = Path('doc_matrices') paths = list(root.glob('**/*.npy')) mds = MDS(eps=0.0001, max_iter=3000, n_jobs=3, metric=False) tsne = TSNE(n_components=2, init='random', method='exact', perplexity=7, early_exaggeration=100, n_iter=3500) projectors = [tsne] y = '#F0BE41' b = '#5383EC' r = '#D85040' g = '#58A55C' w = '#D8DCD6' colours = [ g, r, r, r, y, b, g, r, y, y, r, r, b, y, w, y, y, r, r, r, b, y, w, b ] plt.close('all') fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(3.33, 3.33 * 0.8), dpi=220) rcParams['font.sans-serif'] = [
def cluster_run(
        path,
        html_path=r'C:\Users\Rohan.Gupta.USNIIT-TECH\Downloads\LeaseModel\templates\plot.html'
):
    """Cluster the PDFs in ``path`` and save an interactive 2-D MDS plot.

    The process changes its working directory to ``path``. Every ``*.pdf``
    there is parsed, its text is normalised and truncated to 30000
    characters, and the texts are written to ``pdf_Files_Details1.csv``.
    The documents are then TF-IDF vectorised, clustered into four groups with
    KMeans (the model is saved to ``doc_cluster.pkl``), embedded in 2-D with
    MDS on cosine distances, plotted, and exported as HTML.

    Args:
        path: Directory containing the PDF files.
        html_path: Destination of the HTML plot. Defaults to the location
            this function always wrote to.

    Returns:
        None.
    """
    os.chdir(path)

    # All four whitespace characters are replaced by a space in one pass.
    whitespace_to_space = str.maketrans({
        '\n': ' ',
        '\t': ' ',
        '\r': ' ',
        '\xa0': ' '
    })
    companies = []
    rows = []
    for file in glob.glob("*.pdf"):
        raw = parser.from_file(file)
        # presumably the parser yields None for a PDF without extractable
        # text; treat that as an empty document -- verify against the parser.
        text = raw['content'] or ''
        text = text.translate(whitespace_to_space).lower()
        companies.append(file)
        rows.append({'File_Name': file, 'Content': text[0:30000]})

    # Rows are collected first; DataFrame.append no longer exists in current
    # pandas and was quadratic anyway.
    df = pd.DataFrame(rows, columns=['File_Name', 'Content'])
    df.to_csv('pdf_Files_Details1.csv', encoding='utf-8', index=False)
    df = pd.read_csv("pdf_Files_Details1.csv")

    # Empty texts come back from the CSV as NaN, hence the fillna.
    text = [
        BeautifulSoup(doc, 'html.parser').getText()
        for doc in df['Content'].fillna('')
    ]

    stemmer = SnowballStemmer("english")

    def tokenize_and_stem(text):
        """Tokenize by sentence then word, keep alphabetic tokens, stem."""
        # Sentence-first tokenizing ensures punctuation becomes its own token.
        tokens = [
            word for sent in nltk.sent_tokenize(text)
            for word in nltk.word_tokenize(sent)
        ]
        # Drop tokens without letters (numbers, raw punctuation).
        return [
            stemmer.stem(token) for token in tokens
            if re.search('[a-zA-Z]', token)
        ]

    tfidf_vectorizer = TfidfVectorizer(max_df=0.8,
                                       max_features=200000,
                                       min_df=0.2,
                                       stop_words='english',
                                       use_idf=True,
                                       tokenizer=tokenize_and_stem,
                                       ngram_range=(1, 3))
    tfidf_matrix = tfidf_vectorizer.fit_transform(text)
    dist = 1 - cosine_similarity(tfidf_matrix)

    # K-Means clustering; the fitted model is saved and read back so the
    # labels used below are those of the persisted model.
    num_clusters = 4
    km = KMeans(n_clusters=num_clusters)
    km.fit(tfidf_matrix)
    joblib.dump(km, 'doc_cluster.pkl')
    km = joblib.load('doc_cluster.pkl')
    clusters = km.labels_.tolist()

    # Two components because the points are drawn in a plane; "precomputed"
    # because a distance matrix is supplied; fixed `random_state` so the
    # plot is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(dist)  # shape (n_samples, n_components)
    xs, ys = pos[:, 0], pos[:, 1]

    # Colour and legend label per cluster.
    cluster_colors = {
        0: '#1b9e77',
        1: '#d95f02',
        2: '#7570b3',
        3: '#e7298a',
        4: '#66a61e'
    }
    cluster_names = {
        0: 'Cluster 0',
        1: 'Cluster 1',
        2: 'Cluster 2',
        3: 'Cluster 3'
    }

    # MDS coordinates together with cluster numbers and file names.
    data = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=companies))
    groups = data.groupby('label')

    fig, ax = plt.subplots(figsize=(17, 9))
    ax.margins(0.05)  # 5% padding around the autoscaled data

    # One layer per cluster so each gets its own colour and legend entry.
    for name, group in groups:
        ax.plot(group.x,
                group.y,
                marker='o',
                linestyle='',
                ms=12,
                label=cluster_names[name],
                color=cluster_colors[name],
                mec='none')

    ax.set_aspect('auto')
    # tick_params expects booleans; the string 'off' is truthy and would
    # leave the ticks visible.
    ax.tick_params(axis='x',
                   which='both',
                   bottom=False,
                   top=False,
                   labelbottom=False)
    ax.tick_params(axis='y',
                   which='both',
                   left=False,
                   right=False,
                   labelleft=False)
    ax.legend(numpoints=1)  # show legend with only 1 point

    # Label every point with its file name.
    for row in data.itertuples(index=False):
        ax.text(row.x, row.y, row.title, size=8)

    mpld3.save_html(fig, os.path.abspath(html_path))
def clust(num_clusters, tfidf_matrix, dist, titles, ranks, sum_all):
    """Cluster documents with KMeans, print a summary, and plot them in 2-D.

    Besides its arguments the function reads the module-level names
    ``vocab_frame`` and ``terms`` to print the top terms of each cluster.

    Args:
        num_clusters: Number of KMeans clusters.
        tfidf_matrix: Feature matrix the clustering is fitted on.
        dist: Precomputed distance matrix used for the MDS layout.
        titles: One title per document, used as the point label.
        ranks: One rank per document.
        sum_all: One synopsis per document.

    Returns:
        None. Summaries are printed and the figure is shown.
    """
    km = KMeans(n_clusters=num_clusters)
    km.fit(tfidf_matrix)
    clusters = km.labels_.tolist()

    films = {
        'title': titles,
        'rank': ranks,
        'synopsis': sum_all,
        'cluster': clusters
    }
    frame = pd.DataFrame(films,
                         index=[clusters],
                         columns=['title', 'rank', 'cluster'])
    print(frame['cluster'].value_counts())
    print("\n")

    # Mean rank per cluster.
    grouped = frame['rank'].groupby(frame['cluster'])
    print(grouped.mean())

    print("Top terms per cluster:")
    print()
    # For every cluster, feature indices by decreasing centroid weight.
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    for i in range(num_clusters):
        print("Cluster %d words:" % i, end='')
        for ind in order_centroids[i, :6]:  # replace 6 with n words per cluster
            # Best effort: a term that cannot be mapped back to a word is
            # skipped. Only lookup failures are ignored, so real errors
            # still surface.
            try:
                word = vocab_frame.loc[terms[ind].split(
                    ' ')].values.tolist()[0][0]
                print(' %s' % word.encode('utf-8', 'ignore'), end=',')
            except (KeyError, IndexError, AttributeError):
                continue
        print()  # add whitespace
        print()  # add whitespace

    # Two components because the points are drawn in a plane; "precomputed"
    # because a distance matrix is supplied; fixed `random_state` so the
    # plot is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(dist)  # shape (n_samples, n_components)
    xs, ys = pos[:, 0], pos[:, 1]
    print()
    print()

    # Colour and legend label per cluster.
    cluster_colors = {
        0: '#1b9e77',
        1: '#d95f02',
        2: '#7570b3',
        3: '#e7298a',
        4: '#66a61e'
    }
    cluster_names = {0: '1', 1: '2', 2: '3', 3: '4', 4: '5'}

    # Inline plotting only makes sense under IPython; outside it the name
    # `get_ipython` does not exist, so the magic is simply skipped.
    try:
        get_ipython().run_line_magic('matplotlib', 'inline')
    except NameError:
        pass

    # MDS coordinates together with cluster numbers, titles and ranks.
    df = pd.DataFrame(
        dict(x=xs, y=ys, label=clusters, title=titles, rank=ranks))
    groups = df.groupby('label')

    fig, ax = plt.subplots(figsize=(17, 9))
    ax.margins(0.05)  # 5% padding around the autoscaled data

    # One layer per cluster. Colours cycle and labels fall back to the
    # 1-based cluster number, so more than five clusters still plot.
    for name, group in groups:
        ax.plot(group.x,
                group.y,
                marker='o',
                linestyle='',
                ms=12,
                label=cluster_names.get(name, str(name + 1)),
                color=cluster_colors[name % len(cluster_colors)],
                mec='none')

    ax.set_aspect('auto')
    # tick_params expects booleans; the string 'off' is truthy and would
    # leave the ticks visible.
    ax.tick_params(axis='x',
                   which='both',
                   bottom=False,
                   top=False,
                   labelbottom=False)
    ax.tick_params(axis='y',
                   which='both',
                   left=False,
                   right=False,
                   labelleft=False)
    ax.legend(numpoints=1)  # show legend with only 1 point

    # Label every point with its title.
    for row in df.itertuples(index=False):
        ax.text(row.x, row.y, row.title, size=8)
    plt.show()  # show the plot
'SpectralEmbedding', 'PCA' ] doc_top = np.load('doc_top.npy') top_word = np.load('top_word.npy') fig_doc = plt.figure(1) fig_doc.suptitle('Manifold Learning for document in NIPS(after LDA)') fig_word = plt.figure(2) fig_word.suptitle('Manifold Learning for word in NIPS(after LDA)') for i, method in enumerate(methods): print('{} starts......'.format(method)) if method in LLE_dict.keys(): manifold = LocallyLinearEmbedding(n_neighbors=6, method=LLE_dict[method], eigen_solver='dense') elif method == 'MDS': manifold = MDS(n_init=1, max_iter=100) elif method == 'PCA': manifold = PCA(n_components=2) else: exec('manifold={}(n_neighbors=6)'.format(method)) doc_2D = manifold.fit_transform(doc_top) np.save('doc_2d_{}'.format(method), doc_2D) word_2D = manifold.fit_transform(top_word.T) np.save('word_2d_{}'.format(method), word_2D) ax = fig_doc.add_subplot(241 + i) ax.scatter(doc_2D[:, 0], doc_2D[:, 1], s=1) ax.set_title(method) ax.xaxis.set_major_formatter(NullFormatter()) ax.yaxis.set_major_formatter(NullFormatter()) ax.axis('tight') ax = fig_word.add_subplot(241 + i)
def __plot_samples__(self, dfs, fold):
    """
    Plot 2-D MDS and t-SNE embeddings of the training and testing samples
    side by side and save the figure under ``../figs/``.

    The ``label`` column of each frame (1 or -1) is remapped to a color
    index so that author and train/test membership can be told apart:
    0 = author 1 train, 1 = author 1 test, 2 = author 2 train,
    3 = author 2 test.

    :type dfs: List[pandas DataFrame] # [training df, testing df]
    :type fold: int
    :rtype: None
    """
    mds = MDS(n_components=2, max_iter=3000, eps=1e-9,
              dissimilarity='euclidean', n_jobs=-1)
    tsne = TSNE(n_components=2)

    # change label to color index
    # author 1 train (0 = light blue), author 1 test (1 = dark blue)
    # author 2 train (2 = light green), author 2 test (3 = dark green)
    train_df = dfs[0].copy()
    train_df.loc[(train_df.label == 1).values, 'label'] = 0
    train_df.loc[(train_df.label == -1).values, 'label'] = 2

    test_df = dfs[1].copy()
    test_df.loc[(test_df.label == 1).values, 'label'] = 1
    test_df.loc[(test_df.label == -1).values, 'label'] = 3

    # DataFrame.append was removed in pandas 2.0; concat keeps the same row
    # order (training rows first) and preserves the column dtypes.
    df_all = pd.concat([train_df, test_df])

    legend = {
        0: 'Author 1 Training Sample',
        1: 'Author 1 Test Sample',
        2: 'Author 2 Training Sample',
        3: 'Author 2 Test Sample',
    }

    # both embeddings are fitted on the training AND testing samples together
    features = df_all.drop('label', axis=1)
    pos_lst = [
        ('Multi-Dimensional Scaling (MDS)',
         mds.fit(features).embedding_),
        ('t-Distributed Stochastic Neighbor Embedding (TSNE)',
         tsne.fit(features).embedding_),
    ]

    # plot
    colors = sns.color_palette('Paired', 4)
    fig = plt.figure(figsize=(16, 7))
    # NOTE: plt.hold() was removed in matplotlib 3.0; successive scatter
    # calls on the same axes always accumulate, so it is not needed.
    for k, (title, pos) in enumerate(pos_lst, 1):
        ## fig.add_subplot() works in ipython notebook but creates a
        ## mysterious 3rd axes in python...
        ax = plt.subplot(1, 2, k)
        ax.set_title(title)
        for i, color in enumerate(colors):
            samples = pos[(df_all.label == i).values, :]
            # `color=` (not `c=`) so that an RGB tuple is never mistaken for
            # per-point values when a group happens to hold 3 or 4 samples
            ax.scatter(samples[:, 0], samples[:, 1],
                       color=color, edgecolor='none', label=legend[i])
        ax.legend()

    plt.savefig('../figs/' +
                self.__PG_STATS_TBL__[self.__PG_STATS_TBL__.find("_") + 1:] +
                'fold' + str(fold) + '.png',
                dpi=300, transparent=True)
    plt.close(fig)
def tweet_clusters():
    """Cluster the tweet documents, export the clusters and plot them.

    Steps:
      1. TF-IDF vectorize the documents returned by ``read_tweet_docs()``.
      2. Run k-means with 10 clusters on the TF-IDF matrix.
      3. For every cluster collect its 10 highest-weighted centroid terms
         and the twitter handles assigned to it, print the result as JSON
         and write it to ``cluster_file.json``.
      4. Embed the cosine distances in 2-D with MDS and draw one point per
         handle, colored by cluster and annotated with the handle.

    Relies on names defined outside this function: ``read_tweet_docs``,
    ``tfidf_stop_words``, ``sk_cos_sim`` and ``twitterhandles``.
    """
    docs = read_tweet_docs()
    cv = TfidfVectorizer(stop_words=tfidf_stop_words,
                         ngram_range=(1, 2),
                         min_df=0.1,
                         max_df=0.7)
    tfidf_matrix = cv.fit_transform(docs)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when it exists and fall back for older releases.
    if hasattr(cv, 'get_feature_names_out'):
        terms = cv.get_feature_names_out()
    else:
        terms = cv.get_feature_names()

    dist = 1 - sk_cos_sim(tfidf_matrix)

    num_clusters = 10
    km = KMeans(n_clusters=num_clusters)
    km.fit(tfidf_matrix)
    clusters = km.labels_.tolist()
    print(clusters)

    senators = {'name': twitterhandles, 'cluster': clusters}
    frame = pd.DataFrame(senators, index=[clusters],
                         columns=['name', 'cluster'])
    print(frame.head())

    print("Top terms per cluster:")
    print()

    # for every centroid, term indices ordered by descending weight
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]

    cluster_array = []
    for i in range(num_clusters):
        cluster = {
            # top 10 terms of the centroid; str() keeps them JSON-friendly
            "words": [str(terms[ind]) for ind in order_centroids[i, :10]],
            # Select members positionally instead of via frame.loc[i], whose
            # result type depends on how many rows carry the label.
            "senators": [handle
                         for handle, label in zip(twitterhandles, clusters)
                         if label == i],
        }
        # blank separator lines, kept so the console output is unchanged
        print()
        print()
        print()
        print()
        cluster_array.append(cluster)

    print()
    print()

    cluster_json = json.dumps(cluster_array)
    print(cluster_json)
    # the context manager closes the file even if the write fails
    with open("cluster_file.json", "w") as cluster_file:
        cluster_file.write(cluster_json)

    # Two components because the points are drawn on a plane; "precomputed"
    # because a distance matrix is supplied; `random_state` is fixed so the
    # plot is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(dist)  # shape (n_samples, n_components)
    xs, ys = pos[:, 0], pos[:, 1]
    print()
    print()

    # data frame holding the MDS result plus the cluster number and handle
    df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=twitterhandles))
    groups = df.groupby('label')

    # set up plot
    fig, ax = plt.subplots(figsize=(17, 9))  # set size
    ax.margins(0.05)  # optional, just adds 5% padding to the autoscaling

    # Draw the points of every cluster. Colors and legend entries are derived
    # from the cluster index so that all `num_clusters` clusters are covered.
    # Without plotted points the axes never autoscale, because ax.text()
    # alone does not update the data limits.
    for name, group in groups:
        ax.plot(group.x, group.y,
                marker='o', linestyle='', ms=12,
                label='Cluster %d: %s' % (
                    name, ', '.join(cluster_array[name]["words"][:3])),
                color=plt.cm.rainbow(float(name) / max(num_clusters - 1, 1)),
                mec='none')

    # Booleans are required here: the string 'off' is truthy and is no
    # longer translated to False by current matplotlib releases.
    ax.set_aspect('auto')
    ax.tick_params(
        axis='x',           # changes apply to the x-axis
        which='both',       # both major and minor ticks are affected
        bottom=False,       # ticks along the bottom edge are off
        top=False,          # ticks along the top edge are off
        labelbottom=False)
    ax.tick_params(
        axis='y',           # changes apply to the y-axis
        which='both',       # both major and minor ticks are affected
        left=False,         # ticks along the left edge are off
        right=False,        # ticks along the right edge are off
        labelleft=False)

    ax.legend(numpoints=1)  # show legend with only 1 point

    # add a text label at each (x, y) position, using the twitter handle
    for x, y, title in zip(df['x'], df['y'], df['title']):
        ax.text(x, y, title, size=8)
plt.show() # calculated distance matrix from sklearn.metrics import pairwise_distances D = pairwise_distances(X) print(D.shape) plt.imshow(D, zorder=2, cmap='Blues', interpolation='nearest') plt.colorbar() plt.show() # MDS: distance matrix ---- coordinate representation from sklearn.manifold import MDS model = MDS(n_components=2, dissimilarity='precomputed', random_state=1) out = model.fit_transform(D) plt.scatter(out[:, 0], out[:, 1], **colorize) plt.axis('equal') plt.show() print('#---------------------------------#') print(' MDS as Manifold learning ') print('#---------------------------------#') print("\n") def random_projection(X, dimension=3, rseed=42): assert dimension >= X.shape[1] rng = np.random.RandomState(rseed) C = rng.randn(dimension, dimension)
], [ 0.583399634410048, 0.419747266788418, 1, 0.568688414138699, 0.522467346811204, 0.397952306807963 ], [ 0.592856152221665, 0.412688190059770, 0.568688414138699, 1, 0.502786232321456, 0.399008249004968 ], [ 0.539470828139999, 0.386710678519393, 0.522467346811204, 0.502786232321456, 1, 0.351377435042073 ], [ 0.387598235354629, 0.432336134790601, 0.397952306807963, 0.399008249004968, 0.351377435042073, 1 ]]) # dissimilarity is 1 minus similarity dissimilarities = 1 - S # compute the embedding coord = MDS(dissimilarity='precomputed').fit_transform(dissimilarities) plt.scatter(coord[:, 0], coord[:, 1]) # Label the points for i in range(coord.shape[0]): plt.annotate(str(i), (coord[i, :])) plt.show()
def dimension_reduction(data, index, method='tsne', label=None, plot=False):
    """Standardize ``data``, project it to two dimensions and return it.

    :param data: feature matrix, one row per sample.
    :param index: iterable of row positions to inspect when plotting; rows
        whose embedding falls outside the open square (-20, 20) x (-20, 20)
        are printed, annotated on the plot and written to
        ``output_Archive.csv``.
    :param method: one of ``'tsne'``, ``'isomap'``, ``'MDS'`` or
        ``'tsne_v2'`` (the latter calls the external ``tsne`` helper).
    :param label: per-sample class labels (0, 1 or 2); required when
        ``plot`` is true.
    :param plot: when true, draw the embedding and export the outliers.
    :return: the 2-D embedding, one row per sample.
    :raises ValueError: if ``method`` is not recognised, or if ``plot`` is
        true and ``label`` is None.
    """
    n_components = 2

    # Every method below is distance based, so the features are brought to
    # zero mean and unit variance first.
    scaler = StandardScaler().fit(data)
    data = scaler.transform(data)

    # In most cases t-SNE is used to show the high-dimensional data in 2-D;
    # each method computes its own embedding X.
    if method == 'tsne':
        model = TSNE(n_components=n_components, perplexity=20,
                     early_exaggeration=100.0, method='exact',
                     learning_rate=100, n_iter=1000, random_state=250,
                     verbose=2)
        X = model.fit_transform(data)  # X has two columns
    elif method == 'isomap':
        model = Isomap(n_components=n_components, n_neighbors=20)
        X = model.fit_transform(data)
    elif method == 'MDS':
        model = MDS(n_components=n_components, verbose=2, n_init=1,
                    max_iter=500)
        X = model.fit_transform(data)
    elif method == 'tsne_v2':
        X = tsne(data, 2, 44, 50.0)
    else:
        # previously an unknown method fell through and failed later with
        # an unbound-variable error on X
        raise ValueError('unknown dimension reduction method: %r' % (method,))

    data_len = len(X)  # number of embedded samples
    print(data_len)
    print(X)  # 2-D array with one (x, y) row per sample

    if plot:
        if label is None:
            raise ValueError('label is required when plot=True')
        # an array makes `label == k` an element-wise mask even when a
        # plain list is passed
        label = np.asarray(label)

        fig, ax = plt.subplots()
        ax.scatter(X[label == 0, 0], X[label == 0, 1],
                   c='darkblue', alpha=0.25, marker='^')
        ax.scatter(X[label == 1, 0], X[label == 1, 1],
                   c='darkred', alpha=0.75, marker='x')
        ax.scatter(X[label == 2, 0], X[label == 2, 1],
                   c='green', alpha=0.25, marker='o')
        # the visible window is fitted to the class-0 samples only
        ax.set_xlim([np.min(X[label == 0, 0]), np.max(X[label == 0, 0])])
        ax.set_ylim([np.min(X[label == 0, 1]), np.max(X[label == 0, 1])])

        idxList = []
        nameList = []
        for ind in index:
            inside = (-20 < X[ind, 0] < 20) and (-20 < X[ind, 1] < 20)
            if not inside:
                print(ind)
                idxList.append(ind)
                # NOTE(review): `name` is not defined in this function;
                # presumably a module-level sequence of video names.
                nameList.append(name[ind])
                ax.annotate(str(ind), xy=(X[ind, 0], X[ind, 1]))
        print(idxList)
        print(nameList)
        plt.show()

        outPut = {'Index': idxList, 'Video_Name': nameList}
        print(outPut)
        output_Archive = pd.DataFrame(outPut)
        output_Archive.to_csv('output_Archive.csv')

    return X
def __init__(self, dimensionality=2500, seed=None):
    """Create the MDS model stored on ``self.mds``.

    :param dimensionality: number of components of the embedding.
    :param seed: seed for the ``numpy.random.RandomState`` handed to MDS;
        ``None`` leaves the generator unseeded.
    """
    # the model works on a precomputed dissimilarity matrix and uses every
    # available core (n_jobs=-1)
    self.mds = MDS(
        dissimilarity="precomputed",
        random_state=np.random.RandomState(seed=seed),
        n_jobs=-1,
        n_components=dimensionality,
    )
def plotPCA(df, true_k, clusters, X, english=False):
    """Lay the clustered documents out in 2-D and export the result.

    Despite its name, the projection is computed with multidimensional
    scaling (MDS) on cosine distances, not with PCA.

    Side effects:
      * writes ``./labels.en.csv`` or ``./labels.es.csv`` with the cluster
        label and both titles of every document, sorted by label;
      * writes an interactive mpld3 page to ``name.en.html`` or
        ``name.es.html``.

    :param df: data frame whose columns ``0`` and ``2`` hold the titles.
    :param true_k: number of clusters; determines the color palette.
    :param clusters: cluster label of every document, used to index the
        palette.
    :param X: feature matrix passed to ``cosine_similarity``.
    :param english: selects the ``en`` (true) or ``es`` (false) file suffix.
    """
    import matplotlib.cm as cm

    dist = 1 - cosine_similarity(X)

    # Two components because the points are drawn on a plane; "precomputed"
    # because a distance matrix is supplied; `random_state` is fixed so the
    # plot is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(dist)  # shape (n_samples, n_components)
    xs, ys = pos[:, 0], pos[:, 1]

    # one color per cluster
    cluster_colors = cm.rainbow(np.linspace(0, 1, true_k))

    # data frame holding the MDS result plus the cluster numbers and titles
    df2 = pd.DataFrame(
        dict(x=xs, y=ys, label=clusters, title=df[0], title2=df[2]))
    groups = df2.groupby('label')

    # No display option is touched here: 'display.max_rows' has no effect
    # on to_csv, and resetting it would discard a value set by the caller.
    filename = './labels.%s.csv' % ('en' if english else 'es')
    df2.sort_values(by='label')[['label', 'title', 'title2']].to_csv(filename)

    # ---- static figure (built, then closed without being saved) ----
    fig, ax = plt.subplots(figsize=(25, 25))  # set size
    ax.margins(0.05)  # optional, just adds 5% padding to the autoscaling

    # one layer per cluster, colored through the cluster label
    for name, group in groups:
        ax.plot(group.x, group.y,
                marker='o', linestyle='', ms=12,
                color=cluster_colors[name],
                mec='none')

    # Booleans are required here: the string 'off' is truthy and is no
    # longer translated to False by current matplotlib releases.
    ax.set_aspect('auto')
    ax.tick_params(
        axis='x',           # changes apply to the x-axis
        which='both',       # both major and minor ticks are affected
        bottom=False,       # ticks along the bottom edge are off
        top=False,          # ticks along the top edge are off
        labelbottom=False)
    ax.tick_params(
        axis='y',           # changes apply to the y-axis
        which='both',       # both major and minor ticks are affected
        left=False,         # ticks along the left edge are off
        right=False,        # ticks along the right edge are off
        labelleft=False)

    # Label every point with its title. Iterating the columns positionally
    # replaces DataFrame.ix, which was removed in pandas 1.0.
    for x, y, title in zip(df2['x'], df2['y'], df2['title']):
        ax.text(x, y, title, size=4)

    plt.close(fig)

    # ---- interactive mpld3 figure ----
    class TopToolbar(mpld3.plugins.PluginBase):
        """Plugin for moving toolbar to top of figure"""

        JAVASCRIPT = """
        mpld3.register_plugin("toptoolbar", TopToolbar);
        TopToolbar.prototype = Object.create(mpld3.Plugin.prototype);
        TopToolbar.prototype.constructor = TopToolbar;
        function TopToolbar(fig, props){
            mpld3.Plugin.call(this, fig, props);
        };

        TopToolbar.prototype.draw = function(){
          // the toolbar svg doesn't exist
          // yet, so first draw it
          this.fig.toolbar.draw();

          // then change the y position to be
          // at the top of the figure
          this.fig.toolbar.toolbar.attr("x", 150);
          this.fig.toolbar.toolbar.attr("y", 400);

          // then remove the draw function,
          // so that it is not called again
          this.fig.toolbar.draw = function() {}
        }
        """

        def __init__(self):
            self.dict_ = {"type": "toptoolbar"}

    # data frame holding the MDS result plus the cluster numbers and titles
    df3 = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=df[0]))
    groups = df3.groupby('label')

    # custom css to format the font and to remove the axis labeling
    css = """
    text.mpld3-text, div.mpld3-tooltip {
      font-family:Arial, Helvetica, sans-serif;
    }

    g.mpld3-xaxis, g.mpld3-yaxis {
    display: none; }

    svg.mpld3-figure {
    margin-left: 200px;}
    """

    fig, ax = plt.subplots(figsize=(25, 25))  # set plot size
    ax.margins(0.03)  # optional, just adds 3% padding to the autoscaling

    # one layer per cluster, each with its own HTML tooltip
    for name, group in groups:
        points = ax.plot(group.x, group.y,
                         marker='o', linestyle='', ms=18, mec='none',
                         color=cluster_colors[name])
        labels = list(group.title)

        # set tooltip using points, labels and the css defined above
        tooltip = mpld3.plugins.PointHTMLTooltip(points[0], labels,
                                                 voffset=10, hoffset=10,
                                                 css=css)
        # connect tooltip to fig
        mpld3.plugins.connect(fig, tooltip, TopToolbar())

    # the axis styling does not depend on the cluster, so it is applied once
    ax.set_aspect('auto')
    ax.axes.get_xaxis().set_ticks([])
    ax.axes.get_yaxis().set_ticks([])
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)

    ax.legend(numpoints=1)  # show legend with only one dot

    mpld3.display()  # show the plot

    # export to html
    html_name = 'name.%s.html' % ('en' if english else 'es')
    mpld3.save_html(fig, html_name)
def get_embedding(dm, dim):
    """Embed the precomputed dissimilarity matrix ``dm`` with metric MDS.

    :param dm: dissimilarity matrix handed to ``MDS.fit_transform``.
    :param dim: number of components of the embedding space.
    :return: the embedding produced by ``MDS.fit_transform``.
    """
    model = MDS(dissimilarity="precomputed", metric=True, n_components=dim)
    embedding = model.fit_transform(dm)
    return embedding