def create_2dprojection(distmat):
    # uses metric MDS (not isomap, despite the original comment) to return a
    # species distance map in 3D, based on the topological distmat of all
    # species in the tree; note the function name says 2d but n_components=3
    print('map to 3d space')
    mapper = MDS(n_components=3, metric=True, n_init=4, max_iter=300,
                 verbose=0, eps=0.001, n_jobs=-1, random_state=0,
                 dissimilarity='precomputed')
    projmat = mapper.fit_transform(distmat)
    print('DONE')
    return projmat
def mds(similarity, euclid=False):
    if euclid:
        model = MDS(max_iter=1000)
        result = model.fit_transform(similarity)
    else:
        # 'precomputed' expects dissimilarities, so convert the similarity matrix
        model = MDS(max_iter=1000, dissimilarity='precomputed')
        result = model.fit_transform(1 - similarity)
    return result.T
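# Note: the `1 - similarity` conversion above assumes similarities in [0, 1]
# with ones on the diagonal. A minimal, hedged sketch (toy data, not from the
# snippet above) of turning a cosine-similarity matrix into a valid
# 'precomputed' dissimilarity:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import MDS

_X = np.random.RandomState(0).rand(20, 5)
_sim = cosine_similarity(_X)         # values in [-1, 1], ones on the diagonal
_dist = 1 - _sim                     # non-negative because sim <= 1
np.fill_diagonal(_dist, 0.0)         # guard against floating-point noise
_dist = (_dist + _dist.T) / 2        # enforce exact symmetry for 'precomputed'
_coords = MDS(n_components=2, dissimilarity='precomputed',
              random_state=0).fit_transform(_dist)
print(_coords.shape)                 # (20, 2)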
def project_in_2D(distance_mat, method='mds'):
    """
    Project SDRs onto a 2D space using manifold learning algorithms
    :param distance_mat: A square matrix with pairwise distances
    :param method: Select method from 'mds' and 'tSNE'
    :return: an array with dimension (numSDRs, 2). It contains the 2D
             projections of each SDR
    """
    seed = np.random.RandomState(seed=3)
    if method == 'mds':
        mds = MDS(n_components=2, max_iter=3000, eps=1e-9,
                  random_state=seed, dissimilarity="precomputed", n_jobs=1)
        pos = mds.fit(distance_mat).embedding_
        # refine the metric solution with non-metric MDS, initialized at `pos`
        nmds = MDS(n_components=2, metric=False, max_iter=3000, eps=1e-12,
                   dissimilarity="precomputed", random_state=seed,
                   n_jobs=1, n_init=1)
        pos = nmds.fit_transform(distance_mat, init=pos)
    elif method == 'tSNE':
        tsne = TSNE(n_components=2, init='pca', random_state=0)
        pos = tsne.fit_transform(distance_mat)
    else:
        raise NotImplementedError
    return pos
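# Minimal usage sketch for project_in_2D above, assuming a small symmetric
# pairwise-distance matrix (toy data, not from the original code):
import numpy as np
from scipy.spatial.distance import pdist, squareform

_rng = np.random.RandomState(3)
_sdrs = _rng.rand(10, 32)                 # 10 toy vectors standing in for SDRs
_dist_mat = squareform(pdist(_sdrs))      # square, symmetric distance matrix
_pos = project_in_2D(_dist_mat, method='mds')
print(_pos.shape)                         # (10, 2)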
def main():
    digits = load_digits()
    X = digits.data
    y = digits.target
    mds = MDS()
    X_mds = mds.fit_transform(X)
    plot_embedding(X_mds, y)
def main():
    args = docopt(__doc__)
    is_mds = args['--mds']
    # load datasets
    digits = load_digits()
    X = digits.data
    y = digits.target
    labels = digits.target_names
    # dimension reduction
    if is_mds:
        model = MDS(n_components=2)
    else:
        model = PCA(n_components=2)
    X_fit = model.fit_transform(X)
    for i in range(labels.shape[0]):
        plt.scatter(X_fit[y == i, 0], X_fit[y == i, 1],
                    color=COLORS[i], label=str(i))
    plt.legend(loc='upper left')
    plt.autoscale()
    plt.grid()
    plt.show()
def scale_plot(input_data, data_colors=None, cluster_colors=None,
               cluster_sizes=None, dissimilarity='euclidean', filey=None):
    """ Plot MDS of data and clusters; relies on a module-level `n_clusters` """
    if data_colors is None:
        data_colors = 'r'
    if cluster_colors is None:
        cluster_colors = 'b'
    if cluster_sizes is None:
        cluster_sizes = 2200
    # scale
    mds = MDS(dissimilarity=dissimilarity)
    mds_out = mds.fit_transform(input_data)
    with sns.axes_style('white'):
        f = plt.figure(figsize=(14, 14))
        plt.scatter(mds_out[n_clusters:, 0], mds_out[n_clusters:, 1],
                    s=75, color=data_colors)
        plt.scatter(mds_out[:n_clusters, 0], mds_out[:n_clusters, 1],
                    marker='*', s=cluster_sizes, color=cluster_colors,
                    edgecolor='black', linewidth=2)
        # plot cluster number
        offset = .011
        font_dict = {'fontsize': 17, 'color': 'white'}
        for i, (x, y) in enumerate(mds_out[:n_clusters]):
            if i < 9:
                plt.text(x - offset, y - offset, i + 1, font_dict)
            else:
                plt.text(x - offset * 2, y - offset, i + 1, font_dict)
        if filey is not None:
            plt.title(path.basename(filey)[:-4], fontsize=20)
            save_figure(f, filey)
            plt.close()
def plotFlatClusterGraph(tf_idf_matrix, clusters, headlines_utf):
    dist = 1 - cosine_similarity(tf_idf_matrix)
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(dist)
    xs, ys = pos[:, 0], pos[:, 1]
    cluster_colors = {0: '#FE642E', 1: '#B40404', 2: '#D7DF01', 3: '#01DF01',
                      4: '#00FFBF', 5: '#2E64FE', 6: '#8904B1', 7: '#FA58F4',
                      8: '#FE2E9A', 9: '#A4A4A4'}
    # create data frame that has the result of the MDS plus the cluster numbers and titles
    df = pandas.DataFrame(dict(x=xs, y=ys, label=clusters, title=headlines_utf))
    groups = df.groupby('label')
    # set up plots
    fig, ax = plt.subplots(figsize=(17, 9))  # set size
    # iterate through groups to layer the plots
    for name, group in groups:
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
                color=cluster_colors[name], mec='none')
    ax.set_aspect('auto')
    ax.tick_params(axis='x', which='both', bottom=False, top=False,
                   labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False, right=False,
                   labelleft=False)
    ax.legend(numpoints=1)  # show legend with only 1 point
    # add label in x,y position with the label as the film title
    for t_n in range(len(df)):
        ax.text(df.iloc[t_n]['x'], df.iloc[t_n]['y'],
                df.iloc[t_n]['title'], size=8)
    plt.savefig('../plots/flat_clusters.png', dpi=400)
def reorder_channels_by_xyz_coord(data, channel_names=None):
    """
    :param data: 2-d array in the format [n_samples, n_channels]
    :param channel_names: names of the EEG channels
    :return: data, channel_names permutated accordingly
    """
    # work on transposed view, i.e. [channel, samples]
    data = data.T
    # map channels to 1-d coordinates through MDS
    from sklearn.manifold import MDS
    distances = compute_electrode_distance_matrix()
    mds = MDS(n_components=1, dissimilarity='precomputed')
    projection = mds.fit_transform(distances).reshape(data.shape[0])
    order = np.argsort(projection)
    print(mds.stress_)
    print(order)
    # re-order channels
    data = data[order]
    # restore initial axes layout
    data = data.T
    # re-order channel_names
    channel_names = reorder_channel_names(channel_names, order)
    return data, channel_names
def embed_two_dimensions(data, vectorizer, size=10, n_components=5,
                         colormap='YlOrRd'):
    if hasattr(data, '__iter__'):
        iterable = data
    else:
        raise Exception('ERROR: Input must be iterable')
    import itertools
    iterable_1, iterable_2 = itertools.tee(iterable)
    # get labels
    labels = []
    for graph in iterable_2:
        label = graph.graph.get('id', None)
        if label:
            labels.append(label)
    # transform iterable into sparse vectors
    data_matrix = vectorizer.transform(iterable_1)
    # embed high dimensional sparse vectors in 2D
    from sklearn import metrics
    distance_matrix = metrics.pairwise.pairwise_distances(data_matrix)
    from sklearn.manifold import MDS
    feature_map = MDS(n_components=n_components, dissimilarity='precomputed')
    explicit_data_matrix = feature_map.fit_transform(distance_matrix)
    from sklearn.decomposition import TruncatedSVD
    pca = TruncatedSVD(n_components=2)
    low_dimension_data_matrix = pca.fit_transform(explicit_data_matrix)
    plt.figure(figsize=(size, size))
    embed_dat_matrix_two_dimensions(low_dimension_data_matrix, labels=labels,
                                    density_colormap=colormap)
    plt.show()
def visualize_clusters(tfidf_matrix, vocabulary, km):
    # calculate the cosine distance between each document;
    # this will be used for plotting on a euclidean (2-dimensional) plane.
    dist = 1 - cosine_similarity(tfidf_matrix)
    clusters = km.labels_.tolist()
    # two components as we are plotting points in a two-dimensional plane
    # 'precomputed' because we provide a distance matrix
    # we will also specify 'random_state' so the plot is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(dist)  # shape (n_samples, n_components)
    xs, ys = pos[:, 0], pos[:, 1]
    # set up colors per cluster using a dict
    cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3',
                      3: '#e7298a', 4: '#66a61e', 5: '#99cc00'}
    # set up cluster names using a dict (perhaps using the top terms of each cluster)
    cluster_names = {0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5'}
    # create data frame that has the result of the MDS plus the cluster numbers
    df = pd.DataFrame(dict(x=xs, y=ys, label=clusters))
    # group by cluster
    groups = df.groupby('label')
    # set up plot
    fig, ax = plt.subplots(figsize=(17, 9))  # set size
    ax.margins(0.05)  # optional, just adds 5% padding to the autoscaling
    # iterate through groups to layer the plot; the cluster_names and
    # cluster_colors dicts use the 'name' lookup to return the right color/label
    for name, group in groups:
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
                label=cluster_names[name], color=cluster_colors[name],
                mec='none')
    ax.set_aspect('auto')
    ax.tick_params(axis='x', which='both', bottom=False, top=False,
                   labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False, right=False,
                   labelleft=False)
    ax.legend(numpoints=1)  # show legend with only 1 point
    plt.show()  # show the plot
def generate_cluster_plot_frame(self):
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    dist = 1 - cosine_similarity(self.tfidf_matrix)
    pos = mds.fit_transform(dist)
    xs, ys = pos[:, 0], pos[:, 1]
    self.cluster_plot_frame = pd.DataFrame(dict(x=xs, y=ys,
                                                label=self.clusters,
                                                chapter=self.chapter_list,
                                                book=self.book_list))
def plot_clusters(num_clusters, feature_matrix, cluster_data, movie_data,
                  plot_size=(16, 8)):
    # generate random color for clusters
    def generate_random_color():
        color = '#%06x' % random.randint(0, 0xFFFFFF)
        return color
    # define markers for clusters
    markers = ['o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd']
    # build cosine distance matrix
    cosine_distance = 1 - cosine_similarity(feature_matrix)
    # dimensionality reduction using MDS
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    # get coordinates of clusters in new low-dimensional space
    plot_positions = mds.fit_transform(cosine_distance)
    x_pos, y_pos = plot_positions[:, 0], plot_positions[:, 1]
    # build cluster plotting data
    cluster_color_map = {}
    cluster_name_map = {}
    for cluster_num, cluster_details in cluster_data.items():
        # assign cluster features to unique label
        cluster_color_map[cluster_num] = generate_random_color()
        cluster_name_map[cluster_num] = ', '.join(
            cluster_details['key_features'][:5]).strip()
    # map each unique cluster label with its coordinates and movies
    cluster_plot_frame = pd.DataFrame({
        'x': x_pos,
        'y': y_pos,
        'label': movie_data['Cluster'].values.tolist(),
        'title': movie_data['Title'].values.tolist()
    })
    grouped_plot_frame = cluster_plot_frame.groupby('label')
    # set plot figure size and axes
    fig, ax = plt.subplots(figsize=plot_size)
    ax.margins(0.05)
    # plot each cluster using co-ordinates and movie titles
    for cluster_num, cluster_frame in grouped_plot_frame:
        marker = markers[cluster_num] if cluster_num < len(markers) \
            else np.random.choice(markers, size=1)[0]
        ax.plot(cluster_frame['x'], cluster_frame['y'], marker=marker,
                linestyle='', ms=12, label=cluster_name_map[cluster_num],
                color=cluster_color_map[cluster_num], mec='none')
    ax.set_aspect('auto')
    ax.tick_params(axis='x', which='both', bottom=False, top=False,
                   labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False, right=False,
                   labelleft=False)
    fontP = FontProperties()
    fontP.set_size('small')
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.01), fancybox=True,
              shadow=True, ncol=5, numpoints=1, prop=fontP)
    # add labels as the film titles
    for index in range(len(cluster_plot_frame)):
        ax.text(cluster_plot_frame.iloc[index]['x'],
                cluster_plot_frame.iloc[index]['y'],
                cluster_plot_frame.iloc[index]['title'], size=8)
    # show the plot
    plt.show()
def mds_embed(graph):
    sorted_node_list = sorted(list(graph.nodes()), key=len)
    dmat = nx.floyd_warshall_numpy(graph, nodelist=sorted_node_list)
    gmds = MDS(n_jobs=-2, dissimilarity="precomputed")
    embed_pts = gmds.fit_transform(dmat)
    return (embed_pts, dmat, sorted_node_list)
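# Hedged usage sketch for mds_embed above on a toy graph (assumes networkx is
# imported as nx, as in the function; toy node names sortable by length):
import networkx as nx

_g = nx.path_graph(['a', 'bb', 'ccc', 'dddd'])
_pts, _dmat, _nodes = mds_embed(_g)
print(_pts.shape)   # (4, 2): one 2-D point per node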
def compute_2d_mapping(layout):
    sphere_coords = layout.sphere_coords()
    radius = layout.sphere_radius()
    from sklearn.manifold import MDS
    distances = compute_electrode_distance_matrix(sphere_coords, radius)
    mds = MDS(n_components=2, dissimilarity='precomputed')
    projection = mds.fit_transform(distances)
    # print(projection.shape)
    return projection
def convert_matrix_to_coordinates(sym_matrix, components):
    """
    :param sym_matrix: array, [n_samples, n_samples]
    :param components: int: 2 or 3 for MDS
    :return: Output of MDS, xy or xyz coordinates as 2d numpy array
             with shape [n_samples, components]
    """
    # Create coordinates based on multi dimensional scaling
    mds = MDS(n_components=components, dissimilarity="precomputed",
              random_state=1)
    coordinates = mds.fit_transform(sym_matrix)
    return coordinates
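# For illustration, convert_matrix_to_coordinates above accepts any symmetric
# distance matrix; a small sketch with toy data:
import numpy as np
from scipy.spatial.distance import pdist, squareform

_toy = squareform(pdist(np.random.rand(8, 4)))  # [8, 8] symmetric matrix
_xy = convert_matrix_to_coordinates(_toy, components=2)
print(_xy.shape)                                # (8, 2)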
def plotMDS(X, Y):
    # computes and plots MDS (measure for how well data separates)
    D = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(X))
    tmodel = MDS(n_components=2, dissimilarity='precomputed')
    X2D = tmodel.fit_transform(D)
    plt.figure()
    plt.title('MDS')
    plt.xlabel('MDS1')  # first MDS axis on x, second on y
    plt.ylabel('MDS2')
    plt.scatter(X2D[:, 0], X2D[:, 1], c=Y)
    plt.show()
def mds(cos_simil_mtr):
    # two components as we're plotting points in a two-dimensional plane
    # "precomputed" because we provide a distance matrix
    # we will also specify `random_state` so the plot is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(cos_simil_mtr)  # shape (n_samples, n_components)
    xs, ys = pos[:, 0], pos[:, 1]
    return xs, ys
def generate_cluster_plot_frame(self):
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    dist = 1 - cosine_similarity(self.tfidf_matrix)
    pos = mds.fit_transform(dist)
    xs, ys = pos[:, 0], pos[:, 1]
    cluster_data = dict()
    cluster_data["x"] = xs
    cluster_data["y"] = ys
    cluster_data["label"] = self.clusters
    cluster_data["presentation"] = self.presentation_list
    cluster_data["innovation_list"] = self.innovation_list
    self.cluster_plot_frame = pd.DataFrame(cluster_data)
def mds_mapping(X, n_components=2, max_iter=500, n_jobs=-1, random_state=None):
    """MDS scaling applied to data matrix X

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The data matrix
    n_components : int, optional
        Dimensionality of the reduced mapping
    max_iter : int, optional
        Max number of iterations
    n_jobs : int, optional
        Number of compute jobs when fitting scale. -1 means number of
        processors on the current computer.
    random_state : int, optional
        Generator used to initialize; set a fixed integer to reproduce
        results for debugging.

    Returns
    -------
    mds_embedding : MDS object
        The embedding object.
    X_transformed : array, shape (n_samples, n_components)
        The transformed data.

    Examples
    --------
    >>> data = np.random.rand(5, 10)
    >>> MDS_reduced, transformed_data = mds_mapping(data, n_components=3)
    >>> transformed_data.shape
    (5, 3)
    """
    mds_embedding = MDS(n_components=n_components, max_iter=max_iter,
                        n_jobs=n_jobs, random_state=random_state)
    # fit_transform both fits and returns the embedding in one call
    X_transformed = mds_embedding.fit_transform(X)
    return mds_embedding, X_transformed
def transform_and_plot_data(seed, distance_matrix, dim_x, dim_y, title,
                            plot3D, ax):
    if plot3D:
        n_components = 3
    else:
        n_components = 2
    mds = MDS(n_components=n_components, max_iter=3000, eps=1e-9,
              random_state=seed, dissimilarity="precomputed", n_jobs=1)
    transformed_data = mds.fit_transform(distance_matrix)
    corner_points, pair_list = create_pairs_to_plot_from_list(
        transformed_data, dim_x, dim_y)
    if plot3D:
        my_plot3D(corner_points, pair_list, False, title, ax)
    else:
        my_plot2D(corner_points, pair_list, False, title, ax)
def mds_bib_data_with_sklearn(fname):
    bib_data = get_bib_data()
    mat, years, term_list, years_cnt = get_year_by_term_mat(bib_data, freq=5)
    # Euclidean-based MDS
    aMDS = MDS(n_components=2, dissimilarity='euclidean')
    coords = aMDS.fit_transform(mat)
    fig = plt.figure()
    fig.clf()
    for label, x, y in zip(years, coords[:, 0], coords[:, 1]):
        plt.annotate(label, xy=(x, y))
    plt.savefig(fname)
def mds(self, n_components=2, dissimilarity='precomputed', show=False):
    """
    Calculates lower-dimension coordinates using the MDS algorithm.
    This requires sklearn ver 0.14 due to the dissimilarity argument.

    :param n_components: dimensionality of the reduced space.
    :type n_components: int, optional
    :param show: Shows the calculated coordinates if true.
    :type show: boolean, optional
    """
    model = MDS(n_components=n_components, dissimilarity=dissimilarity,
                max_iter=100)
    self.pos = model.fit_transform(self.dismat)
    if show:
        return self.pos
def js_MMDS(distributions, **kwargs):
    """Dimension reduction via Jensen-Shannon Divergence & Metric
    Multidimensional Scaling

    Parameters
    ----------
    distributions : array-like, shape (`n_dists`, `k`)
        Matrix of distribution probabilities.
    **kwargs :
        Keyword arguments to be passed to `sklearn.manifold.MDS()`

    Returns
    -------
    mmds : array, shape (`n_dists`, 2)
    """
    dist_matrix = squareform(pdist(distributions, metric=_jensen_shannon))
    model = MDS(n_components=2, random_state=0, dissimilarity='precomputed',
                **kwargs)
    return model.fit_transform(dist_matrix)
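# Short usage sketch for js_MMDS above, assuming the _jensen_shannon helper
# referenced in the function is in scope and rows are probability vectors
# (toy data, not from the original project):
import numpy as np

_rng = np.random.RandomState(0)
_dists = _rng.rand(12, 30)
_dists /= _dists.sum(axis=1, keepdims=True)  # normalize rows to probabilities
_coords = js_MMDS(_dists, n_init=4)          # extra kwargs are passed to MDS()
print(_coords.shape)                         # (12, 2)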
def MDSPlots(images, compressed):
    """ generator of pyplot figures """
    from sklearn.manifold import MDS
    mds = MDS(n_components=2, dissimilarity="precomputed")
    print("calculating similarities")
    from scipy.spatial.distance import squareform, pdist
    # similarities = squareform(pdist(compressed, 'mahalanobis'))
    similarities = squareform(pdist(compressed, 'euclidean'))
    print("fitting mds")
    coords = mds.fit_transform(similarities)
    import visualize as viz
    print("create figure")
    fig = viz.imgScatter(coords, images)
    return fig
def multidimensioanl_scaling(file_name, dimension, label):
    balls = np.loadtxt(file_name)
    matrix = balls[:, 0:dimension]
    new_matrix = convert_angles_to_cos_sin(matrix)
    mds = MDS(n_components=2, metric=True, n_init=4, max_iter=300, verbose=0,
              eps=1e-6, n_jobs=1, random_state=None, dissimilarity='euclidean')
    transformed_matrix = mds.fit_transform(new_matrix)
    ball_coords = np.zeros((balls.shape[0], dimension + 3))
    for i in range(balls.shape[0]):
        ball_coords[i, 0:dimension] = balls[i, 0:dimension].tolist()
        ball_coords[i, dimension:dimension + 2] = transformed_matrix[i]
        if label == 'cluster':
            ball_coords[i, dimension + 2] = balls[i, dimension].tolist()
        elif label == 'eq':
            ball_coords[i, dimension + 2] = (-0.0019872041 * 300 * np.log(
                abs(balls[i, dimension + 1]))).tolist()
        elif label == 'committor':
            ball_coords[i, dimension + 2] = (balls[i, dimension + 2] /
                                             abs(balls[i, dimension + 1])).tolist()
        print(' '.join([str(x) for x in ball_coords[i, :]]))
def perform(rank_num, group, minCnt=30, dim=2, tsne=False):
    method = "MDS"
    X, y, names = load_data(
        tetraFile='db2_tetra_{0}_{1}.npy'.format(ranks[rank_num], group),
        taxonFile='db2_taxon_{0}_{1}.npy'.format(ranks[rank_num], group),
        rank=rank_num + 1, minCnt=minCnt)
    if X.shape[0] == 0:
        print("Error: sample size is 0")
        return
    mds = MDS(n_components=2, n_init=1, max_iter=100)
    X_2d = mds.fit_transform(X)
    title = "MDS of TNA of " + group
    figName = "MDS_{0}_{1}".format(ranks[rank_num][0], group)
    plot2d(X_2d, y, names, title, figName)
    print('...clustering')
    name = '{0}_{1}'.format(ranks[rank_num][0], group)
    clust = Clustering(name, method, X_2d, y)
    # d_ami, d_nmi, d_vmes, d_ari = clust.dbscan(5)
    eval_tuple = clust.agglomerative(linkage='ward', connect=True)
    clust.plotCluster(alg='ward')
    with open('result_score.txt', 'a') as f:
        mse_result = '-'
        general_str = "{rank}\t{group}\t{method}\t{size}\t{n_cluster}\t{mse}\t{dim}\t".format(
            rank=ranks[rank_num], group=group, method=method,
            n_cluster=np.unique(y).shape[0], dim=dim,
            size=X_2d.shape[0], mse=mse_result)
        # d_values_str = "{ami:0.3f}\t{nmi:0.3f}\t{vmes:0.3f}\t{ari:0.3f}\t".format(
        #     ami=d_ami, nmi=d_nmi, vmes=d_vmes, ari=d_ari)
        # line = general_str + d_values_str
        line = general_str
        # integer division keeps the loop bound an int under Python 3
        for i in range(len(eval_tuple) // 4):
            a_ami, a_nmi, a_vmes, a_ari = (eval_tuple[i], eval_tuple[i + 1],
                                           eval_tuple[i + 2], eval_tuple[i + 3])
            a_values_str = "{ami:0.3f}\t{nmi:0.3f}\t{vmes:0.3f}\t{ari:0.3f}\t".format(
                ami=a_ami, nmi=a_nmi, vmes=a_vmes, ari=a_ari)
            line += a_values_str
        # END - for i
        line += '\n'
        f.write(line)
def plotlyplt(self):
    # Find full-D distances between texts
    dist = 1 - cosine_similarity(self.dtm)
    # Dimensionality reduction to 3-D
    mds = MDS(n_components=3, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(dist)  # shape (n_samples, n_components)
    pos = np.array(pos)
    # Get array of names to show on plot
    self.names = [os.path.basename(fn).replace('.txt', '')
                  for fn in self.filenames]
    scatter = dict(
        mode="markers+text",
        name="y",
        type="scatter3d",
        x=pos[:, 0], y=pos[:, 1], z=pos[:, 2],
        text=self.names,
        textfont=dict(size=8),
        marker=dict(size=2, color="rgb(23, 190, 207)")
    )
    # clusters = dict(alphahull=7, name="y", opacity=0.1, type="mesh3d",
    #                 x=pos[:, 0], y=pos[:, 1], z=pos[:, 2])
    layout = dict(
        title='cosine distance between texts',
        scene=dict(
            xaxis=dict(zeroline=False),
            yaxis=dict(zeroline=False),
            zaxis=dict(zeroline=False),
        )
    )
    # fig = dict(data=[scatter, clusters], layout=layout)
    fig = dict(data=[scatter], layout=layout)
    # Use py.iplot() for IPython notebook
    url = py.plot(fig, filename='3d point clustering cosine distance')
def reorder_channels_by_similarity(data, channel_names=None, normalize=True):
    """
    :param data: 2-d array in the format [n_samples, n_channels]
    :param channel_names: names of the EEG channels
    :param normalize: if True, normalize data first before computing distances
    :return: data, channel_names permutated accordingly
    """
    # work on transposed view
    data = data.T
    # normalize first
    if normalize:
        # work on a copy
        data_copy = data.copy()
        for c in range(data_copy.shape[0]):
            data_copy[c] -= data_copy[c].mean()
            data_copy[c] /= data_copy[c].std()
    else:
        data_copy = data
    # project to 1-d
    from sklearn.manifold import MDS
    mds = MDS(n_components=1)
    projection = mds.fit_transform(data_copy).reshape(data_copy.shape[0])
    # print(projection.shape)
    order = np.argsort(projection)
    # print(order)
    # the operation is not happening "in-place": a copy of the
    # subarray in sorted order is made, and then its contents
    # are written back to the original array
    data = data[order]
    # restore initial axes layout
    data = data.T
    # re-order channel_names
    channel_names = reorder_channel_names(channel_names, order)
    return data, channel_names
def get_mds(dist_matrix, n_dims=3, metric=True, rotate=True, normalize=True,
            random_state=SEED):
    transformer = MDS(
        n_components=n_dims,
        metric=metric,
        n_init=4,
        max_iter=300,
        verbose=1,
        eps=0.001,
        n_jobs=3,
        dissimilarity="precomputed",
        random_state=random_state,
    )
    transformed = transformer.fit_transform(dist_matrix)
    if rotate:
        # Rotate the data to a hopefully consistent orientation
        clf = PCA(n_components=n_dims)
        transformed = clf.fit_transform(transformed)
    if normalize:
        transformed = normalize_var(transformed)
    return transformed
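# get_mds above relies on a module-level SEED and a normalize_var helper
# defined elsewhere; a plausible sketch of that helper, consistent with how it
# is used (an assumption, not the original implementation):
import numpy as np

def normalize_var(X):
    # hypothetical helper: center each column and scale it to unit variance
    return (X - X.mean(axis=0)) / X.std(axis=0)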
def multidim(X, vectorizerType="tf", notitles=False, metric="euclidean"):
    """Multidimensional scaling on a books x word count array

    Args
    ----
    X: ndarray
        The array of term frequencies or TF-IDF

    Returns
    -------
    out: ndarray
    """
    multi = MDS()
    # Provides the points to plot each of the books
    out = multi.fit_transform(X)
    min_x, min_y = np.min(out, axis=0)
    max_x, max_y = np.max(out, axis=0)
    plt.clf()
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.ylim((min_y - 0.5, max_y + 0.5))
    plt.xlim((min_x - 0.5, max_x + 0.5))
    plt.xlabel('x')
    plt.ylabel('y')
    plt.title('Multi-Dimensional Scaling of Antiquity Texts')
    for i, book in enumerate(abb):  # `abb` holds the book abbreviations
        ax.annotate(book, xy=out[i])
    plt.tight_layout()
    if notitles:
        name = "MDS_{}_notitles_{}.pdf".format(vectorizerType, metric)
    else:
        name = "MDS_{}_{}.pdf".format(vectorizerType, metric)
    plt.savefig(name)
    return out
n = 20
print('top %d terms per group' % n, '\n')
for i in range(k):
    print('group %d content:' % i, '\n')
    for ind in order_centroids[i, :n]:
        print(' %s' % terms1[ind] + ', ')
    print('-----')

len(terms1)

## MDS for visualisation; MDS is inefficient for medium to large data sets,
## use PCA/simple MDS instead
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.manifold import MDS

mds = MDS(n_components=2, dissimilarity='precomputed', random_state=1)
mdl2 = mds.fit_transform(dist)
xs, ys = mdl2[:, 0], mdl2[:, 1]

group_col = {0: '#1b9e77', 1: '#d95f02'}
group_nom = {0: 'group1', 1: 'group2'}

df = pd.DataFrame(dict(x=xs, y=ys, label=class_mdl1))
df.head()
groups = df.groupby('label')

# set up plot
fig, ax = plt.subplots(figsize=(9, 9))  # set size
ax.margins(0.05)  # Optional, just adds 5% padding to the autoscaling
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
            color=group_col[name], label=group_nom[name], mec='none')
    # assumed tail: the original call was truncated after ms=12; the color and
    # label arguments follow the group_col/group_nom dicts defined above
def transform(self, graph_file, first_node=None):
    """
    input: csv file of graph; format: start_node, end_node, weight
    output: graph, a list whose elements are tuples,
            like [(1, 2, 1), (3, 1, 1), (2, 3, 1)]
    also counts the number of nodes in G
    """
    logging.info('loading graph')
    self.graph = self.load_graph(graph_file)  # obtain an array of the graph
    self.node_count = self.find_node_count(self.graph)  # number of nodes in the graph
    self.edge_count = len(self.graph)
    print("nodes:", self.node_count)
    print("edges:", self.edge_count)
    self.node_range = range(1, self.node_count + 1)

    logging.info('computing distance matrix')
    self.distance_matrix = self.compute_distance_matrix(self.graph, self.node_count)
    # self.distance_matrix = self.nomalization_distance_mtrix(distance_matrix=self.distance_matrix)  # normalized distance matrix

    # ----------------------------- adjacency matrix -----------------------------
    self.adjacency_matrix = self.get_adjacency_matrix(self.graph, self.node_count)

    if first_node is None:
        # self.first_node = randint(0, self.node_count) + 1  # Choose the first pivot from V randomly.
        self.first_node = randint(1, self.node_count)
    else:
        self.first_node = first_node  # Specify the first pivot.

    logging.info('finding pivots')
    # dimensions = m: choose m pivots according to k-center.
    if self.pivot_select == "randomly":
        self.pivot_nodes = self.choose_pivots_randomly(
            dimension=self.dimension, number_nodes=self.node_count)
    else:
        self.pivot_nodes = self.choose_pivot_points(
            self.graph, self.dimension)  # self.pivot_nodes: a list

    logging.info('drawing graph in high dimensional space')
    # Note that the number of pivot nodes equals the dimension.
    # Format of points: G = (V, E), |V| = n, dimensions = m = pivots.
    # d(vi, pj) denotes a distance computed by Dijkstra's algorithm in G:
    #
    #         p1         p2         p3    ...    pm
    #   v1  d(v1, p1)  d(v1, p2)  d(v1, p3)    d(v1, pm)
    #   v2    .
    #   ..    .
    #   vn  d(vn, p1)             ...          d(vn, pm)
    self.points = list(
        map(lambda i: tuple(self.distance_matrix[i - 1, p - 1]
                            for p in self.pivot_nodes),
            self.node_range))

    if self.normalization is True:
        self.points = self.nomalization_distance_mtrix(
            distance_matrix=self.points)  # normalized self.points

    logging.info('project into a low dimension using PCA')
    # PCA: input array-like of shape (n_samples, n_features) = self.points;
    # output array-like of shape (n_samples, n_components) = self.transformed_points
    if self.version == "HDE-SV":
        if self.dimension == 2:
            self.transformed_points = np.array(self.points)
    if self.version == "HDE":
        # HDE uses PCA to decompose the original space.
        pca = PCA(n_components=2, copy=True)
        self.transformed_points = pca.fit_transform(self.points)
    if self.version == "HDE-Level":
        pca = PCA(n_components=3, copy=True)
        self.transformed_points = pca.fit_transform(self.points)
        pca = PCA(n_components=2, copy=True)
        self.transformed_points = pca.fit_transform(self.transformed_points)
    # replaces the initial version as in the paper; by mty 2017-8-9
    if self.version == "HDE-PIT":
        # PIT: use power iteration to compute eigenvectors for the decomposition space.
        X, S = self.covariance(self.points)
        # X = np.array(self.points).T
        # X = X.astype(float)
        U = self.poweriteration(S, epsilon=self.epsilon)
        self.transformed_points = self.decomposition_space(X, U)
        if self.node_count == (self.edge_count + 1):  # determine whether it is a tree
            FR = FR_Algorithm(number_of_nodes=self.node_count,
                              initial_temperature=self.initial_temperature,
                              cooling_factor=self.cooling_factor,
                              factor_attract=self.factor_attract,
                              factor_repulsion=self.factor_repulsion)
            # use FR to fine-tune
            self.transformed_points = FR.apply_force_directed_algorithm(
                iteration=self.fr_iteration, graph=self.graph,
                coord_decomposition=self.transformed_points)
    if self.version == "HDE-MDS":
        # HDE-MDS combines HDE with MDS.
        hde_mds = MDS()  # MDS object
        self.transformed_points = hde_mds.fit_transform(self.points)
    if self.version == "Pivot-MDS":
        # the original version of Pivot MDS.
        pivot_mds = PivotMDS(d=self.distance_matrix, pivots=self.dimension)  # PivotMDS object
        self.transformed_points = pivot_mds.optimize()
    if self.version == "HDE-FICA":
        # FICA uses FastICA to decompose the original space.
        # fun could be either 'logcosh', 'exp', or 'cube'.
        fica = FastICA(n_components=2)
        # print(np.array(self.points).shape)
        self.transformed_points = fica.fit_transform(self.points)
        # print(np.array(self.transformed_points).shape)
        # an FR fine-tuning pass was here, commented out in the original
    if self.version == "HDE-KPCA":
        # KPCA uses kernel PCA to decompose the original space.
        kpca = KernelPCA(n_components=2, kernel=self.kpca_fun, gamma=self.gamma)
        self.transformed_points = kpca.fit_transform(self.points)
    if self.version == "HDE-NMF":
        nmf = NMF(n_components=2)
        self.transformed_points = nmf.fit_transform(self.points)
    if self.version == "HDE-TruncatedSVD":
        tsvd = TruncatedSVD(n_components=2)
        self.transformed_points = tsvd.fit_transform(self.points)
    if self.version == "HDE-LDA":
        lda = LinearDiscriminantAnalysis(n_components=2)
        y = np.array([1 for _ in range(self.node_count)])
        lda = lda.fit(self.points, y=y)
        self.transformed_points = lda.transform(self.points)
    if self.version == "HDE-FR":
        pca = PCA(n_components=2, copy=True)
        self.transformed_points = pca.fit_transform(self.points)
        if self.node_count == (self.edge_count + 1):  # determine whether it is a tree
            FR = FR_Algorithm(number_of_nodes=self.node_count,
                              initial_temperature=self.initial_temperature,
                              cooling_factor=self.cooling_factor,
                              factor_attract=self.factor_attract,
                              factor_repulsion=self.factor_repulsion)
            # use FR to fine-tune
            self.transformed_points = FR.apply_force_directed_algorithm(
                iteration=self.fr_iteration, graph=self.graph,
                coord_decomposition=self.transformed_points)
    if self.version == "HDE-FICA-FR":
        fica = FastICA(n_components=2)
        self.transformed_points = fica.fit_transform(self.points)
        if self.node_count == (self.edge_count + 1):  # determine whether it is a tree
            FR = FR_Algorithm(number_of_nodes=self.node_count,
                              initial_temperature=self.initial_temperature,
                              cooling_factor=self.cooling_factor,
                              factor_attract=self.factor_attract,
                              factor_repulsion=self.factor_repulsion)
            # use FR to fine-tune
            self.transformed_points = FR.apply_force_directed_algorithm(
                iteration=self.fr_iteration, graph=self.graph,
                coord_decomposition=self.transformed_points)
    if self.version == "HDE-TSNE-FR":
        # pca = PCA(n_components=10, copy=True)
        # self.transformed_points = pca.fit_transform(self.points)
        tsne = TSNE(learning_rate=self.learning_rate,
                    init=self.init)  # 'init' must be 'pca', 'random', or a numpy array
        self.transformed_points = tsne.fit_transform(self.points)
        if self.node_count == (self.edge_count + 1):  # determine whether it is a tree
            FR = FR_Algorithm(number_of_nodes=self.node_count,
                              initial_temperature=self.initial_temperature,
                              cooling_factor=self.cooling_factor,
                              factor_attract=self.factor_attract,
                              factor_repulsion=self.factor_repulsion)
            # use FR to fine-tune
            self.transformed_points = FR.apply_force_directed_algorithm(
                iteration=self.fr_iteration, graph=self.graph,
                coord_decomposition=self.transformed_points)
    if self.version == "HDE-SPE":
        IP = SpectralEmbedding(n_components=2)
        self.transformed_points = IP.fit_transform(self.distance_matrix)
        # pca = PCA(n_components=2, copy=True)
        # self.transformed_points = pca.fit_transform(self.transformed_points)

    return self.node_count, self.edge_count
y = clusters  # predicted labels
error_analysis = contingency_matrix(x, y)

# *************************** plot ***************************
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(X)

import matplotlib.pyplot as plt
from sklearn.manifold import MDS

# two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we will also specify `random_state` so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)  # shape (n_samples, n_components)
xs, ys = pos[:, 0], pos[:, 1]

# set up colors per cluster using a dict
cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3'}
# set up cluster names using a dict
cluster_names = {0: 'first book', 1: 'third book', 2: 'second book'}

# create data frame that has the result of the MDS plus the cluster numbers and titles
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters))
def apply_pca_analysis(df, params):
    '''df is held_units dataframe grouped by exp_name, exp_group, time_group;
    only contains units held over preCTA or postCTA, no units held from
    ctaTrain to ctaTest

    Parameters
    ----------

    Returns
    -------

    Raises
    ------
    '''
    bin_size = params['pca']['win_size']
    bin_step = params['pca']['step_size']
    time_start = params['pca']['time_win'][0]
    time_end = params['pca']['time_win'][1]
    smoothing = params['pca']['smoothing_win']
    n_cells = len(df)
    rd1 = df['rec1'].unique()
    rd2 = df['rec2'].unique()
    if len(rd1) > 1 or len(rd2) > 1:
        raise ValueError('Too many recording directories')

    rd1 = rd1[0]
    rd2 = rd2[0]
    units1 = list(df['unit1'].unique())
    units2 = list(df['unit2'].unique())
    dim1 = load_dataset(rd1).dig_in_mapping.set_index('channel')
    dim2 = load_dataset(rd2).dig_in_mapping.set_index('channel')
    if n_cells < 2:
        # No point if only 1 unit
        exp_name = os.path.basename(rd1).split('_')
        print('%s - %s: Not enough units for PCA analysis'
              % (exp_name[0], exp_name[-3]))
        return

    time, sa = h5io.get_spike_data(rd1, units1)
    fr_t, fr, fr_lbls = get_pca_data(rd1, units1, bin_size, step=bin_step,
                                     t_start=time_start, t_end=time_end)
    rates = fr
    labels = fr_lbls
    time = fr_t
    # Again with rec2
    fr_t, fr, fr_lbls = get_pca_data(rd2, units2, bin_size, step=bin_step,
                                     t_start=time_start, t_end=time_end)
    rates = np.vstack([rates, fr])
    labels = np.vstack([labels, fr_lbls])

    # So now rates is tastes*trial*times X units
    # Do PCA on all data, put in (trials*time) x cells 2D matrix
    # pca = MDS(n_components=2)
    pca = PCA(n_components=2)
    pc_values = pca.fit_transform(rates)
    mds = MDS(n_components=2)
    md_values = mds.fit_transform(rates)

    out_df = pd.DataFrame(labels, columns=['taste', 'trial', 'time'])
    out_df['n_cells'] = n_cells
    out_df[['PC1', 'PC2']] = pd.DataFrame(pc_values)
    out_df[['MDS1', 'MDS2']] = pd.DataFrame(md_values)

    # Compute the MDS distance metric using the full dimensional solution.
    # For each point compute distance to mean Quinine / distance to mean NaCl
    mds = MDS(n_components=rates.shape[1])
    mds_values = mds.fit_transform(rates)
    n_idx = np.where(labels[:, 0] == 'NaCl')[0]
    q_idx = np.where(labels[:, 0] == 'Quinine')[0]
    q_mean = np.mean(mds_values[q_idx, :], axis=0)
    n_mean = np.mean(mds_values[n_idx, :], axis=0)
    dist_metric = [euclidean(x, q_mean) / euclidean(x, n_mean)
                   for x in mds_values]
    assert len(dist_metric) == rates.shape[0], \
        'computed distances over wrong axis'
    out_df['dQ_v_dN_fullMDS'] = pd.DataFrame(dist_metric)

    # Do it again with raw rates
    q_mean = np.mean(rates[q_idx, :], axis=0)
    n_mean = np.mean(rates[n_idx, :], axis=0)
    raw_metric = [euclidean(x, q_mean) / euclidean(x, n_mean) for x in rates]
    assert len(raw_metric) == rates.shape[0], \
        'computed distances over wrong axis'
    out_df['dQ_v_dN_rawRates'] = pd.DataFrame(raw_metric)
    return out_df
def project_face_space(self, kind='custom', **kwargs):
    ''' Create a leave-one-out space for each target stimulus '''
    if kind == 'custom':
        # creating a target face space with all target faces
        target_all_space = self._cmdscale(self.rc_df)[0]  # face space for all recon faces
        target_all_space_df = pd.DataFrame(
            target_all_space[:, :self.dims_n], index=list(self.rc_names))
        for name in self.rc_names:  # looping over all target faces
            # computing training and target conf df without loo name
            train_conf_df = self.tr_df.drop(columns=name, index=name).copy()
            # names without loo name
            train_names = train_conf_df.index
            # computing face spaces without loo
            target_space = target_all_space_df.drop(index=name)
            target_names = target_space.index
            train_space = pd.DataFrame(
                self._cmdscale(train_conf_df)[0][:, :self.dims_n],
                index=train_names)
            train_target_space = train_space.loc[target_names, :].values
            # computing procrustes projection using default code ?
            R, s = orthogonal_procrustes(train_target_space, target_space)  # ?
            train_space.loc[name] = np.dot(
                target_all_space_df.loc[name, :], R.T) * s
            # computing procrustes projection using Matlab-imported code
            # d, Z, tform = self.procrustes(target_space.values, train_target_space)
            # train_space.loc[name] = tform['scale'] * np.dot(target_all_space_df.loc[name, :], tform['rotation']) + tform['translation']
            # sorting and assigning to the dictionary
            train_space = train_space.sort_index(axis=0)
            self.loo_dict[name] = train_space
    else:
        mds_scale = MDS(n_components=self.dims_n, dissimilarity='precomputed',
                        n_init=10, max_iter=1000)
        all_rc = mds_scale.fit_transform(self.rc_df.values)
        all_rc = pd.DataFrame(all_rc[:, :self.dims_n],
                              index=list(self.rc_names))
        for name in self.rc_names:  # looping over all recon faces
            temp_rc_names = set(self.rc_names).difference(
                {name})  # excluding leave-one-out face name
            temp_tr = self.tr_df.drop(
                columns=name, index=name)  # excluding loo row and column in training data
            temp_tr_names = temp_tr.index  # names left for training data
            temp_tr = pd.DataFrame(
                mds_scale.fit_transform(temp_tr),
                index=temp_tr_names)  # mds on training data
            temp_rc_tr = temp_tr.loc[
                temp_rc_names, :].values  # matching faces from training face space to recon
            temp_rc = self.rc_df.loc[
                temp_rc_names, temp_rc_names]  # choosing recon confusability matrix
            temp_rc = mds_scale.fit_transform(
                temp_rc)  # running mds on recon data
            R, s = orthogonal_procrustes(temp_rc_tr, temp_rc[:, :self.dims_n])
            temp_tr.loc[name] = np.dot(all_rc.loc[name, :], R) * s
            temp_tr = temp_tr.sort_index(axis=0)
            self.loo_dict[name] = temp_tr
    return self
import pickle_file
from models import hierarchy
from feature_extraction import doc2vec
from feature_extraction import tf_idf
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import MDS

books = pickle_file.load('books.pkl')
doc = books['contents'][1:].tolist()

# Use doc2vec to perform feature extraction
feature_vectors = pickle_file.load('doec2vec.pkl')

# Use tf-idf to extract the features and compute the dissimilarity
# via cosine similarity
vectorizer, corpus_matrix, feature_names = tf_idf.get_tfidf_model(doc)
distVectors = 1 - cosine_similarity(corpus_matrix)
mds = MDS(n_components=2000, random_state=1, dissimilarity="precomputed",
          metric=True)
bookFeatures = mds.fit_transform(distVectors)

# output using feature extractor: doc2vec
hierarchy.ward_cluster(6, feature_vectors, 'doc2vec')
# output using feature extractor: tf-idf
hierarchy.ward_cluster(6, bookFeatures, 'tf-idf')
# IU - International University of Applied Science
# Machine Learning - Unsupervised Machine Learning
# Course Code: DLBDSMLUSL01
# Multi-Dimensional Scaling (MDS)

#%% import libraries
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn.manifold import MDS
from sklearn.preprocessing import MinMaxScaler

#%% load the sample data set
iris = datasets.load_iris()
X = iris.data

#%% normalize the data
X_scaled = MinMaxScaler().fit_transform(X)

#%% conduct MDS on the data
mds = MDS(2, random_state=0)
X_2d = mds.fit_transform(X_scaled)

#%% plot the projected Iris data points into the reduced feature space by MDS
plt.scatter(x=X_2d[:, 0], y=X_2d[:, 1], c=iris.target)
plt.show()
# lle = LocallyLinearEmbedding(n_neighbors=5, n_components=3, eigen_solver='arpack', max_iter=3000)
# lle = LocallyLinearEmbedding(n_components=3, eigen_solver='arpack', geom=geom)
# out_coords = lle.fit_transform(masked_mean, input_type='adjacency')
# out_coords = lle.fit_transform(masked_mean)
# init = np.array([p.cm_pos for p in out_conf._nucleotides])

# Run multidimensional scaling on the average distances to find average positions
from sklearn.manifold import MDS
mds = MDS(n_components=3, metric=True, max_iter=3000, eps=1e-12,
          dissimilarity="precomputed", n_jobs=1, n_init=1)
out_coords = mds.fit_transform(masked_mean)  # , init=init)  # this one worked best

# Overwrite the system we made earlier with the coordinates calculated via MDS
for i, n in enumerate(output_system._nucleotides):
    n.cm_pos = out_coords[i]
    n._a1 = np.array([0, 0, 0])
    # since the orientation vectors are all 0, this cannot be used in a
    # simulation, but the viewer will handle it
    n._a3 = np.array([0, 0, 0])

# Write the mean structure out as a new .dat and .top pair
output_system.print_lorenzo_output("{}.dat".format(meanfile),
                                   "{}.top".format(meanfile))
print("INFO: wrote output files: {}.dat, {}.top".format(meanfile, meanfile),
      file=stderr)
fasta_sequences = SeqIO.parse(open('HW4.fas'), 'fasta')
for fasta in fasta_sequences:
    seqList.append(fasta.seq)

# initializing the hamming list of elements to zeros
hammList = np.zeros((len(seqList), len(seqList)))

# building the hamming distance matrix
# the shape of this matrix is 120 x 120
for i in range(len(seqList)):
    for j in range(len(seqList)):
        hammingDist = calcHammDist(seqList[i], seqList[j])
        hammList[i][j] = hammingDist

# performing multi-dimensional scaling on the hamming distance matrix
# n_components=2: scaling the dimensions to two
# dissimilarity='precomputed': tells MDS that the dissimilarities are already
# computed, in the form of pairwise hamming distances
embedding = MDS(n_components=2, dissimilarity='precomputed')
hamm_transformed = embedding.fit_transform(hammList)

# forming the x-coordinates and y-coordinates
x = hamm_transformed[:, 0]
y = hamm_transformed[:, 1]

# performing the visualization
scatterPlotVisualizer(x, y)

# write MDS data to a CSV file
writeMDSDataToCSV(hamm_transformed)
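# The script above assumes a calcHammDist helper defined elsewhere; a plausible
# sketch for equal-length sequences (an assumption, not the original code):
def calcHammDist(seq_a, seq_b):
    # number of positions at which two equal-length sequences differ
    if len(seq_a) != len(seq_b):
        raise ValueError('sequences must be the same length')
    return sum(a != b for a, b in zip(seq_a, seq_b))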
def plot_clusters(num_clusters, feature_matrix, cluster_data, movie_data,
                  plot_size=(16, 8)):
    # generate random color for clusters
    def generate_random_color():
        color = '#%06x' % random.randint(0, 0xFFFFFF)
        return color
    # define markers for clusters
    markers = ['o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd']
    # build cosine distance matrix
    cosine_distance = 1 - cosine_similarity(feature_matrix)
    # dimensionality reduction using MDS
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    # get coordinates of clusters in new low-dimensional space
    plot_positions = mds.fit_transform(cosine_distance)
    x_pos, y_pos = plot_positions[:, 0], plot_positions[:, 1]
    # build cluster plotting data
    cluster_color_map = {}
    cluster_name_map = {}
    for cluster_num, cluster_details in cluster_data.items():
        # assign cluster features to unique label
        cluster_color_map[cluster_num] = generate_random_color()
        cluster_name_map[cluster_num] = ', '.join(
            cluster_details['key_features'][:5]).strip()
    # map each unique cluster label with its coordinates and movies
    cluster_plot_frame = pd.DataFrame({
        'x': x_pos,
        'y': y_pos,
        'label': movie_data['Cluster'].values.tolist(),
        'title': movie_data['Title'].values.tolist()
    })
    grouped_plot_frame = cluster_plot_frame.groupby('label')
    # set plot figure size and axes
    fig, ax = plt.subplots(figsize=plot_size)
    ax.margins(0.05)
    # plot each cluster using co-ordinates and movie titles
    for cluster_num, cluster_frame in grouped_plot_frame:
        marker = markers[cluster_num] if cluster_num < len(markers) \
            else np.random.choice(markers, size=1)[0]
        ax.plot(cluster_frame['x'], cluster_frame['y'], marker=marker,
                linestyle='', ms=12, label=cluster_name_map[cluster_num],
                color=cluster_color_map[cluster_num], mec='none')
    ax.set_aspect('auto')
    ax.tick_params(axis='x', which='both', bottom=False, top=False,
                   labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False, right=False,
                   labelleft=False)
    fontP = FontProperties()
    fontP.set_size('small')
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.01), fancybox=True,
              shadow=True, ncol=5, numpoints=1, prop=fontP)
    # add labels as the film titles
    for index in range(len(cluster_plot_frame)):
        ax.text(cluster_plot_frame.iloc[index]['x'],
                cluster_plot_frame.iloc[index]['y'],
                cluster_plot_frame.iloc[index]['title'], size=8)
    plt.savefig('clusters_requirementUC.png', dpi=200)
    # show the plot
    plt.show()
from dirty_cat import SimilarityEncoder

similarity_encoder = SimilarityEncoder(similarity='ngram')
transformed_values = similarity_encoder.fit_transform(
    sorted_values.reshape(-1, 1))

#########################################################################
# Plotting the new representation using multi-dimensional scaling
# ................................................................
#
# Let's now plot a couple points at random using a low-dimensional
# representation to get an intuition of what the similarity encoder is doing:
from sklearn.manifold import MDS

mds = MDS(dissimilarity='precomputed', n_init=10, random_state=42)
# transformed values lie in the 0-1 range, so 1 - transformed_value
# yields a positive dissimilarity matrix
two_dim_data = mds.fit_transform(1 - transformed_values)
print(two_dim_data.shape)
print(sorted_values.shape)

#########################################################################
# We first quickly fit a KNN so that the plots do not get too busy:
import numpy as np

n_points = 5
np.random.seed(42)

from sklearn.neighbors import NearestNeighbors

random_points = np.random.choice(len(similarity_encoder.categories_[0]),
                                 n_points, replace=False)
))
fig.update_layout(title="heat map", xaxis_nticks=36)
fig.show()

# parallel coordinates
fig = px.parallel_coordinates(
    csv_mydata,
    color_continuous_scale=px.colors.diverging.Tealrose,
    color_continuous_midpoint=2)
fig.show()

# get PCA and MDS data; fit_transform fits and transforms in one call,
# so the separate fit() calls in the original were redundant
mds = MDS(n_components=2)
mds_data = mds.fit_transform(data)

pca = PCA(n_components=2)
pca_data = pca.fit_transform(data)

# PCA
csv_mydata['pca_x'] = np.array(pca_data).T[0]
csv_mydata['pca_y'] = np.array(pca_data).T[1]
fig = px.scatter(csv_mydata, x="pca_x", y="pca_y", color="year",
                 title="PCA chart")
fig.show()

# MDS
X95 = pca.fit_transform(X)
pca.n_components_

#%% PCA Inverse Transform
Xrestore = pca.inverse_transform(X95)
plt.plot(Xrestore[0], X[0], 'ro')

#%% Incremental PCA
X_mm = np.memmap('X.pkl', shape=(32567, 472))
from sklearn.decomposition import IncrementalPCA
inc_pca = IncrementalPCA(n_components=100, batch_size=1000)
inc_pca.fit(X_mm)

#%% MDS, Isomap, and t-SNE
from sklearn.manifold import MDS, Isomap, TSNE

mds = MDS(n_components=2)
Xmds = mds.fit_transform(X[:500, :200])
Axes3D(plt.figure()).scatter(Xmds[:, 0], Xmds[:, 1], alpha=.3)

#%% Isomap
iso = Isomap(n_components=2)
Xiso = iso.fit_transform(X[:500, :200])
Axes3D(plt.figure()).scatter(Xiso[:, 0], Xiso[:, 1], alpha=.3)

#%% t-SNE
tsne = TSNE(n_components=2, n_iter=250)
Xtsne = tsne.fit_transform(X[:500, :200])
Axes3D(plt.figure()).scatter(Xtsne[:, 0], Xtsne[:, 1], alpha=.3)

#%% PC Regression
lin_reg = LinearRegression()
scores = cross_val_score(lin_reg, X95[:, :10], Y)
scores.mean()
def calculate_and_cluster():
    global names
    global data_list
    global data_tag_map
    global matrix_list
    global data_tagged_list
    data_list = {}
    data_tag_map = {}
    data_tagged_list = {}
    matrix_list = []
    counter = 0
    # Parse the CSV file (this will be denoted by a string variable)
    with open('../../../data/sets/complete_set.csv', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            data_list[counter] = ''.join(row)
            counter += 1
    # Loop through data in range
    for data in range(0, len(data_list)):
        # Split the last token in the string
        split = data_list[data].split(" ")[-1:]
        # print(split[0], "Tag set: ", get_tag_set(split[0]))
        data_tag_map[split[0]] = get_tag_set(split[0])
    od = OrderedDict(sorted(data_tag_map.items()))
    names = []
    counter = 0
    for key, value in od.items():
        # Maintain old file name
        file_old = str(counter) + '.txt'
        tag = ''
        if len(value) == 1:
            tag = 'Tagged'
            names.append(str(counter) + "_" + tag)
            data_tagged_list[str(counter)] = True
        else:
            tag = 'Untagged'
            names.append(str(counter) + "_" + tag)
            data_tagged_list[str(counter)] = False
        # Create new file name with tagged / untagged appended
        file_new = str(counter) + '_' + tag + '.txt'
        # Rename the file for later use in colour co-ordination
        rename_file(file_old, file_new)
        counter += 1
    dataNodes = []
    for x in range(0, len(data_list)):
        dataNodes.append(data_list[x])
    # Generate matrix from file
    X = genfromtxt('matrix.csv', delimiter=',')
    # Symmetrize X to ensure the matrix is valid
    X = symmetrize(X)
    # Put matrix in a list for checking
    matrix_list = X.tolist()
    for x in range(0, len(matrix_list)):
        tagged = get_tagged(str(x))
        if not tagged:
            tag_nearest_neighbour(x)
    # Check symmetry
    print("Symmetric? " + str((X.transpose() == X).all()))
    # n_components: plotting points in a two-dimensional plane
    # dissimilarity: "precomputed" because of the distance matrix
    # random_state is fixed so we can reproduce the plot
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(X.astype(np.float64))  # shape (n_samples, n_components)
    xs, ys = pos[:, 0], pos[:, 1]
    # Set figure size to have dimensions of at least 15 inches for the width.
    # Height can be scaled accordingly.
    plt.figure(figsize=(15, 8))
    plt.subplot(211)
    # Loop through the points, label appropriately and scatter.
    # Ensure the figure size has enough room for legend plotting. Each plot
    # must have a label; in this case the split value denoting the POI tag.
    for x, y, name in zip(xs, ys, names):
        plt.scatter(x, y, s=100, c=get_colour_tag(name.split('_', 1)[1]),
                    label=name.split('_', 1)[1])
        # plt.text(x, y, name.split('_', 1)[0])
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = OrderedDict(zip(labels, handles))
    plt.legend(by_label.values(), by_label.keys(), loc='lower center',
               ncol=4, bbox_to_anchor=(0.5, -0.6))
    plt.show()
    # Create a dendrogram
    linkage_matrix = ward(X)
    # match dendrogram to that returned by R's hclust()
    dendrogram(linkage_matrix, orientation="right")
    plt.tight_layout()
    plt.show()
print('Explained Variance', pca.explained_variance_ratio_[:2])
# Haven't yet put this info on the axes like biplot() does!
plt.show()

## COSINE SIMILARITY/MDS GRAPH
# biplot modded for MDS with Cosine Similarity

# MDS, naturally
mds = MDS(n_components=2)
new_array = np.concatenate((bag_array, grc_array), axis=0)  # incl. IG vector with suspects'
distances = 1 - cosine_similarity(new_array)

# Scales Cosine Distances into 2-D space
new_mds = mds.fit_transform(new_array)

# how we'll distinguish IG from the suspects
mds_markers = ['x'] * len(labels) + ['o']
mds_labels = labels[:]
mds_labels.append(labels[-1] + 1)

# coordinates of 2-D scaled slice vectors
xs = new_mds[:, 0]
ys = new_mds[:, 1]

# plotting points
plt.figure(1, figsize=(10, 10), dpi=200)
for i in range(len(xs)):
    plt.plot(xs[i], ys[i], marker=mds_markers[i])
    # assumed tail: the original call was truncated after ys[i]; the marker
    # argument follows the mds_markers list defined above
def k_means_cluster(self, num_clusters=8):
    """
    KMeans cluster at the document level.

    Separates samples into n groups of equal variance by within-cluster
    sum-of-squares. The square is taken of the tfidf matrix row entries,
    then summed for distance to each centroid. Clusters are initialized
    semi-randomly (initial clusters far from each other).
    """
    tfidf_vectorizer = TfidfVectorizer(max_df=0.50, max_features=200000,
                                       min_df=0.02, stop_words=STOPWORDS,
                                       use_idf=True, ngram_range=(1, 2))
    # fit the vectorizer to synopses
    tfidf_matrix = tfidf_vectorizer.fit_transform(
        [' . '.join(doc) for doc in self.corpus.tokenized])
    dist = 1 - cosine_similarity(tfidf_matrix)
    km = KMeans(n_clusters=num_clusters, max_iter=300, init='k-means++')
    km = km.fit(tfidf_matrix)
    terms = tfidf_vectorizer.get_feature_names()
    print("Top terms per cluster:")
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    for cluster in range(num_clusters):
        print("Cluster {0} words: {1}".format(
            cluster + 1,
            ' | '.join([terms[ind] for ind in order_centroids[cluster, :3]])))
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    positions = mds.fit_transform(dist)
    xs, ys = positions[:, 0], positions[:, 1]
    labels = km.labels_.tolist()
    if self.dirname:
        titles = get_file_list(self.dirname)
        titles = [title[-8:-4] for title in titles]  # KATHY ONLY: get year
    else:
        titles = [' ' + str(int(label)) for label in labels]
    df = pd.DataFrame(dict(x=xs, y=ys, label=labels, title=titles))
    # group by cluster
    groups = df.groupby('label')
    # set up plot
    fig, ax = plt.subplots(figsize=(17, 9))  # set size
    ax.margins(0.05)  # Optional, just adds 5% padding to the autoscaling
    # iterate through groups to layer the plot
    for name, group in groups:
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=8,
                label='Cluster ' + str(name), mec='none')
    ax.set_aspect('auto')
    ax.tick_params(
        axis='x',           # changes apply to the x-axis
        which='both',       # both major and minor ticks are affected
        bottom=False,       # ticks along the bottom edge are off
        top=False,          # ticks along the top edge are off
        labelbottom=False)
    ax.tick_params(
        axis='y',           # changes apply to the y-axis
        which='both',       # both major and minor ticks are affected
        left=False,         # ticks along the left edge are off
        top=False,          # ticks along the top edge are off
        labelleft=False)
    ax.legend(numpoints=1)
    # add label in x,y position with the label as the film title
    for i in range(len(df)):
        ax.text(df.iloc[i]['x'], df.iloc[i]['y'], df.iloc[i]['title'], size=8)
    plt.show()
def MDSEmbedding(dimension_factor, distance_matrix):
    embedding = MDS(n_components=dimension_factor, dissimilarity='precomputed',
                    metric=True, random_state=42)
    MDS_fix_matrix = embedding.fit_transform(distance_matrix)
    return MDS_fix_matrix
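# Quick usage sketch for MDSEmbedding above with a toy precomputed distance
# matrix (toy data, not from the original project):
import numpy as np
from scipy.spatial.distance import pdist, squareform

_D = squareform(pdist(np.random.rand(10, 6)))
_emb = MDSEmbedding(2, _D)   # 10 points embedded in 2 dimensions
print(_emb.shape)            # (10, 2)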
import pickle as pk
from sklearn.manifold import MDS
import numpy as np
import matplotlib.pyplot as plt

newspaper = 'lanacion'
number_of_topics = 62

notes = [pk.load(open('Data03-05/{}_topic{}_vect.pk'.format(newspaper, i), 'rb'))
         for i in range(number_of_topics)]

diss_matrix = np.array([[np.abs(np.log(notes[i].dot(notes[j])))
                         for i in range(number_of_topics)]
                        for j in range(number_of_topics)])

mds = MDS(n_components=2, dissimilarity='precomputed')
x_mds = mds.fit_transform(diss_matrix)

plt.scatter(x_mds[:, 0], x_mds[:, 1], alpha=0.25, s=100)
for i in range(number_of_topics):
    plt.text(x_mds[i, 0], x_mds[i, 1], str(i))
plt.grid('on')
plt.show()
data = []
lang_data = []
gender_data = []
for k in utt_list:
    data.append(utt2data[k])
    lang_data.append(utt2lang[k])
    if args.utt2gender:
        gender_data.append(utt2gender[k])

data = np.matrix(data)
lang_data = np.array(lang_data)
if args.utt2gender:
    gender_data = np.array(gender_data)

embedding = MDS(n_components=2)
data_transformed = embedding.fit_transform(data)

# only include the gender column when it was actually collected, otherwise the
# DataFrame columns would have mismatched lengths
if not args.utt2gender:
    df = pd.DataFrame(dict(x=data_transformed[:, 0],
                           y=data_transformed[:, 1],
                           label=lang_data))
    groups = df.groupby('label')
else:
    df = pd.DataFrame(dict(x=data_transformed[:, 0],
                           y=data_transformed[:, 1],
                           label=lang_data,
                           gender=gender_data))
    groups = df.groupby(['label', 'gender'])

fig, ax = plt.subplots()
ax.margins(0.05)  # Optional, just adds 5% padding to the autoscaling
dist_all_rot[ind1, ind2] = sp.linalg.norm(sub_data[:, :, ind1] - sub_data_rot)
print(ind1, ind2)

sp.savez('ADHD_pairwise_dist.npz', dist_all_rot=dist_all_rot,
         dist_all_orig=dist_all_orig, normSub=normSub)

######
#%%
a = sp.load('ADHD_pairwise_dist.npz')
lst = a['lst']
q = sp.argmin(a['dist_all_rot'][:-1, :-1].sum(1))
print('The representative subject is: %s ' % lst[q])

m = MDS(n_components=2, dissimilarity='precomputed')
e = m.fit_transform(a['dist_all_rot'])
print(e)

fig, ax = plt.subplots()
ax.scatter(e[:, 0], e[:, 1])
for i in range(e.shape[0]):
    ax.annotate(lst[i], (e[i, 0], e[i, 1]))

#%% Compute difference
diff = 0
q = 3
for ind in range(15):
    Y2, _ = brainSync(X=sub_data[:, :, q], Y=sub_data[:, :, ind])
    diff += (Y2 - sub_data[:, :, q]) ** 2
    print(ind, end=',')

spio.savemat('ADHD_norm_diff2sub1.mat', {'diff': diff})
plt.axis('off')
plt.title("LLE MNIST Scatter Plot", fontsize=14)
save_fig("LLE MNIST Scatter Plot")
plt.show()

# In[27]:

mds = MDS(n_components=2, random_state=42)

# In[28]:

x_test_redux_mds = mds.fit_transform(x_test[:1000])

# In[29]:

plt.figure(figsize=(12, 10))
plt.scatter(x_test_redux_mds[:, 0], x_test_redux_mds[:, 1],
            c=y_test[:1000], cmap="gist_rainbow")
plt.colorbar()
plt.axis('off')
plt.title("MDS MNIST Scatter Plot", fontsize=14)
save_fig("MDS MNIST Scatter Plot")
plt.show()

# In[ ]:
class xRepresentation:
    """ Perform dimensionality reduction and represent 2d matrices """

    def __init__(self, **kwargs):
        """ input parameters """
        self.__n_clusters = kwargs.get('n_clusters', 2)
        self.__pca_components = kwargs.get('pca_components', 3)
        self.__tsne_components = kwargs.get('tsne_components', 3)
        self.__components = kwargs.get('components', 2)
        self.__neighbors = kwargs.get('neighbors', 2)  # 10
        self.__lsa_normalization = False
        # sparse matrix
        self.__matrix = None
        # tf-idf dataframe
        self.__tfidf_dataframe = pd.DataFrame()
        self.__corpora_names = None
        self.__feature_names = None
        #
        self.__clusters = xClusters(n_clusters=self.__n_clusters)
        # angular similarity
        self.__tfidf_angle_similarity_dataframe = pd.DataFrame()
        # pca
        self.__pca = PCA(n_components=self.__pca_components)
        # svd/lsa
        self.__svd = TruncatedSVD(n_components=self.__components,
                                  n_iter=7,
                                  random_state=33)
        # t-sne
        self.__tsne = TSNE(n_components=self.__components)
        # MDS
        self.__mds = MDS(n_components=self.__components, random_state=33)
        # manifold isomap for non-linear dimension reduction
        self.__isomap = Isomap(n_components=self.__components,
                               n_neighbors=self.__neighbors,
                               eigen_solver='auto')
        # cluster centers
        self.__kmeans_cluster_centers = None
        # normalizer
        self.__normalizer = Normalizer(copy=False)

    @property
    def clusters(self):
        """ returns an xClusters class object """
        return self.__clusters

    @property
    def matrix(self) -> np.matrix:
        return self.__matrix

    @matrix.setter
    def matrix(self, matrix: csr_matrix):
        if matrix.size:
            self.__matrix = matrix.todense()

    @property
    def tfidf_dataframe(self) -> pd.DataFrame:
        return self.__tfidf_dataframe

    @tfidf_dataframe.setter
    def tfidf_dataframe(self, df: pd.DataFrame = None):
        self.__tfidf_dataframe = df

    @property
    def corpora_names(self) -> List[str]:
        return self.__corpora_names

    @corpora_names.setter
    def corpora_names(self, names=None):
        if names:
            self.__corpora_names = names

    @property
    def feature_names(self) -> List[str]:
        return self.__feature_names

    @feature_names.setter
    def feature_names(self, names=None):
        if names:
            self.__feature_names = names

    @property
    def tfidf_angle_similarity_dataframe(self):
        return self.__tfidf_angle_similarity_dataframe

    @tfidf_angle_similarity_dataframe.setter
    def tfidf_angle_similarity_dataframe(self, df: pd.DataFrame = None):
        self.__tfidf_angle_similarity_dataframe = df

    def __normalize_matrix(self):
        transformer = self.__normalizer.fit(self.__matrix)  # fit does nothing
        self.__matrix = transformer.transform(self.__matrix)

    def __fit_pca(self):
        # Fit the model with X.
        pca = self.__pca.fit(X=self.__matrix)
        # Apply dimensionality reduction to X.
        data2d = pca.transform(X=self.__matrix)
        # pca shape
        print("PCA data shape", data2d.shape)
        # project the cluster centers onto the reduced data
        cluster_centers2d = pca.transform(self.__clusters.kmeans_cluster_centers)
        # number of components
        n_components = self.__pca.components_.shape[0]
        print("Number of PCA components", n_components)
        # cross-check
        if n_components != data2d.shape[1]:
            print(f'inconsistent PCA components {n_components} '
                  f'and 2d data columns {data2d.shape[1]}')
            return False
        y = self.__clusters.kmeans_clusters_pred
        pairs = list(range(0, n_components))
        for i, j in zip(pairs, pairs[1:] + pairs[:1]):
            print("PCA components", i, j)
            # plot
            fig = plt.figure(figsize=(10, 7))
            ax = fig.add_subplot(111)
            ax.set_title('PCA')
            for icluster in range(self.__clusters.n_actual_clusters):
                print(f"Cluster {icluster}/{self.__clusters.n_actual_clusters}")
                ax.scatter(np.array(data2d[y == icluster, i]),
                           np.array(data2d[y == icluster, j]),
                           s=100,
                           alpha=0.5,
                           label=f"Cluster {icluster}")
            plt.scatter(cluster_centers2d[:, i],
                        cluster_centers2d[:, j],
                        marker='x',
                        s=200,
                        linewidths=3,
                        c='r',
                        label='Centroids')
            ax.set_xlabel(f"component {i}")
            ax.set_ylabel(f"component {j}")
            ax.legend(loc="best")
            plt.show()

    def __fit_lsa(self):
        # LSA/SVD results are not normalized;
        # normalization might be needed
        if self.__lsa_normalization:
            lsa = make_pipeline(self.__svd, self.__normalizer)
            lsa_data2d = lsa.fit_transform(X=self.__matrix)
        else:
            # Fit the model with X.
            lsa = self.__svd.fit(X=self.__matrix)
            # Apply dimensionality reduction to X.
            lsa_data2d = lsa.transform(X=self.__matrix)
        # project the cluster centers onto the reduced data
        lsa_cluster_centers2d = lsa.transform(
            self.__clusters.kmeans_cluster_centers)
        # plot
        plt.scatter(lsa_data2d[:, 0],
                    lsa_data2d[:, 1],
                    c=self.__clusters.kmeans_clusters_pred)
        plt.scatter(lsa_cluster_centers2d[:, 0],
                    lsa_cluster_centers2d[:, 1],
                    marker='x',
                    s=200,
                    linewidths=3,
                    c='r')
        plt.show()

    def __fit_tsne(self):
        # Fit X into an embedded space and return the transformed output;
        # TSNE has no separate transform step.
        # Output: embedding of the training data in low-dimensional space.
        data2d = self.__tsne.fit_transform(X=self.__matrix)
        # embedding shape
        print("TSNE embedding shape", data2d.shape, self.__tsne.embedding_.shape)
        # the cluster centers cannot be projected: TSNE has no transform step
        # array-like, shape (n_samples, n_components)
        n_components = self.__tsne.embedding_.shape[1]
        print("Number of TSNE components", n_components)
        y = self.__clusters.kmeans_clusters_pred
        pairs = list(range(0, n_components))
        print(pairs)
        print(pairs[1:] + pairs[:1])
        for i, j in zip(pairs, pairs[1:] + pairs[:1]):
            print("TSNE components", i, j)
            # plot
            fig = plt.figure(figsize=(10, 7))
            ax = fig.add_subplot(111)
            ax.set_title('TSNE')
            for icluster in range(self.__clusters.n_actual_clusters):
                print(f"Cluster {icluster}/{self.__clusters.n_actual_clusters}")
                ax.scatter(np.array(data2d[y == icluster, i]),
                           np.array(data2d[y == icluster, j]),
                           s=100,
                           alpha=0.5,
                           label=f"Cluster {icluster}")
            ax.set_xlabel(f"component {i}")
            ax.set_ylabel(f"component {j}")
            ax.legend(loc="best")
            plt.show()

    def __fit_mds(self):
        # Fit X data
        mds_data2d = self.__mds.fit_transform(X=self.__matrix)
        # plot
        plt.scatter(mds_data2d[:, 0],
                    mds_data2d[:, 1],
                    c=self.__clusters.kmeans_clusters_pred,
                    cmap=plt.cm.Spectral)
        plt.show()

    def __fit_isomap(self):
        # Compute the embedding vectors for data X
        embed = self.__isomap.fit_transform(X=self.__matrix)
        # Semantic labeling of clusters could apply a label when a cluster's
        # max TF-IDF score falls in the 90% quantile of the whole corpus of
        # TF-IDF scores (left as pseudocode in the original).
        # Plot the dimension-reduced data
        plt.xlabel('reduced dimension-1')
        plt.ylabel('reduced dimension-2')
        for i in range(len(embed)):
            plt.scatter(embed[i][0], embed[i][1])
        plt.show()

    def __display_tfidf(self):
        print('TF-IDF matrix')
        print(self.__tfidf_dataframe)
        # aggregations and ascending order
        estimates = {'mean': False, 'sum': False, 'max': False}
        df = self.__tfidf_dataframe.agg(list(estimates.keys())).T
        df = df.sort_values(by=list(estimates.keys()),
                            ascending=list(estimates.values()))
        df.index.name = 'term'
        title = 'tf-idf'
        N = 20
        if N > 0:
            df = df.head(N)  # same as df[:N]
            title += f' ({N} top terms)'
        ax = df[list(estimates.keys())].plot(kind='bar',
                                             title=title,
                                             figsize=(15, 10),
                                             legend=True,
                                             fontsize=12)
        ax.set_xlabel("term", fontsize=12)
        ax.set_ylabel("score", fontsize=12)
        plt.xticks(rotation=30, ha='right')
        plt.show()

    def __display_angle_similarity(self):
        df = self.__tfidf_angle_similarity_dataframe
        mask = np.zeros_like(df, dtype=bool)
        mask[np.triu_indices_from(mask)] = True
        vmin = df.where(df > 0).min().min()
        vmax = df.max().max()
        # smoother, washed-out contours
        alpha = 0.10
        vmin_new = vmin * (1 - alpha)
        vmax_new = vmax * (1 + alpha)
        vmin = vmin_new
        vmax = vmax_new if vmax_new < 90. else 90
        print(f"Angle similarity min {vmin}, max {vmax}")
        fig = plt.figure(figsize=(10, 7))
        ax = fig.add_subplot(111)
        ax.set_title('TF-IDF document angle similarity')
        sns.heatmap(df,
                    mask=mask,
                    annot=True,
                    square=True,  # forces the aspect ratio of the blocks to be equal
                    fmt=".1f",
                    vmin=vmin,
                    vmax=vmax,
                    cmap="coolwarm",
                    annot_kws={'size': 10},
                    ax=ax)
        plt.show()

    def fit(self):
        """ user-callable method to invoke the fits """
        self.__display_tfidf()
        self.__display_angle_similarity()
        print(type(self.__feature_names))
        self.__clusters.fit(matrix=self.__matrix,
                            dataframe=self.__tfidf_dataframe,
                            corpora=self.__corpora_names,
                            features=self.__feature_names)
        self.__fit_pca()
        # self.__fit_lsa()
        self.__fit_tsne()
def run(self):
    with self.input()[0] as i:
        graphIndex, GR = i.query()
    with self.input()[1] as i:
        R = i.query()
    dis = np.ones(GR.shape, dtype=GR.dtype) - GR
    colors = ['grey', 'green', 'red']
    tName = ['UNKNOWN', 'IUV', 'ESBMC']
    aName = ['UNKNOWN', 'Tester', 'Verificator']
    lScore = np.zeros(len(GR))
    lTime = np.zeros(len(GR))
    for index, D in R.items():
        if index not in graphIndex:
            continue
        gI = graphIndex[index]
        score = self.__evalScore(D['score'])
        time = self.__evalTime(D['time_rank'])
        for i, t in enumerate(tName):
            if score == t:
                lScore[gI] = i
            if time == t:
                lTime[gI] = i
    mds = MDS(n_components=2, dissimilarity="precomputed", n_init=10)
    X_r = mds.fit_transform(dis)
    stress = mds.stress_
    plt.figure(1)
    plt.suptitle('MDS of GRAM dataset (h: %s, D: %s) [%s points] (Stress: %2.2f)'
                 % (str(self.h), str(self.D), str(len(X_r)), stress))
    plt.subplot(121)
    for color, i, t in zip(colors, range(len(aName)), aName):
        plt.scatter(X_r[lScore == i, 0], X_r[lScore == i, 1],
                    color=color, alpha=.8, lw=2, label=t)
    plt.legend(loc='best', shadow=False, scatterpoints=1)
    plt.subplot(122)
    for color, i, t in zip(colors, range(len(aName)), aName):
        plt.scatter(X_r[lTime == i, 0], X_r[lTime == i, 1],
                    color=color, alpha=.8, lw=2, label=t)
    plt.legend(loc='best', shadow=False, scatterpoints=1)
    path = self.output().path
    directory = dirname(path)
    if not os.path.exists(directory):
        os.makedirs(directory)
    plt.savefig(path)
    plt.close()
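# The stress_ reported above is sklearn's raw stress (sum of squared
# residuals between the input dissimilarities and the embedded distances),
# so its magnitude depends on the scale of the data. A sketch of one common
# normalization, Kruskal's stress-1, assuming `dis` is the precomputed
# dissimilarity matrix that was passed to fit_transform:
import numpy as np

def kruskal_stress_1(mds_model, dis):
    # values below roughly 0.1 are conventionally read as a fair fit
    return np.sqrt(mds_model.stress_ / (0.5 * np.sum(dis ** 2)))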
ax.set_xticks(range(len(classes)))
ax.set_xticklabels(labels, rotation=40, ha='left')
ax.axhline(11.5, color='k')
ax.axvline(11.5, color='k')
plt.colorbar(im)
plt.tight_layout()
plt.show()

##############################################################################
# Confusion matrices related to mental representations have historically been
# summarized with dimensionality reduction using multi-dimensional scaling [1].
# See how the face samples cluster together.
fig, ax = plt.subplots(1)
mds = MDS(2, random_state=0, dissimilarity='precomputed')
chance = 0.5
summary = mds.fit_transform(chance - confusion)
cmap = plt.get_cmap('rainbow')
colors = ['r', 'b']
names = list(conds['condition'].values)
for color, name in zip(colors, set(names)):
    sel = np.where([this_name == name for this_name in names])[0]
    size = 500 if name == 'human face' else 100
    ax.scatter(summary[sel, 0], summary[sel, 1], s=size,
               facecolors=color, label=name, edgecolors='k')
ax.axis('off')
ax.legend(loc='lower right', scatterpoints=1, ncol=2)
plt.tight_layout()
plt.show()

##############################################################################
# References
def calculate_mds(data, metric):
    # pairwise distances under the given metric, embedded in 2-D with MDS
    distance_matrix = SK_Metrics.pairwise_distances(data, metric=metric)
    mds = MDS(n_components=2, dissimilarity='precomputed')
    return mds.fit_transform(distance_matrix)
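# Hypothetical call to calculate_mds above; any metric accepted by
# sklearn's pairwise_distances (e.g. 'euclidean', 'cosine') should work.
# The random data is only for illustration.
coords = calculate_mds(np.random.rand(30, 8), 'cosine')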
# In[201]:

pivot_scaled = 1 / pivot
pivot_scaled = pivot_scaled.replace(np.nan, 0)
pivot_scaled.to_csv('lift_value_competitors.csv')

# In[203]:

from sklearn.manifold import MDS
import matplotlib.pyplot as plt

# In[204]:

mds = MDS(2, random_state=0, dissimilarity='precomputed')
lift_2D = mds.fit_transform(pivot_scaled)
plt.rcParams['figure.figsize'] = [8, 8]
plt.rc('font', size=17, weight='bold')
fig = plt.figure(figsize=(12, 10))
for i in np.unique(pivot_scaled.columns):
    subset = lift_2D[pivot_scaled.columns == i]
    x = [row[0] for row in subset]
    y = [row[1] for row in subset]
    plt.scatter(x, y, s=300)
def project(self,
            weights={},
            alpha=.5,
            verbose=False,
            delete_duplicates=False,
            method='tsne',
            implementation='openTSNE',
            condense_countries=False,
            **kwargs):
    w_dict = {
        'year': 1.,
        'x': 1.,
        'y': 1.,
        'size': 1.,
        'color': 1.,
        'countries': 1.
    }
    for w in weights:
        w_dict[w] = weights[w]
    weights = np.array(list(w_dict.values()), dtype=np.float32)
    year_span = self.year_span()
    num_countries = len(self.countries())

    if not condense_countries:

        @jit(nopython=True)
        def state_distance(a, b):
            a = a.astype(np.float32)
            b = b.astype(np.float32)
            if year_span == 0:
                year_dist = 0
            else:
                year_dist = np.abs(a[0] - b[0]) / year_span
            x_dist = 1 - np.dot(a[1:6], b[1:6])
            y_dist = 1 - np.dot(a[6:11], b[6:11])
            size_dist = 1 - np.dot(a[11:16], b[11:16])
            color_dist = 1 - np.dot(a[16:18], b[16:18])
            if num_countries == 0:
                country_dist = 0
            else:
                country_dist = np.linalg.norm(a[18:] - b[18:]) / np.sqrt(num_countries)
            dists = np.array([year_dist, x_dist, y_dist,
                              size_dist, color_dist, country_dist],
                             dtype=np.float32)
            return (dists * weights).sum()
    else:

        @jit(nopython=True)
        def state_distance(a, b):
            a = a.astype(np.float32)
            b = b.astype(np.float32)
            if year_span == 0:
                year_dist = 0
            else:
                year_dist = np.abs(a[0] - b[0]) / year_span
            x_dist = 1 - np.dot(a[1:6], b[1:6])
            y_dist = 1 - np.dot(a[6:11], b[6:11])
            size_dist = 1 - np.dot(a[11:16], b[11:16])
            color_dist = 1 - np.dot(a[16:18], b[16:18])
            if num_countries == 0:
                country_dist = 0
            else:
                lat1, long1 = a[18:20]
                lat2, long2 = b[18:20]
                lat1 = lat1 / 180. * 2 * np.pi
                lat2 = lat2 / 180. * 2 * np.pi
                delta_long = long1 - long2
                gcdist = np.sin(lat1) * np.sin(lat2)
                gcdist += np.cos(lat1) * np.cos(lat2) * np.cos(delta_long)
                gcdist = np.arccos(gcdist) / np.pi
                spread_dist = np.abs(a[20] - b[20])
                sel_dist = np.abs(a[21] - b[21]) / num_countries
                country_dist = (gcdist + spread_dist + sel_dist)
            dists = np.array([year_dist, x_dist, y_dist,
                              size_dist, color_dist, country_dist],
                             dtype=np.float32)
            return (dists * weights).sum()

    if delete_duplicates:
        # zero out any feature whose weight is 0 so duplicates can collapse
        encoded = self.encode()
        if w_dict['year'] == 0.:
            encoded[:, 0] = 0.
        if w_dict['x'] == 0.:
            encoded[:, 1:6] = 0.
        if w_dict['y'] == 0.:
            encoded[:, 6:11] = 0.
        if w_dict['size'] == 0.:
            encoded[:, 11:16] = 0.
        if w_dict['color'] == 0.:
            encoded[:, 16:18] = 0.
        if w_dict['countries'] == 0.:
            encoded[:, 18:] = 0.
        encoded, indices, counts = np.unique(encoded,
                                             axis=0,
                                             return_inverse=True,
                                             return_counts=True)
        self.counts = counts[indices]
    else:
        encoded = self.encode()
    if method == 'tsne':
        if implementation == 'openTSNE':
            if verbose:
                tsne = openTSNE(metric=state_distance,
                                verbose=verbose,
                                n_jobs=-1,
                                **kwargs)
            else:
                tsne = openTSNE(metric=state_distance, n_jobs=-1, **kwargs)
            embedding = np.array(tsne.fit(encoded))
        elif implementation == 'sklearn':
            tsne = sklearnTSNE(metric=state_distance,
                               verbose=3 if verbose else 0)
            embedding = np.array(tsne.fit_transform(encoded))
    elif method == 'mds':
        mds = MDS(n_components=2, metric=True, dissimilarity='precomputed')
        distmat = squareform(pdist(encoded, state_distance))
        embedding = mds.fit_transform(distmat)
    elif method == 'umap':
        umap = UMAP(metric=state_distance, verbose=verbose, **kwargs)
        embedding = np.array(umap.fit_transform(encoded))
    elif method == 'hybrid':
        if not delete_duplicates:
            raise Warning('Hybrid layout always deletes duplicates!')
        adj = nx.adjacency_matrix(self.make_graph()).toarray()
        # adjacency matrix of undirected multigraph
        intermediate = np.zeros_like(adj, dtype=int)
        for (i, j), item in np.ndenumerate(adj):
            intermediate[i, j] += item
            intermediate[j, i] += item
        # use inverse number of connections as weights
        edges = []
        edge_weights = []
        for (i, j), item in np.ndenumerate(intermediate):
            if item != 0 and i <= j:
                edges.append((i, j))
                edge_weights.append(item)
        # construct weighted graph and calculate path lengths
        g = nx.Graph()
        for e, w in zip(edges, edge_weights):
            g.add_edge(*e, weight=1 / w)
        path_lengths = dict(nx.all_pairs_dijkstra_path_length(g))
        # construct distmat from path_lengths
        graph_distmat = np.zeros_like(adj, dtype=float)
        graph_distmat -= np.inf
        for i in path_lengths:
            dists = path_lengths[i]
            for j in dists:
                graph_distmat[i, j] = dists[j]
        graph_distmat = graph_distmat / graph_distmat.max()
        graph_distmat[graph_distmat == -np.inf] = 2.
        attr_distmat = squareform(
            pdist(np.unique(self.encode().astype(np.float32), axis=0),
                  metric=state_distance))
        init = 'random'
        if hasattr(alpha, '__iter__'):
            self.hybrid_alphas = alpha
            embedding = []
            for a in alpha:
                distmat = (1 - a) * graph_distmat + a * attr_distmat
                tsne = openTSNE(metric='precomputed', initialization=init)
                embedding.append(tsne.fit(distmat))
                init = embedding[-1]
        else:
            distmat = (1 - alpha) * graph_distmat + alpha * attr_distmat
            tsne = openTSNE(metric='precomputed', initialization=init)
            embedding = tsne.fit(distmat)
    if delete_duplicates:
        if method == 'hybrid' and hasattr(alpha, '__iter__'):
            embedding = np.stack([e[indices] for e in embedding])
        else:
            embedding = embedding[indices]
    indices = np.add.accumulate(self.lengths())
    if method == 'hybrid' and hasattr(alpha, '__iter__'):
        self.embedding = [np.array_split(e, indices)[:-1] for e in embedding]
    else:
        self.embedding = np.array_split(embedding, indices)[:-1]
    self.projected = True
#!/usr/bin/env python
# coding: utf-8

import numpy as np
from sklearn.manifold import MDS
from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt

np.random.seed(seed=668)
data = np.random.rand(1000, 100)
D = pdist(data, metric='euclidean')
D = squareform(D)
mds = MDS(n_components=2, dissimilarity='precomputed')
fit_t = mds.fit_transform(D)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.set_title('MDS of random data')
ax.set_xlabel('MDS dimension 1')
ax.set_ylabel('MDS dimension 2')
ax.scatter(x=fit_t[:, 0], y=fit_t[:, 1], c='r')
plt.show()
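# For comparison, a non-metric variant on the same precomputed matrix:
# non-metric MDS preserves only the rank order of the dissimilarities,
# which can be more appropriate when distances are ordinal. A minimal
# sketch reusing D from above:
nmds = MDS(n_components=2, metric=False, dissimilarity='precomputed',
           random_state=668)
fit_nm = nmds.fit_transform(D)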
plt.plot(feature_vector1, feature_vector2, 'ro')
plt.show()

# Multidimensional Scaling
def findhamdist(str1, str2):
    diffs = 0
    for k in range(len(str1)):
        if str1[k] != str2[k]:
            diffs += 1
    return diffs

rowscolumn = 62
MDS_Matrix = np.zeros((62, 62))
for i in range(rowscolumn):
    for j in range(rowscolumn):
        MDS_Matrix[i][j] = findhamdist(X_Matrix[i, :], X_Matrix[j, :])
print(MDS_Matrix)
model = MDS(n_components=2, dissimilarity='precomputed', random_state=6)
out = model.fit_transform(MDS_Matrix)
print(out)
plt.title('Multidimensional Scaling')
plt.xlabel('feature_vector1')
plt.ylabel('feature_vector2')
plt.scatter(out[:, 0], out[:, 1])
plt.show()
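# The O(n^2) Python double loop above can be replaced by scipy's pairwise
# Hamming distance. pdist returns the *fraction* of differing positions,
# so multiply by the row length to recover raw counts. A sketch, assuming
# X_Matrix rows are element-wise comparable as above:
from scipy.spatial.distance import pdist, squareform

MDS_Matrix = squareform(pdist(X_Matrix, metric='hamming')) * X_Matrix.shape[1]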
def main():
    data = pd.read_csv(
        'C:/Users/Mahmudur Limon/Downloads/data/jupiter Code/preprocessed data.csv',
        engine='python')
    data = data[['Unnamed: 0', 'title', 'text']]
    data = data.rename(index=str, columns={'Unnamed: 0': 'id'})

    # Use the TF-IDF vectorizer to map the text data to vector space
    # (stop-word removal is handled by the vectorizer itself)
    tfidf_vectorizer = TfidfVectorizer(max_features=200000,
                                       use_idf=True,
                                       stop_words='english',
                                       tokenizer=tokenize_and_stem)
    # Fit the vectorizer to the text data
    tfidf_matrix = tfidf_vectorizer.fit_transform(data['text'])
    terms = tfidf_vectorizer.get_feature_names()

    # KMeans++
    km = KMeans(n_clusters=7,
                init='k-means++',
                max_iter=300,
                n_init=1,
                verbose=0,
                random_state=3425)
    km.fit(tfidf_matrix)
    labels = km.labels_
    clusters = labels.tolist()

    # Distance measure derived from cosine similarity
    distance = 1 - cosine_similarity(tfidf_matrix)

    # Dimensionality reduction using multidimensional scaling (MDS)
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(distance)
    xs, ys = pos[:, 0], pos[:, 1]

    # Save the cluster visualization after multidimensional scaling
    for x, y in zip(xs, ys):
        plt.scatter(x, y)
    plt.title('MDS output of News Headlines')
    plt.savefig(os.path.join(path, 'results', 'MDS.png'))

    # Dataframe with reduced dimensions, identified labels and text data
    # for plotting the KMeans output
    df = pd.DataFrame(dict(label=clusters, data=data['text'], x=xs, y=ys))
    df.to_csv(os.path.join(path, 'results', 'kmeans_clustered_DF.txt'), sep=',')

    label_color_map = {0: 'red', 1: 'blue', 2: 'green', 3: 'pink',
                       4: 'purple', 5: 'yellow', 6: 'orange', 7: 'grey'}

    out_file = open(os.path.join(path, 'results', 'kmeans_clustered_output.txt'), 'w')
    out_file.write('Cluster Headline\n')
    fig, ax = plt.subplots(figsize=(17, 9))
    for index, row in df.iterrows():
        cluster = row['label']
        label_color = label_color_map[row['label']]
        label_text = row['data']
        ax.plot(row['x'], row['y'], marker='o', ms=12, c=label_color)
        out_file.write(str(cluster) + ',' + label_text + '\n')
    out_file.close()
    for i in range(len(df)):
        ax.text(df.iloc[i]['x'], df.iloc[i]['y'], df.iloc[i]['label'], size=8)
    plt.title('News Headlines using KMeans Clustering')
    plt.savefig(os.path.join(path, 'results', 'kmeans.png'))