def embed(self, DM):
    """Embed a distance matrix with multidimensional scaling (MDS).

    Parameters
    ----------
    DM : distance-matrix object
        Must provide ``getMatrix()`` (the precomputed dissimilarities),
        ``D`` (whose index labels the rows), ``DS.name`` and
        ``metric_name``.

    Returns
    -------
    lds.DataSet
        Data set wrapping the embedding as a DataFrame indexed like
        ``DM.D`` and named "<data set> <metric> <embedding>".
    """
    scaler = MDS(n_components=self.num_components,
                 dissimilarity="precomputed")
    scaler.fit(DM.getMatrix())

    coords = pd.DataFrame(scaler.embedding_)
    # Carry the row labels (and the index name) over from the distances.
    coords.index = DM.D.index
    coords.index.name = DM.D.index.name

    label = DM.DS.name + " " + DM.metric_name + " " + self.embedding_name
    return lds.DataSet(coords, label)
def plot_cities():
    """Embed nine US cities in 2-D from pairwise distances and plot them.

    Prints every index pair at which the hard-coded distance matrix is
    not symmetric, then the matrix and the MDS embedding, and finally
    shows a labelled scatter plot.
    """
    cities = 'BOS CHI DC DEN LA MIA NY SEA SF'.split()
    distance_matrix = np.array([
        [0,    963,  429,  1949, 2979, 1504, 206,  2976, 3095],
        [963,  0,    671,  996,  2054, 1329, 802,  2013, 2142],
        [429,  671,  0,    1616, 2631, 1075, 233,  2684, 2799],
        [1949, 996,  1616, 0,    1059, 2037, 1771, 1307, 1235],
        [2979, 2054, 2631, 1059, 0,    2687, 2786, 1131, 379],
        [1504, 1329, 1075, 2037, 2687, 0,    1308, 3273, 3053],
        [206,  802,  233,  1771, 2786, 1308, 0,    2815, 2934],
        [2976, 2013, 2684, 1307, 1131, 3273, 2815, 0,    808],
        [3095, 2142, 2799, 1235, 379,  3053, 2934, 808,  0],
    ])

    # Symmetry check over the WHOLE matrix. The bounds used to be a
    # hard-coded range(0, 8), which silently skipped the last row/column
    # of this 9x9 matrix.
    size = len(distance_matrix)
    for i in range(size):
        for j in range(size):
            if distance_matrix[i][j] != distance_matrix[j][i]:
                print((i, j))

    print(distance_matrix)
    mds = MDS(dissimilarity='precomputed')
    mds.fit(distance_matrix)
    print(mds.embedding_)

    for idx, points in enumerate(mds.embedding_):
        plt.plot(points[0], points[1], 'r.')
        plt.text(points[0], points[1], cities[idx])
    plt.show()
def timeline_scatter_plot(X, time_index, method='MDS', metric='cosine',
                          **kwargs):
    """Scatter a 2-D embedding of ``X`` coloured by time stamp.

    Parameters
    ----------
    X : array-like
        Samples handed to ``pairwise_distances``.
    time_index : pandas.DatetimeIndex or convertible
        One time stamp per sample; equal stamps share a colour.
    method : str
        ``'MDS'`` or ``'TSNE'`` (case-insensitive).
    metric : str
        Metric name forwarded to ``pairwise_distances``.
    **kwargs
        Extra keyword arguments for the chosen estimator.

    Raises
    ------
    ValueError
        If ``method`` is neither MDS nor TSNE. Raised before any distance
        is computed.
    """
    method_key = method.upper()
    if method_key not in ('MDS', 'TSNE'):
        raise ValueError("Method %s is not supported..." % method)

    if not isinstance(time_index, pd.DatetimeIndex):
        time_index = pd.DatetimeIndex(time_index)

    dm = pairwise_distances(X, metric=metric)
    if method_key == 'MDS':
        decomposer = MDS(n_components=2, dissimilarity='precomputed',
                         verbose=1, **kwargs)
    else:
        # Current scikit-learn defaults to init='pca', which is rejected
        # together with metric='precomputed'; keep the old random default
        # unless the caller chose an init explicitly.
        kwargs.setdefault('init', 'random')
        decomposer = TSNE(n_components=2, metric='precomputed',
                          verbose=1, **kwargs)
    decomposer.fit(dm)

    xs, ys = decomposer.embedding_[:, 0], decomposer.embedding_[:, 1]

    # Index.order() was removed from pandas; sort_values() replaces it.
    unique_index = time_index.unique().sort_values()
    palette = sns.cubehelix_palette(unique_index.shape[0])
    colormap = dict(zip(unique_index, palette))
    colors = [colormap[time_stamp] for time_stamp in time_index]

    # seaborn no longer re-exports pyplot as ``sns.plt``.
    import matplotlib.pyplot as plt
    plt.scatter(xs, ys, s=40, color=colors, alpha=0.7)
    plt.axis('off')
def perform_MDS_analysis(n_samples=10e10, n_variables=10000, data_type='psi',
                         filter_tissues=True, n_dimensions=2, metric=True):
    """Run MDS on the PSI/TPM values and plot the first two dimensions.

    The data are loaded with ``read_psi_and_recover_tissue``, embedded
    into ``n_dimensions`` dimensions and plotted grouped by tissue.

    Parameters
    ----------
    n_samples, n_variables, data_type, filter_tissues
        Forwarded to ``read_psi_and_recover_tissue``.
    n_dimensions : int
        Dimensionality of the embedding (the plot uses '1D' and '2D').
    metric : bool
        Metric (True) or non-metric (False) MDS.
    """
    # n_variables used to be hard-coded to 10000 here, so the argument
    # was silently ignored.
    data, labels = read_psi_and_recover_tissue(
        n_samples=n_samples, n_variables=n_variables,
        data_type=data_type, filter_tissues=filter_tissues)
    X_train, y_train = generate_sets(data, labels, do_not_split=True)

    mds = MDS(n_components=n_dimensions, metric=metric, n_init=2,
              max_iter=1000, verbose=1, eps=0.0001, n_jobs=3,
              random_state=None, dissimilarity='euclidean')
    mds.fit(X_train.values)

    axis_names = [str(dim) + 'D' for dim in range(1, n_dimensions + 1)]
    results = pandas.DataFrame(mds.embedding_, columns=axis_names,
                               index=y_train.index)
    # idxmax(1) picks, per row, the label column holding the maximum.
    results = pandas.concat([results, y_train.idxmax(1)], axis=1)
    results = results.rename(columns={0: 'Tissue'})
    plot_by_group(results.groupby('Tissue'), '1D', '2D',
                  kind_of_summary='MDS')
def mds_and_plot(model):
    """Plot an MDS embedding of the ``dense_1`` activations of ``model``.

    Each training frame is drawn with a marker selected by the position
    of the first entry equal to 1 in its label row; that position is
    also printed.
    """
    dataset = DataSet()
    x, y, _ = dataset.get_test_frames('train')

    feature_model = Model(inputs=model.input,
                          outputs=model.get_layer('dense_1').output)
    features = feature_model.predict(x)

    scaler = MDS()
    scaler.fit(features)
    coords = scaler.embedding_

    markers = ['or', 'ob', 'og', 'oy', 'ok', '+r', 'sr', 'dr', '<r', 'pr']
    for row, label_row in enumerate(y):
        # Position of the first 1; len(label_row) when there is none.
        cls = next((k for k, flag in enumerate(label_row) if flag == 1),
                   None)
        if cls is None:
            cls = len(label_row)
        plt.plot([coords[row:row + 1, 0]], [coords[row:row + 1, 1]],
                 markers[cls], markersize=5)
        print(cls)
    plt.show()
def plot_diseases_or_countries_3d(years=[2000], axis='disease', method='mds',
                                  outname='d_clusters_by_c_pattern_mds',
                                  data_pd=fdata_pd):
    """Plot a 3-D embedding of diseases or countries, one trace per year.

    Parameters
    ----------
    years : iterable
        Years to plot (never mutated here). The original note says the
        data cover 1990-2016.
    axis : {'disease', 'country'}
        'disease' embeds the columns (the slice is transposed),
        'country' embeds the rows.
    method : {'mds', 'pca'}
        Dimensionality reduction to apply to every year slice.
    outname : str
        File name handed to ``iplot``.
    data_pd : pandas.DataFrame
        Must have a 'year' column; columns from position 2 onwards are
        used as features.

    Raises
    ------
    ValueError
        For an unknown ``axis`` or ``method`` (previously a NameError).
    """
    if axis not in ('disease', 'country'):
        raise ValueError(
            "axis must be 'disease' or 'country', got %r" % (axis,))
    if method not in ('mds', 'pca'):
        raise ValueError("method must be 'mds' or 'pca', got %r" % (method,))

    scaler = StandardScaler()
    year_slices = []
    for year in years:
        # Build the mask from the frame that was passed in; it used to
        # read the global fdata_pd and so ignored a custom data_pd.
        block = data_pd.loc[data_pd['year'].isin([year]),
                            lambda s: s.columns[2:]]
        if axis == 'disease':
            block = block.T
        year_slices.append(scaler.fit_transform(block))

    red = MDS(n_components=3) if method == 'mds' else PCA(n_components=3)

    # Each slice is embedded independently (MDS has no transform()).
    # The former joint fit on the concatenated slices was discarded by
    # these fit_transform calls and has been dropped.
    embedded = [red.fit_transform(item) for item in year_slices]

    # One trace per year. The old nested loop produced a list of
    # identical lists, which is not a valid flat trace list.
    traces = [Scatter3d(x=pts[:, 0], y=pts[:, 1], z=pts[:, 2],
                        mode='markers')
              for pts in embedded]
    data = Data(traces)
    iplot(data, filename=outname)
class Embedder(object):
    """Thin timing/logging wrapper around a t-SNE, PCA or MDS estimator.

    ``method_name`` selects the estimator ("tsne", "pca" or "mds");
    remaining arguments are forwarded to its constructor. For an unknown
    name an error is logged and ``projector`` stays ``None``, so later
    calls fail on the missing projector.
    """

    def __init__(self, method_name, *args, **kwargs):
        self.projector = None
        self.method_name = method_name
        if method_name == "tsne":
            self.projector = TSNE(*args, **kwargs)
        elif method_name == "pca":
            self.projector = PCA(*args, **kwargs)
        elif method_name == "mds":
            # Default to all cores but let the caller override it; a
            # hard-coded n_jobs raised TypeError when it was also passed.
            kwargs.setdefault("n_jobs", -1)
            self.projector = MDS(*args, **kwargs)
        else:
            logger.error("the projection method is not supported now!!")

    def fit(self, X, y=None):
        """Fit the projector on ``X`` and log the elapsed time."""
        t = time()
        self.projector.fit(X, y)
        logger.info("{} fit function time cost: {}".format(
            self.method_name, time() - t))

    def transform(self, X, y=None):
        """Return ``projector.transform(X)`` and log the elapsed time.

        ``y`` is accepted for signature compatibility only:
        scikit-learn's ``transform`` takes ``X`` alone. The result used
        to be dropped.
        """
        t = time()
        res = self.projector.transform(X)
        logger.info("{} transform function time cost: {}".format(
            self.method_name, time() - t))
        return res

    def fit_transform(self, X, y=None):
        """Fit on ``X``, return the embedding and log the elapsed time."""
        t = time()
        res = self.projector.fit_transform(X, y)
        logger.info("{} fit_transform function time cost: {}".format(
            self.method_name, time() - t))
        return res
def non_param_multi_dim_scaling(dists, n_dims=3, n_threads=None,
                                metric=True):
    """Run MDS on distances given in the form ``squareform`` accepts.

    Returns
    -------
    dict
        ``{'stress': <final stress>, 'projections': <embedding>}``.
    """
    square = squareform(dists)
    scaler = MDS(n_components=n_dims, metric=metric, n_jobs=n_threads,
                 dissimilarity='precomputed')
    scaler.fit(square)
    return {'stress': scaler.stress_, 'projections': scaler.embedding_}
def use_mds(self):
    """Embed ``self.data`` with MDS and scatter the first two dimensions.

    Prints the fitted estimator and colours the points with ``self.c``.
    """
    obj = MDS(n_components=self.n_components)
    # Fit once. The data used to be fitted three times (fit, fit inside
    # print, fit_transform), with only the last result being used.
    iris_t2 = obj.fit_transform(self.data)
    print(obj)
    plt.scatter(iris_t2[:, 0], iris_t2[:, 1], c=self.c)
    plt.title('Using sklearn MDS')
    plt.show()
def non_param_multi_dim_scaling(dists, n_dims=3, n_threads=None,
                                metric=True):
    """Run MDS on distances given in the form ``squareform`` accepts.

    Returns
    -------
    dict
        ``{'stress': <final stress>, 'projections': <embedding>}``.
    """
    square = squareform(dists)
    scaler = MDS(n_components=n_dims, metric=metric, n_jobs=n_threads,
                 dissimilarity='precomputed')
    scaler.fit(square)
    return {'stress': scaler.stress_, 'projections': scaler.embedding_}
def md_scaling(co_matrix, is_distance_matrix=False):
    """Return the default (2-D) MDS embedding of ``co_matrix``.

    When ``is_distance_matrix`` is false, ``-log(co_matrix.matrix)`` is
    used as the dissimilarity; otherwise ``co_matrix`` is used directly.
    """
    if is_distance_matrix:
        dissimilarities = co_matrix
    else:
        dissimilarities = -np.log(co_matrix.matrix)

    scaler = MDS(dissimilarity='precomputed')
    scaler.fit(dissimilarities)
    return scaler.embedding_
def mds_sklearn(A, save_to_file=None):
    """Scatter the 2-D MDS embedding of the precomputed matrix ``A``.

    The figure is saved to ``save_to_file`` when that is not None, and
    is returned in either case.
    """
    fig, ax = plt.subplots()

    scaler = MDS(2, dissimilarity="precomputed")
    scaler.fit(A)
    coords = scaler.embedding_
    ax.scatter(coords[:, 0], coords[:, 1])

    if save_to_file is not None:
        fig.savefig(save_to_file)
    return fig
def _stress_records_json(samples):
    """Return JSON records of (k, MDS stress / 100000) for k = 2, 3, 4."""
    points = []
    for k in range(2, 5):
        md = MDS(n_components=k, dissimilarity='euclidean')
        md.fit(samples)
        points.append((k, md.stress_ / 100000))
    frame = pd.DataFrame(points, columns=["xval", "yval"])
    return json.dumps(frame.to_dict(orient='records'), indent=2)


def mds():
    """Render ``index2.html`` with MDS stress curves for three data sets.

    The curves are computed for the module-level ``normalized_df``,
    ``random_sample`` and ``finalSample``, in that order.
    """
    odata = _stress_records_json(normalized_df)
    rdata = _stress_records_json(random_sample)
    sdata = _stress_records_json(finalSample)

    columns = json.dumps({"xc": "MDS Components", "yc": "Stress"})
    numparams = json.dumps({"np": 6})
    data = {
        'plot_data': odata,
        'rdata': rdata,
        'sdata': sdata,
        'columns': columns,
        'nump': numparams,
    }
    return render_template("index2.html", data=data)
def ordinate_sklearn(dist, method="mds"):
    """Ordinate a precomputed distance matrix with scikit-learn.

    Parameters
    ----------
    dist : array-like
        Square matrix of pairwise distances.
    method : {'mds', 'nmds', 'tsne'}
        Metric MDS, non-metric MDS or t-SNE.

    Returns
    -------
    tuple
        ``(embedding, varexp, fit)``: the embedding with its columns
        reordered by ``varexp`` (descending), the sorted ``varexp`` from
        ``get_varexp`` and the result of ``get_fit``.

    Raises
    ------
    ValueError
        If ``method`` is not recognised (previously UnboundLocalError).
    """
    if method == "mds":
        worker = MDS(metric=True, n_components=2,
                     dissimilarity='precomputed', n_init=10, max_iter=1000)
    elif method == "nmds":
        # metric=False is what makes this NON-metric; the estimator
        # default (metric=True) made "nmds" another metric MDS run.
        worker = MDS(metric=False, dissimilarity='precomputed',
                     random_state=1701)
    elif method == "tsne":
        # init='random' keeps precomputed distances usable: current
        # scikit-learn defaults to init='pca', which it rejects here.
        worker = TSNE(metric='precomputed', perplexity=50, init='random')
    else:
        raise ValueError(
            "Unknown ordination method %r; expected 'mds', 'nmds' or 'tsne'"
            % (method,))

    worker.fit(dist)
    embedding = worker.embedding_

    # Estimate variance explained by each axis, then order the axes by it.
    varexp = get_varexp(dist, embedding)
    index = sortedby(range(len(varexp)), varexp, reverse=True)
    embedding = embedding[:, index]
    varexp.sort(reverse=True)
    return embedding, varexp, get_fit(dist, embedding)
def main():
    """Show the PNG patches at the MDS positions of their distances.

    Reads ``distmat799.txt``, scales it by its maximum, embeds it in
    2-D and draws every ``.png`` found in ``799_patch`` at the embedding
    row that matches its position in the directory listing.
    """
    raw = np.loadtxt("distmat799.txt", delimiter=",")
    dists = raw / np.amax(raw)

    img_files = [name for name in os.listdir("799_patch")
                 if re.search(r"\.png", name)]

    fitted = MDS(n_components=2, dissimilarity="precomputed").fit(dists)

    fig, ax = plt.subplots()
    for row, name in enumerate(img_files):
        img = read_png(os.path.join("799_patch", name))
        imagebox = OffsetImage(img, zoom=2.0)
        xy = tuple(fitted.embedding_[row, :])
        ax.add_artist(AnnotationBbox(imagebox, xy))
    ax.set_xlim(-1.0, 1.0)
    ax.set_ylim(-1.0, 1.0)
    plt.show()
def mds(rdm):
    """Return a PCA-rotated 2-D metric MDS embedding of ``rdm.square``.

    The MDS run is seeded with ``RandomState(3)``; the PCA step only
    rotates the two dimensions onto their principal axes.
    """
    rng = np.random.RandomState(seed=3)
    scaler = MDS(n_components=2, max_iter=3000, eps=1e-9, random_state=rng,
                 dissimilarity="precomputed", n_jobs=1)
    scaler.fit(rdm.square)

    # Rotate the data.
    rotation = PCA(n_components=2)
    return rotation.fit_transform(scaler.embedding_)
def plot_DF(df, N, metric, annotate=True, clusters=False, sizes=False):
    """Plot an ``N``-dimensional MDS embedding of the rows of ``df``.

    With a truthy ``metric`` the distances come from
    ``pairwise_distances`` (columns are first divided by their sums
    unless the metric is "cosine"); otherwise ``df`` itself is used as
    the distance matrix. Only ``N == 1`` and ``N == 2`` draw anything.
    ``clusters`` is passed to ``get_colors`` and ``sizes`` is indexed by
    row label when they are truthy.
    """
    if not metric:
        dist = df
    else:
        if metric != "cosine":
            df = df.div(df.sum(axis=0), axis=1)
        dist = pairwise_distances(df, metric=metric)

    scaler = MDS(dissimilarity="precomputed", n_components=N)
    pos = scaler.fit(dist).embedding_

    if N == 1:
        for label, coord in zip(df.index, pos):
            plt.scatter(label, coord)
    elif N == 2:
        if clusters:
            colors = get_colors(clusters)
        for label, px, py in zip(df.index, pos[:, 0], pos[:, 1]):
            size = sizes[label] if sizes else 10
            if clusters:
                plt.scatter(px, py, c=colors[label], s=size)
            else:
                plt.scatter(px, py, s=size)
            if annotate:
                plt.annotate(label, xy=(px, py))
    plt.show()
def plot(self, x):
    """Redraw the axes with the 2-D MDS embedding of distances ``x``."""
    self.ax.clear()
    scaler = MDS(n_components=2, dissimilarity="precomputed")
    coords = scaler.fit(x).embedding_
    self.ax.scatter(coords[:, 0], coords[:, 1], color='darkcyan')
    self.draw()
def plot_mds(self, save_path=None):
    """Plot the 2-D MDS embedding of ``1 - self.W`` with class names.

    Every pair of points is joined by a thin line, the points are
    labelled with ``self.class_nms``, and the figure is written to
    ``save_path`` when one is given before being shown and closed.
    """
    mds = MDS(n_components=2, max_iter=3000, eps=1e-9,
              dissimilarity='precomputed', n_jobs=1, random_state=42)
    pos = mds.fit(1 - self.W).embedding_

    plt.figure(figsize=(8, 8))
    ax = plt.axes([0., 0., 1., 1.])
    for (pos_x, pos_y), cls in zip(pos, self.class_nms):
        plt.text(pos_x - 0.03, pos_y - 0.03, cls, fontsize=25)

    n_points = len(pos)
    segments = [[pos[i, :], pos[j, :]]
                for i in range(n_points) for j in range(n_points)]
    lc = LineCollection(segments, zorder=0, cmap=plt.cm.Blues,
                        norm=plt.Normalize(0, 0.5))
    lc.set_linewidths(np.full(len(segments), 0.5))
    ax.add_collection(lc)

    plt.scatter(pos[:, 0], pos[:, 1], color='turquoise')
    plt.axis('off')
    # Identity test: ``!= None`` dispatches to the argument's __ne__.
    if save_path is not None:
        plt.savefig(save_path)
    plt.show()
    plt.close()
def plotMap(maparr, freq, nest, seqs, dbfile, map2d, outfile, plotm='T'):
    """Embed ``maparr`` in 2-D with MDS, optionally plot it, write a CSV.

    Parameters
    ----------
    maparr : array-like
        Feature rows; pairwise Euclidean distances are embedded.
    freq, nest, seqs, dbfile, map2d
        Passed through unchanged to ``writePlotMDS``.
    outfile : str
        Output path prefix ('.png' and '.pdf' are appended).
    plotm : str
        The figure is only produced when ``str(plotm) == 'T'``.

    Returns
    -------
    numpy.ndarray
        The 2-D embedding.
    """
    # Multi-dimensional scaling. A plain 2-D ndarray replaces np.matrix,
    # which current scikit-learn refuses as input.
    similarities = euclidean_distances(np.atleast_2d(maparr))
    mds = MDS(n_components=2, max_iter=3000, eps=1e-9,
              random_state=np.random.RandomState(seed=3),
              dissimilarity="precomputed", n_jobs=1)
    pos = mds.fit(similarities).embedding_

    if str(plotm) == 'T':
        size = 8000
        color = np.arange(len(pos))
        # Silence warnings only while plotting and restore the caller's
        # filters afterwards, even on error. filterwarnings() followed
        # by resetwarnings() wiped every filter configured elsewhere.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            fig, ax = plt.subplots(figsize=(10, 10))
            ax.scatter(np.array(pos[:, 0]), np.array(pos[:, 1]), c=color,
                       s=size, alpha=0.3, cmap=plt.cm.viridis, marker='s')
            plt.xlabel('Dimension 1', fontsize=20, labelpad=20)
            plt.ylabel('Dimension 2', fontsize=20, labelpad=20)
            # Booleans, not 'off': the string is truthy in current
            # Matplotlib and would switch these ticks ON.
            plt.tick_params(labelsize=15, length=14, direction='out',
                            pad=15, top=False, right=False)
            fig.savefig(outfile + '.png', bbox_inches='tight', format='png')
            fig.savefig(outfile + '.pdf', bbox_inches='tight', format='pdf')
            plt.close(fig)

    # Write csv file.
    writePlotMDS(freq, nest, seqs, dbfile, pos, maparr, map2d, outfile)
    return pos
def project_in_2D(distance_mat, method='mds'):
    """Project SDRs onto a 2-D space with a manifold learning algorithm.

    :param distance_mat: A square matrix with pairwise distances
    :param method: 'mds' (metric MDS refined by non-metric MDS) or 'tSNE'
    :return: array of shape (numSDRs, 2) with the 2-D projection of
      each SDR
    :raises NotImplementedError: for any other ``method``
    """
    rng = np.random.RandomState(seed=3)

    if method not in ('mds', 'tSNE'):
        raise NotImplementedError

    if method == 'tSNE':
        embedder = TSNE(n_components=2, init='pca', random_state=0)
        return embedder.fit_transform(distance_mat)

    # Metric MDS supplies the starting configuration for the
    # non-metric refinement; both draw from the same RandomState.
    metric_mds = MDS(n_components=2, max_iter=3000, eps=1e-9,
                     random_state=rng, dissimilarity="precomputed", n_jobs=1)
    start = metric_mds.fit(distance_mat).embedding_

    nonmetric_mds = MDS(n_components=2, metric=False, max_iter=3000,
                        eps=1e-12, dissimilarity="precomputed",
                        random_state=rng, n_jobs=1, n_init=1)
    return nonmetric_mds.fit_transform(distance_mat, init=start)
def main():
    """Show the PNG patches at the MDS positions of their distances.

    Reads ``distmat799.txt``, scales it by its maximum, embeds it in
    2-D and draws every ``.png`` found in ``799_patch`` at the embedding
    row that matches its position in the directory listing.
    """
    raw = np.loadtxt('distmat799.txt', delimiter=',')
    dists = raw / np.amax(raw)

    img_files = [name for name in os.listdir('799_patch')
                 if re.search(r'\.png', name)]

    fitted = MDS(n_components=2, dissimilarity='precomputed').fit(dists)

    fig, ax = plt.subplots()
    for row, name in enumerate(img_files):
        img = read_png(os.path.join('799_patch', name))
        imagebox = OffsetImage(img, zoom=2.0)
        xy = tuple(fitted.embedding_[row, :])
        ax.add_artist(AnnotationBbox(imagebox, xy))
    ax.set_xlim(-1.0, 1.0)
    ax.set_ylim(-1.0, 1.0)
    plt.show()
def plot_clusters(scaled_features, cluster_obj):
    """Plot a 2-D MDS embedding with each cluster pushed outwards.

    Every point is coloured by its label from ``cluster_obj.labels_``
    and shifted by an offset of radius 10 whose direction is unique to
    its cluster, so clusters separate visually around a circle.
    """
    labels = cluster_obj.labels_
    norm = Normalize(min(labels), max(labels))
    cm = mpl.cm.jet

    pos = MDS(n_components=2).fit(scaled_features).embedding_

    offset_radius = 10
    # One evenly spaced direction per DISTINCT label. The angles used to
    # be spread over len(labels), i.e. the number of samples, so the few
    # real clusters all received nearly the same offset.
    cluster_ids = sorted(set(labels))
    cluster_thetas = np.linspace(0, 2 * np.pi, len(cluster_ids),
                                 endpoint=False)
    cluster_vectors = {
        cid: (offset_radius * np.cos(theta), offset_radius * np.sin(theta))
        for cid, theta in zip(cluster_ids, cluster_thetas)
    }

    for coords, label in zip(pos, labels):
        color = cm(norm(label))
        offset = cluster_vectors[label]
        mpl.plot(coords[0] + offset[0], coords[1] + offset[1],
                 color=color, marker='o')
    mpl.show()
def labtest_MDS(PID):
    """Return the 2-D MDS embedding of the scaled lab tests of ``PID``.

    The 'tests' entry of every patient id in ``PID`` is read from the
    module-level ``patients`` mapping and scaled with ``pp.scale``.
    """
    tests = [patients[pid]['tests'] for pid in PID]
    scaled = pp.scale(tests)

    scaler = MDS(n_components=2, metric=True, n_init=4, max_iter=300,
                 verbose=0, eps=0.001, n_jobs=1, dissimilarity='euclidean')
    scaler.fit(scaled)
    return scaler.embedding_
def w2c_mds_dec(data, dim=2):
    """Return the ``dim``-dimensional Euclidean MDS embedding of ``data``.

    The random state comes from the module-level ``seed``.
    """
    scaler = MDS(n_components=dim, max_iter=3000, eps=1e-9,
                 random_state=seed, dissimilarity='euclidean', n_jobs=1)
    scaler.fit(data)
    return scaler.embedding_
def cluster(D, k=3, verbose=False):
    """Cluster LDS's via Multi-Dimensional Scaling and KMeans.

    Strategy:
        1. Symmetrize the NxN matrix of pairwise distances
        2. Run MDS to embed the data in R^2
        3. Run KMeans with k cluster centers
        4. Find the samples closest to the k centers

    Parameters:
    -----------
    D: numpy.ndarray, shape = (N, N)
        Precomputed distance matrix.
    k: int (default: 3)
        Number of desired cluster centers.
    verbose: boolean
        Enable verbose MDS output.

    Returns:
    --------
    eData: numpy.ndarray, shape (N, 2)
        The N samples embedded in R^2.
    ids: numpy.ndarray, shape = (k,)
        Indices identifying the k representatives.

    Raises:
    -------
    AssertionError
        If D is not square or KMeans leaves a cluster empty.
    """
    assert D.shape[0] == D.shape[1], "OOps (distance matrix not square)!"

    def _symmetrize(A):
        # NOTE(review): this mirrors a triangular matrix; a matrix that
        # is already symmetric gets its off-diagonal entries doubled.
        return A + A.T - np.diag(A.diagonal())

    # Honour the verbose flag; it used to be hard-coded to True.
    mds = MDS(metric=True, n_components=2, verbose=verbose,
              dissimilarity="precomputed")
    eData = mds.fit(_symmetrize(D)).embedding_

    kmObj = KMeans(k)
    kmObj.fit_predict(eData)

    # Builtin int: the np.int alias no longer exists in current NumPy.
    ids = np.zeros((k,), dtype=int)
    for i in range(k):
        # Sanity check.
        assert np.any(kmObj.labels_ == i), "Oops, empty cluster ..."
        # euclidean_distances needs 2-D input, so make the centre a row.
        kCen = kmObj.cluster_centers_[i, :].reshape(1, -1)
        x = euclidean_distances(eData, kCen)
        ids[i] = int(np.argmin(x))

    # Return the embedding and the IDs of the representative LDS's.
    return (eData, ids)
def get_training_set_2d_coordinates(distance_matrix, labels,
                                    random_state=None):
    """Embed a precomputed distance matrix in 2-D with MDS.

    Returns
    -------
    tuple
        ``(df, stress)``: a DataFrame with columns 'x', 'y' and 'label',
        and the estimator's final ``stress_`` value.
    """
    scaler = MDS(n_components=2, random_state=random_state,
                 dissimilarity='precomputed')
    scaler.fit(distance_matrix)

    coords = scaler.embedding_
    frame = pd.DataFrame({'x': coords[:, 0],
                          'y': coords[:, 1],
                          'label': labels})
    return frame, scaler.stress_
def mds(df, value='Data Value', n_dimension=2):
    """Embed occupations with MDS, plot them and print top correlations.

    ``df`` is merged with the module-level ``jobzones_23`` on
    'O*NET-SOC Code'; rows with Scale ID 'IM' are pivoted into one
    standardized feature column per 'Element Name'. The MDS embedding
    is reduced to two PCA components (PC1, PC2) and scattered. Finally
    the six elements correlating most with PC1 and with PC2 are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings with the columns used above.
    value : str
        Column aggregated by the first pivot table.
    n_dimension : int
        Number of MDS components.
    """
    tmp = pd.merge(df, jobzones_23, on=['O*NET-SOC Code'])
    # Examine the importance scale only.
    importance = tmp[tmp['Scale ID'] == 'IM']
    temp = importance.pivot_table(index=['O*NET-SOC Code', 'Job Zone'],
                                  columns='Element Name',
                                  values=value).reset_index()
    features = [str(col) for col in temp.columns.tolist()
                if col not in ['Title', 'O*NET-SOC Code', 'Job Zone']]
    x = temp.loc[:, features].values
    x = StandardScaler().fit_transform(x)

    # NOTE(review): this is the Gram matrix x.x^T (a similarity), yet it
    # is handed to MDS as a precomputed dissimilarity. Confirm whether
    # real pairwise distances were intended before relying on the plot.
    t = np.dot(x, np.transpose(x))
    mds = MDS(n_components=n_dimension, max_iter=3000, eps=1e-9,
              random_state=12345, dissimilarity="precomputed", n_jobs=1)
    pos = mds.fit(t).embedding_

    # Select the top 2 dimensions of data.
    clf = PCA(n_components=2)
    pos = clf.fit_transform(pos)
    finalDf = pd.concat(
        [pd.DataFrame(pos), temp[['O*NET-SOC Code', 'Job Zone']]], axis=1)
    finalDf.rename(columns={0: 'PC1', 1: 'PC2'}, inplace=True)

    # Plot the graph.
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111)
    ax.scatter(finalDf['PC1'], finalDf['PC2'], color='blue', s=50)
    finalDf = finalDf.reindex(finalDf.PC1.abs().sort_values().index)
    plt.ylim(-8, 8)
    plt.xlim(-30, 30)
    plt.title('Polarization measured by Distance between Jobs')

    temp2 = importance.pivot_table(index=['O*NET-SOC Code'],
                                   columns='Element Name',
                                   values='Data Value').reset_index()
    merged = pd.merge(temp2, finalDf, on=['O*NET-SOC Code'])
    # Correlate numeric columns only: current pandas raises on the
    # string-valued code column instead of silently dropping it.
    corr = merged.select_dtypes(include=[np.number]).corr()

    # print() calls: the Python 2 print statements were a SyntaxError
    # under Python 3.
    print(corr[['PC1']].sort_values('PC1', ascending=False).head(6))
    print(corr[['PC2']].sort_values('PC2', ascending=False).head(6))
def cluster(D, k=3, verbose=False):
    """Cluster LDS's via Multi-Dimensional Scaling and KMeans.

    Strategy:
        1. Symmetrize the NxN matrix of pairwise distances
        2. Run MDS to embed the data in R^2
        3. Run KMeans with k cluster centers
        4. Find the samples closest to the k centers

    Parameters:
    -----------
    D: numpy.ndarray, shape = (N, N)
        Precomputed distance matrix.
    k: int (default: 3)
        Number of desired cluster centers.
    verbose: boolean
        Enable verbose MDS output.

    Returns:
    --------
    eData: numpy.ndarray, shape (N, 2)
        The N samples embedded in R^2.
    ids: numpy.ndarray, shape = (k,)
        Indices identifying the k representatives.

    Raises:
    -------
    AssertionError
        If D is not square or KMeans leaves a cluster empty.
    """
    assert D.shape[0] == D.shape[1], "OOps (distance matrix not square)!"

    def _symmetrize(A):
        # NOTE(review): this mirrors a triangular matrix; a matrix that
        # is already symmetric gets its off-diagonal entries doubled.
        return A + A.T - np.diag(A.diagonal())

    # Honour the verbose flag; it used to be hard-coded to True.
    mds = MDS(metric=True, n_components=2, verbose=verbose,
              dissimilarity="precomputed")
    eData = mds.fit(_symmetrize(D)).embedding_

    kmObj = KMeans(k)
    kmObj.fit_predict(eData)

    # Builtin int: the np.int alias no longer exists in current NumPy.
    ids = np.zeros((k,), dtype=int)
    for i in range(k):
        # Sanity check.
        assert np.any(kmObj.labels_ == i), "Oops, empty cluster ..."
        # euclidean_distances needs 2-D input, so make the centre a row.
        kCen = kmObj.cluster_centers_[i, :].reshape(1, -1)
        x = euclidean_distances(eData, kCen)
        ids[i] = int(np.argmin(x))

    # Return the embedding and the IDs of the representative LDS's.
    return (eData, ids)
def nmds_function(matrix, dimensions):
    """Run non-metric MDS and return its rounded stress and embedding.

    NOTE(review): ``matrix`` is never used. The fit runs on the first
    100 rows of the module-level ``jaccard_dm``, with iteration settings
    taken from the globals ``max_iter_val`` and ``n_init_val``. Confirm
    whether ``matrix`` was meant to be the input.

    Returns
    -------
    dict
        ``{"stress": <stress rounded to 2 places>,
        "nmds_results": <embedding>}``.
    """
    scaler = MDS(n_components=dimensions, metric=False,
                 dissimilarity='precomputed', max_iter=int(max_iter_val),
                 n_init=int(n_init_val))
    scaler.fit(jaccard_dm[:100])
    return {"stress": round(scaler.stress_, 2),
            "nmds_results": scaler.embedding_}
def timeline_scatter_plot(X, time_index, method='MDS', metric='cosine',
                          **kwargs):
    """Scatter a 2-D embedding of ``X`` coloured by time stamp.

    Parameters
    ----------
    X : array-like
        Samples handed to ``pairwise_distances``.
    time_index : pandas.DatetimeIndex or convertible
        One time stamp per sample; equal stamps share a colour.
    method : str
        ``'MDS'`` or ``'TSNE'`` (case-insensitive).
    metric : str
        Metric name forwarded to ``pairwise_distances``.
    **kwargs
        Extra keyword arguments for the chosen estimator.

    Raises
    ------
    ValueError
        If ``method`` is neither MDS nor TSNE. Raised before any distance
        is computed.
    """
    method_key = method.upper()
    if method_key not in ('MDS', 'TSNE'):
        raise ValueError("Method %s is not supported..." % method)

    if not isinstance(time_index, pd.DatetimeIndex):
        time_index = pd.DatetimeIndex(time_index)

    dm = pairwise_distances(X, metric=metric)
    if method_key == 'MDS':
        decomposer = MDS(n_components=2, dissimilarity='precomputed',
                         verbose=1, **kwargs)
    else:
        # Current scikit-learn defaults to init='pca', which is rejected
        # together with metric='precomputed'; keep the old random default
        # unless the caller chose an init explicitly.
        kwargs.setdefault('init', 'random')
        decomposer = TSNE(n_components=2, metric='precomputed',
                          verbose=1, **kwargs)
    decomposer.fit(dm)

    xs, ys = decomposer.embedding_[:, 0], decomposer.embedding_[:, 1]

    # Index.order() was removed from pandas; sort_values() replaces it.
    unique_index = time_index.unique().sort_values()
    palette = sns.cubehelix_palette(unique_index.shape[0])
    colormap = dict(zip(unique_index, palette))
    colors = [colormap[time_stamp] for time_stamp in time_index]

    # seaborn no longer re-exports pyplot as ``sns.plt``.
    import matplotlib.pyplot as plt
    plt.scatter(xs, ys, s=40, color=colors, alpha=0.7)
    plt.axis('off')
def get_mds(similarities):
    """Return the 2-D non-metric MDS embedding of ``similarities``.

    The maximum and minimum of the input are printed first. The matrix
    is treated as precomputed dissimilarities and the run is seeded
    with ``RandomState(3)``.
    """
    rng = np.random.RandomState(seed=3)
    print(np.amax(similarities))
    print(np.amin(similarities))

    scaler = MDS(n_components=2, metric=False, max_iter=3000, eps=1e-12,
                 dissimilarity="precomputed", random_state=rng, n_jobs=1,
                 n_init=1)
    scaler.fit(similarities)
    return np.array(scaler.embedding_)
def embedDistanceMatrix(dmatDf, method='kpca', n_components=2, **kwargs):
    """Embed the pairwise distances in ``dmatDf`` into ``n_components``
    dimensions.

    Parameters
    ----------
    dmatDf : pandas.DataFrame or numpy.ndarray
        Square distance matrix. A DataFrame's index is carried over to
        the result; an array gets a default integer index.
    method : str
        One of 'tsne', 'isomap', 'mds', 'pca', 'kpca', 'lle',
        'sklearn-tsne' or 'umap'.
    n_components : int
        Number of output dimensions.
    **kwargs
        'perplexity' is required for 'tsne' and 'sklearn-tsne'; all
        keyword arguments are forwarded to UMAP for 'umap'.

    Returns
    -------
    pandas.DataFrame or None
        N x n_components coordinates, or None (after printing a
        message) for an unknown method. For 'kpca' the frame also gets
        an ``explained_variance_`` attribute.
    """
    if isinstance(dmatDf, pd.DataFrame):
        dmat = dmatDf.values
        index = dmatDf.index
    else:
        # Plain arrays have no .index; fall back to the default index.
        dmat = dmatDf
        index = None

    if method == 'tsne':
        xy = tsne.run_tsne(dmat, no_dims=n_components,
                           perplexity=kwargs['perplexity'])
    elif method == 'isomap':
        isoObj = Isomap(n_neighbors=10, n_components=n_components)
        xy = isoObj.fit_transform(dmat)
    elif method == 'mds':
        mds = MDS(n_components=n_components, max_iter=3000, eps=1e-9,
                  random_state=15, dissimilarity="precomputed", n_jobs=1)
        xy = mds.fit(dmat).embedding_
        # Rotate onto principal axes.
        rot = PCA(n_components=n_components)
        xy = rot.fit_transform(xy)
    elif method == 'pca':
        pcaObj = PCA(n_components=None)
        xy = pcaObj.fit_transform(dmat)[:, :n_components]
    elif method == 'kpca':
        pcaObj = KernelPCA(n_components=dmat.shape[0], kernel='precomputed',
                           eigen_solver='dense')
        try:
            gram = dist2kernel(dmat)
        except Exception:
            # Best-effort fallback; no longer swallows KeyboardInterrupt.
            print('Could not convert dmat to kernel for KernelPCA; using 1 - dmat/dmat.max() instead')
            gram = 1 - dmat / dmat.max()
        xy = pcaObj.fit_transform(gram)[:, :n_components]
    elif method == 'lle':
        lle = manifold.LocallyLinearEmbedding(n_neighbors=30,
                                              n_components=n_components,
                                              method='standard')
        # Was fit_transform(dist), an undefined name (NameError).
        xy = lle.fit_transform(dmat)
    elif method == 'sklearn-tsne':
        # init='random': current scikit-learn defaults to init='pca',
        # which it rejects together with metric='precomputed'.
        tsneObj = TSNE(n_components=n_components, metric='precomputed',
                       random_state=0, perplexity=kwargs['perplexity'],
                       init='random')
        xy = tsneObj.fit_transform(dmat)
    elif method == 'umap':
        umapObj = umap.UMAP(n_components=n_components, metric='precomputed',
                            **kwargs)
        xy = umapObj.fit_transform(dmat)
    else:
        print('Method unknown: %s' % method)
        return

    assert xy.shape[0] == dmat.shape[0]
    xyDf = pd.DataFrame(xy[:, :n_components], index=index,
                        columns=np.arange(n_components))
    if method == 'kpca':
        # Not sure how negative eigenvalues should be handled here, but
        # they are usually small so it shouldn't make a big difference.
        # ``eigenvalues_`` is the current attribute name; ``lambdas_`` is
        # the removed older one.
        eigvals = getattr(pcaObj, 'eigenvalues_', None)
        if eigvals is None:
            eigvals = pcaObj.lambdas_
        setattr(xyDf, 'explained_variance_',
                eigvals[:n_components] / eigvals[eigvals > 0].sum())
    return xyDf
def create_mds(dissim_mat, embed_dimensions, metric=True,
               init_from_isomap=True):
    """Fit (non-)metric MDS on a precomputed dissimilarity matrix.

    With ``init_from_isomap`` a single run is started from an Isomap
    embedding, or from small random values when ``create_isomap``
    raises ValueError. Otherwise a warning is issued and several
    randomly initialised runs are made. The iteration cap is 100 when
    the DEBUG setting is on and 10000 otherwise.

    Returns
    -------
    The fitted MDS estimator.
    """
    max_iter = 100 if get_setting("DEBUG") else 10000
    prefix = 'non-' if not metric else ''

    def _build(n_init):
        # Both code paths share every option except the number of inits.
        return MDS(n_components=embed_dimensions,
                   dissimilarity="precomputed",
                   metric=metric,
                   n_jobs=get_ncpu(ignore_debug=True),
                   verbose=1 if get_setting("VERBOSE") else 0,
                   n_init=n_init,
                   max_iter=max_iter)

    if init_from_isomap:
        print(f"Running {prefix}metric MDS with {get_ncpu(ignore_debug=True)} jobs for max {max_iter} iterations, initialized from Isomap-Embeddings")
        embedding = _build(1)
        try:
            isomap_init = create_isomap(dissim_mat, embed_dimensions,
                                        neighbor_factor=25).embedding_
        except ValueError:
            # There are significant negative eigenvalues...
            isomap_init = np.random.random(
                (len(dissim_mat), embed_dimensions)) * 0.01
        return embedding.fit(dissim_mat, init=isomap_init)

    warnings.warn("sklearn's MDS is broken!! Have to init from something, don't f*****g ask why!")
    # The larger of 2*ncpu and 10 (3 in DEBUG), rounded up to a multiple
    # of ncpu.
    wanted = max(get_ncpu() * 2, 3 if get_setting("DEBUG") else 10)
    n_inits = math.ceil(wanted / get_ncpu()) * get_ncpu()
    print(f"Running {prefix}metric MDS {n_inits} times with {get_ncpu(ignore_debug=True)} jobs for max {max_iter} iterations.")
    # TODO (kept from the original): with metric=True it always breaks
    # after the second step if n_components >> 2 (with metric=False too).
    embedding = _build(n_inits)
    return embedding.fit(dissim_mat)
def mds_util(k, metric):
    """Run ``k``-dimensional MDS on the module-level ``cars_od`` matrix.

    Returns
    -------
    dict
        ``{'stress': <final stress>, 'embedding': <coordinates>}``.
    """
    scaler = MDS(n_components=k, metric=metric, max_iter=1000, eps=1e-9,
                 dissimilarity="precomputed", n_jobs=1, random_state=3)
    scaler.fit(cars_od)
    return {'stress': scaler.stress_, 'embedding': scaler.embedding_}
class MDS_Reducer(Reducer):
    '''The multidimensional scaling (MDS) reduction method'''

    def __init__(self, dimensionality=2500, seed=None):
        """Create the MDS estimator for precomputed dissimilarities.

        ``seed`` seeds the RandomState handed to the estimator.
        """
        self.mds = MDS(n_components=dimensionality,
                       n_jobs=-1,
                       random_state=np.random.RandomState(seed=seed),
                       dissimilarity="precomputed")

    def reduced(self, A):
        """Fit on ``A`` and return the transposed embedding."""
        self.mds.fit(A)
        return np.transpose(self.mds.embedding_)
def make_mds_image(m, filename, labels=None, colour=None):
    """Given a matrix of distances, project into 2D space using
    multi-dimensional scaling and produce an image.

    The embedding is cached in ``filename + ".dat"`` and reused when
    that file can be read. The scatter plot is saved as ``.pdf``,
    ``.eps`` and ``.png`` next to it. When ``labels`` is given, the
    labels and coordinates of a few hard-coded points are printed.
    ``colour`` is currently unused.

    Returns None, also when MDS fails with a ValueError (a message is
    printed and nothing is saved).
    """
    mds_data_filename = filename + ".dat"

    # If we've previously computed the embedding, load it. Only a
    # missing or unparsable cache triggers a recompute; the old bare
    # ``except:`` also swallowed KeyboardInterrupt and real bugs.
    try:
        p = np.genfromtxt(mds_data_filename)
    except (OSError, ValueError):
        p = None

    if p is None:
        # Construct MDS object with various defaults including 2d.
        mds = MDS(dissimilarity="precomputed")
        try:
            f = mds.fit(m)
        except ValueError as e:
            print("Can't run MDS for " + filename + ": " + str(e))
            return
        # Get the embedding in 2d space and cache it.
        p = f.embedding_
        np.savetxt(mds_data_filename, p)

    # Make an image.
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.set_aspect('equal')
    ax.scatter(p[:, 0], p[:, 1], edgecolors='none')

    # ``is not None``: comparing an array of labels with ``!= None`` is
    # element-wise and cannot be used as a condition.
    if labels is not None:
        print(filename)
        # Hard-coded for GP depth-2. The locations are printed so labels
        # can be placed manually.
        indices = [0, 2, 50, 52]
        for i in indices:
            print(labels[i], p[i, 0], p[i, 1])

    fig.savefig(filename + ".pdf")
    fig.savefig(filename + ".eps")
    fig.savefig(filename + ".png")
    plt.close(fig)
def make_mds_image(m, filename, labels=None, colour=None):
    """Given a matrix of distances, project into 2D space using
    multi-dimensional scaling and produce an image.

    The embedding is cached in ``filename + ".dat"`` and reused when
    that file can be read. The scatter plot is saved as ``.pdf``,
    ``.eps`` and ``.png`` next to it. When ``labels`` is given, the
    labels and coordinates of a few hard-coded points are printed.
    ``colour`` is currently unused.

    Returns None, also when MDS fails with a ValueError (a message is
    printed and nothing is saved).
    """
    mds_data_filename = filename + ".dat"

    # If we've previously computed the embedding, load it. Only a
    # missing or unparsable cache triggers a recompute; the old bare
    # ``except:`` also swallowed KeyboardInterrupt and real bugs.
    try:
        p = np.genfromtxt(mds_data_filename)
    except (OSError, ValueError):
        p = None

    if p is None:
        # Construct MDS object with various defaults including 2d.
        mds = MDS(dissimilarity="precomputed")
        try:
            f = mds.fit(m)
        except ValueError as e:
            print("Can't run MDS for " + filename + ": " + str(e))
            return
        # Get the embedding in 2d space and cache it.
        p = f.embedding_
        np.savetxt(mds_data_filename, p)

    # Make an image.
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.set_aspect('equal')
    ax.scatter(p[:, 0], p[:, 1], edgecolors='none')

    # ``is not None``: comparing an array of labels with ``!= None`` is
    # element-wise and cannot be used as a condition.
    if labels is not None:
        print(filename)
        # Hard-coded for GP depth-2. The locations are printed so labels
        # can be placed manually.
        indices = [0, 2, 50, 52]
        for i in indices:
            print(labels[i], p[i, 0], p[i, 1])

    fig.savefig(filename + ".pdf")
    fig.savefig(filename + ".eps")
    fig.savefig(filename + ".png")
    plt.close(fig)
def plot_mds(points, genres, n_points=500):
    '''
    Plots a set of documents in MDS space

    Args:
        points: dense array with coordinates of each document
        genres: list of genres for each entry in points (entries equal
            to None are dropped)
        n_points: passed to train_test_split as train_size, i.e. the
            size of the stratified sample that is embedded
    Returns:
        None
    '''
    genres = np.array(genres)
    has_genre = np.not_equal(genres, None)
    X, y = points[has_genre], genres[has_genre]

    # Only the stratified training part is plotted.
    X_train, _, y_train, _ = train_test_split(
        X, y, stratify=y, train_size=n_points)

    distances = cosine_distances(X_train, X_train)
    scaler = MDS(n_components=2, dissimilarity='precomputed')
    scaler.fit(distances)
    plot_embedding(scaler.embedding_, y_train)
def calc_MDS_corr():
    """Return JSON records of a 2-D MDS layout of the columns of ``data``.

    The dissimilarity between two columns of the module-level ``data``
    frame is ``1 - |correlation|``. Each record holds 'x', 'y', the
    column name ('labels') and a hard-coded short name.
    """
    scaler = MDS(n_components=2, max_iter=3000, eps=1e-9,
                 dissimilarity="precomputed", n_jobs=1)
    dissimilarities = np.array(1 - abs(data.corr()))
    coords = scaler.fit(dissimilarities).embedding_

    frame = pd.DataFrame.from_records(coords, columns=['x', 'y'])
    frame["labels"] = list(data.columns)
    # One short name per column, in column order.
    frame['short_names'] = ['DT', 'CRS_DT', 'AT', 'CRS_AT', 'F_No',
                            'ActET', 'CRS_ET', 'ArrD', 'DepD', 'Dis']
    return json.dumps(frame.to_dict(orient="records"))
def reduction(simMat, N=2):
    """Embed a similarity matrix into ``N`` dimensions with metric MDS.

    Similarities are turned into dissimilarities as ``1 - s``. Both the
    estimator's stress and the value from the ``stress`` helper are
    printed.

    Returns
    -------
    numpy.ndarray
        The embedding with ``N`` columns.
    """
    # Change the similarity matrix into a dissimilarity matrix. Built
    # eagerly as a list of lists: the nested map() is lazy on Python 3,
    # so np.array(dis) no longer produced a numeric matrix.
    dis = [[1 - y for y in row] for row in simMat]

    # Configure MDS to run 10 times on a dissimilarity matrix.
    mds = MDS(n_components=N, n_init=10, max_iter=3000, metric=True,
              dissimilarity="precomputed")
    mat = np.array(dis)

    # Run MDS.
    fit = mds.fit(mat)
    # print() calls replace the Python 2 print statements.
    print("Approximate Stress:", fit.stress_)
    print("Stress:", stress(dis, fit.embedding_))
    return fit.embedding_
def mult_scl(X, labels):
    """Show Isomap and MDS embeddings of ``X`` next to a location plot.

    The labels are printed with 1-based numbers. NaNs in ``X`` are
    replaced through ``np.nan_to_num`` before each fit.
    """
    print('labels:')
    for number, label in enumerate(labels, start=1):
        print('{}: {}'.format(number, label))

    iso_points = Isomap().fit(np.nan_to_num(X)).embedding_

    f, (ax1, ax2, ax3) = plt.subplots(1, 3)
    plot_location(labels, ax3)

    ax1.scatter(iso_points[:, 0], iso_points[:, 1], s=20, c='r')
    ax1.set_title('Isomap')
    add_labels(labels, iso_points, ax1)

    mds_points = MDS().fit(np.nan_to_num(X)).embedding_
    ax2.scatter(mds_points[:, 0], mds_points[:, 1], s=20, c='g')
    ax2.set_title('MDS')
    add_labels(labels, mds_points, ax2)

    plt.show()
def plot_clusters(scaled_features, cluster_obj):
    """Plot a 2-D MDS embedding with each cluster pushed outwards.

    Every point is coloured by its label from ``cluster_obj.labels_``
    and shifted by an offset of radius 10 whose direction is unique to
    its cluster, so clusters separate visually around a circle.
    """
    labels = cluster_obj.labels_
    norm = Normalize(min(labels), max(labels))
    cm = mpl.cm.jet

    pos = MDS(n_components=2).fit(scaled_features).embedding_

    offset_radius = 10
    # One evenly spaced direction per DISTINCT label. The angles used to
    # be spread over len(labels), i.e. the number of samples, so the few
    # real clusters all received nearly the same offset.
    cluster_ids = sorted(set(labels))
    cluster_thetas = np.linspace(0, 2 * np.pi, len(cluster_ids),
                                 endpoint=False)
    cluster_vectors = {
        cid: (offset_radius * np.cos(theta), offset_radius * np.sin(theta))
        for cid, theta in zip(cluster_ids, cluster_thetas)
    }

    for coords, label in zip(pos, labels):
        color = cm(norm(label))
        offset = cluster_vectors[label]
        mpl.plot(coords[0] + offset[0], coords[1] + offset[1],
                 color=color, marker='o')
    mpl.show()
def clustered_mds(cds, clusters=None, filename=None):
    """Average samples per scene, build a group matrix and show its MDS plot.

    Parameters
    ----------
    cds : dataset
        Must provide ``shape``, ``samples`` (indexed as
        ``[subject, voxel, time]``) and ``a.event_bounds``.
    clusters : sequence of int, optional
        Scene boundaries along the last axis of ``cds.samples``. Defaults to
        ``cds.a.event_bounds``. (Previously this argument was silently
        overwritten by the default.)
    filename : optional
        Currently unused; kept for interface compatibility.

    Returns
    -------
    ndarray
        The subject-averaged square matrix that was passed to MDS.
    """
    if clusters is None:
        clusters = cds.a.event_bounds

    num_subj, num_voxels = cds.shape[0], cds.shape[1]
    num_clusters = len(clusters)
    num_scenes = num_clusters - 1

    # Average the samples inside each [clusters[i], clusters[i+1]) window.
    ds_list = np.zeros((num_subj, num_voxels, num_scenes))
    for i in range(num_scenes):
        ds_list[:, :, i] = np.mean(
            cds.samples[:, :, clusters[i]:clusters[i + 1]], axis=2)

    # NOTE(review): 1 - correlation distance is a correlation *similarity*,
    # yet it is handed to MDS as a precomputed dissimilarity -- confirm intent.
    dsm_array = [squareform(1 - pdist(subj.T, metric='correlation'))
                 for subj in ds_list]
    dsm = np.mean(dsm_array, axis=0)

    mds = MDS(n_components=2, max_iter=3000, eps=1e-9,
              dissimilarity="precomputed", n_jobs=1)
    coords = mds.fit(dsm).embedding_

    plt.clf()
    X, Y = coords[:, 0], coords[:, 1]
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111)
    plt.scatter(X, Y, marker='x')
    # Scenes are numbered from 1 in the plot.
    for i, label in enumerate(np.arange(1, num_clusters)):
        ax.annotate(label, (X[i], Y[i]))
    plt.axis([np.min(X) * 1.2, np.max(X) * 1.2,
              np.min(Y) * 1.2, np.max(Y) * 1.2])
    plt.title("MDS Scene Visualization")
    plt.show()
    return dsm
def multidimensional_scaling(rdm, labels):
    """Show a 2-D MDS embedding of the dissimilarity matrix ``rdm``.

    Points are annotated with ``labels``; every ordered pair of points is
    joined by a line segment coloured by the matching ``rdm`` entry.
    """
    embedder = MDS(n_components=2, max_iter=3000, dissimilarity='precomputed')
    positions = embedder.fit(rdm).embedding_
    # In-place rescale so the largest coordinate becomes 1.
    positions /= positions.max()

    plt.figure(1)
    ax = plt.axes([0., 0., 1., 1.])
    plt.scatter(positions[:, 0], positions[:, 1])

    # One segment per (i, j) pair, row-major, matching rdm.flatten() below.
    n_points = len(positions)
    segments = []
    for i in range(n_points):
        for j in range(n_points):
            segments.append([positions[i, :], positions[j, :]])

    colour_norm = plt.Normalize(0, np.abs(rdm).max())
    lc = LineCollection(segments, zorder=0, cmap=plt.cm.YlGnBu,
                        norm=colour_norm)
    lc.set_array(rdm.flatten())
    lc.set_linewidths(2 * np.ones(len(segments)))
    ax.add_collection(lc)

    for index, label in enumerate(labels):
        plt.annotate(label, (positions[index, 0], positions[index, 1]))
    plt.show()
def calculate_and_cluster():
    """Tag the data files, then plot a 2-D MDS embedding of the matrix.

    Steps:
      1. read ``complete_set.csv`` and map the last space-separated token of
         each row to ``get_tag_set(token)``;
      2. for each key in sorted order, rename ``<n>.txt`` to
         ``<n>_Tagged.txt`` / ``<n>_Untagged.txt`` (``Tagged`` when the tag
         set has exactly one element);
      3. load the precomputed matrix, symmetrize it, embed it with MDS and
         show a scatter plot coloured by ``get_colour_tag``.

    Returns None; the results are the file renames and the displayed plot.
    """
    # NOTE(review): binary mode is the Python 2 csv idiom; Python 3 needs
    # text mode (``newline=''``) here.
    with open('../../data/sets/complete_set.csv', 'rb') as csvfile:
        data_list = [''.join(row)
                     for row in csv.reader(csvfile, delimiter=',')]

    # Key each entry by the last space-separated token of its row.
    data_tag_map = {}
    for line in data_list:
        key = line.split(" ")[-1]
        data_tag_map[key] = get_tag_set(key)

    od = OrderedDict(sorted(data_tag_map.items()))

    names = []
    for counter, value in enumerate(od.values()):
        tag = 'Tagged' if len(value) == 1 else 'Untagged'
        names.append(str(counter) + "_" + tag)
        # Rename the file for later use in colour co-ordination.
        rename_file(str(counter) + '.txt', str(counter) + '_' + tag + '.txt')

    X = genfromtxt('../semantic_similarity_algorithms/semantic_similarity_matrix/matrix.csv', delimiter=',')
    X = symmetrize(X)
    # Symmetry check; this form prints the same on Python 2 and 3.
    print((X.transpose() == X).all())

    # n_components=2: points are plotted in a plane.
    # dissimilarity="precomputed": X already is a distance matrix.
    # random_state is fixed so the plot is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    # A single fit suffices; a separate fit() before fit_transform() only
    # repeats the same work.
    pos = mds.fit_transform(X.astype(np.float64))  # (n_samples, n_components)
    xs, ys = pos[:, 0], pos[:, 1]

    # Wide figure so the legend below the axes has room.
    plt.figure(figsize=(15, 8))
    plt.subplot(211)

    # Every point gets its tag as label, so the legend can be built from it.
    for x, y, name in zip(xs, ys, names):
        tag = name.split('_', 1)[1]
        plt.scatter(x, y, s=100, c=get_colour_tag(tag), label=tag)

    # Collapse duplicate labels: one legend entry per distinct tag.
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = OrderedDict(zip(labels, handles))
    plt.legend(list(by_label.values()), list(by_label.keys()),
               loc='lower center', ncol=4, bbox_to_anchor=(0.5, -0.6))
    plt.show()
# For each face, compare every pair of fiducials: embed both distance
# matrices with MDS and run a Procrustes analysis on the two embeddings.
for face in [0, 1, 2]:
    # Entries of csvs are passed to read_csv below, indexed by fiducial.
    csvs = stats_dict[stat]["Fiducials"][face]
    for (i, j) in combinations(fiducials, 2):
        dist1 = read_csv(csvs[i], index_col=0).values
        dist2 = read_csv(csvs[j], index_col=0).values
        # Symmetrize by adding the transpose.
        # NOTE(review): presumably the stored matrices are triangular with a
        # zero diagonal; otherwise this doubles the entries -- confirm.
        dist1 += dist1.T
        dist2 += dist2.T
        # Run MDS and map to lower dim. Try 2 for visualizing.
        # Both estimators share the same seed and settings.
        mds_1 = MDS(
            n_components=2, max_iter=3000, eps=1e-9, random_state=seed,
            dissimilarity="precomputed", n_jobs=1
        )
        pos_1 = mds_1.fit(dist1).embedding_
        mds_2 = MDS(
            n_components=2, max_iter=3000, eps=1e-9, random_state=seed,
            dissimilarity="precomputed", n_jobs=1
        )
        pos_2 = mds_2.fit(dist2).embedding_
        output = procrustes_analysis(pos_1, pos_2, nperm=10)
        # output fills the leading axis of proc_results for this (face, i, j).
        proc_results[:, face, i, j] = output
    # Leading index 0 is stored as "value", index 1 as "pvals".
    proc_fiducials[stat][face] = {"value": proc_results[0, face, :, :],
                                  "pvals": proc_results[1, face, :, :]}
    # Now compare fiducials to designs.
    csv_designs = stats_dict[stat]["Designs"][face]
# Tail of the Polish -> English course-name mapping.
'Laboratorium fizyki 2': 'Physics Laboratory 2',
'Analiza matematyczna 1': 'Mathematical Analysis 1',
'Mechanika': 'Mechanics'
}
cl = 'L'
# Correlation between courses (rowvar=0: columns are the variables).
co_corr = np.corrcoef(win.getData(class_=cl), rowvar=0)
labels = [pl_en[x] for x in win.getCoursesNames()]
mds = MDS(n_components=2, dissimilarity='precomputed')
dists = np.empty((len(co_corr), len(co_corr)))
# Turn each correlation r into the distance sqrt(2 * (1 - r)).
# NOTE(review): math.sqrt raises ValueError if rounding pushes r above 1.
for ii in range(len(labels)):
    for jj in range(len(labels)):
        dists[ii][jj] = math.sqrt(2 * (1 - co_corr[ii][jj]))
pos = mds.fit(dists).embedding_
# Complete graph over the courses, edges weighted by distance.
G = nx.Graph()
G.add_nodes_from(range(len(labels)))
# textstr collects one "<index> - <label>" line per course.
textstr = ""
for ii, l in enumerate(labels):
    textstr += str(ii) + " - " + l + "\n"
    for jj in range(ii + 1, len(labels)):
        d = dists[ii][jj]
        G.add_edge(ii, jj, weight=d)
si = []
# NOTE(review): adjacency_iter() looks like the networkx 1.x API -- confirm
# the installed version still provides it.
for n, nbrs in G.adjacency_iter():
    # w sums the inverse edge weights of node n.
    w = 0
    for nbr, eattr in nbrs.items():
        w += 1 / eattr['weight']
def getMDS(self, featureMatrix, dist=None):
    """Return a 2-D MDS embedding for a feature matrix.

    ``dist`` is a precomputed dissimilarity matrix; when it is None,
    ``1 - cosine_similarity(featureMatrix)`` is used instead. The random
    state is fixed (6).
    """
    distances = dist if dist is not None else 1 - cosine_similarity(featureMatrix)
    embedder = MDS(n_components=2, dissimilarity="precomputed", random_state=6)
    embedder.fit(distances)
    return embedder.embedding_
# CLUSTERING # Create KMeans kmeans = KMeans(n_clusters=NUM_CLUSTERS, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances=True, verbose=0, random_state=None, copy_x=True, n_jobs=1) # Train KMeans kmeans.fit(data) # Get the results kmeans_labels = kmeans.labels_ kmeans_cluster_centers = kmeans.cluster_centers_ kmeans_labels_unique = np.unique(kmeans_labels) #################################### # PLOT PREPARATION # Reduce to two dimensions for plotting mds = MDS(n_components=2) mds.fit(data) scaled_coordinates = mds.embedding_ # PLOT ON TWO DIMENSIONS labelled_data_x = (dict(), dict()) labelled_data_y = (dict(), dict()) for label in kmeans_labels_unique: labelled_data_x[0][label] = [] labelled_data_y[0][label] = [] labelled_data_x[1][label] = [] labelled_data_y[1][label] = [] for i in range(0, len(names)): label = kmeans_labels[i] labelled_data_x[survived[i]][label].append(scaled_coordinates[i][0]) labelled_data_y[survived[i]][label].append(scaled_coordinates[i][1])
########################
# Fit MDS for embedding sizes 2..29 and record the normalised stress of each.
dimensions = np.arange(2, 30, 1)
# Float buffer on purpose: zeros_like(dimensions) would inherit the integer
# dtype and truncate every stored stress value.
stress_vector = np.zeros(dimensions.shape, dtype=float)
for i, dim in enumerate(dimensions):
    # Define classifier
    n_comp = dim
    max_iter = 1000
    eps = 1e-9
    mds = MDS(n_components=n_comp, max_iter=max_iter, eps=eps, n_jobs=2,
              dissimilarity='precomputed')
    x = mds.fit(distances)
    # Stress divided by the number of rows of the distance matrix.
    stress = x.stress_ / distances.shape[0]
    # %-formatting prints the same text under Python 2 and Python 3.
    print('Dimension %s' % (dim,))
    print('The stress is %s' % (stress,))
    stress_vector[i] = stress
########################
# Plot Here
########################
# Plot parameters
fontsize = 20
figsize = (16, 12)
axes_position = [0.1, 0.1, 0.8, 0.8]
title = 'Stress vs size of embedding space'
xlabel = 'Dimension'
def __plot_samples__(self, dfs, fold):
    """
    Plot MDS and t-SNE embeddings of the training and test samples of one
    fold side by side and save the figure under ``../figs/``.

    :type dfs: List[pandas DataFrame] # [training df, testing df]
    :type fold: int
    :rtype: None
    """
    mds = MDS(n_components=2, max_iter=3000, eps=1e-9,
              dissimilarity='euclidean', n_jobs=-1)
    tsne = TSNE(n_components=2)

    # change label to color index
    # author 1 train (0 = light blue), author 1 test (1 = dark blue)
    # author 2 train (2 = light green), author 2 test (3 = dark green)
    train = dfs[0].copy()
    train.loc[(train.label == 1).values, 'label'] = 0
    train.loc[(train.label == -1).values, 'label'] = 2
    test = dfs[1].copy()
    test.loc[(test.label == 1).values, 'label'] = 1
    test.loc[(test.label == -1).values, 'label'] = 3
    # pd.concat instead of DataFrame.append, which pandas 2.0 removed.
    df_all = pd.concat([train, test])

    legend = {0: 'Author 1 Training Sample',
              1: 'Author 1 Test Sample',
              2: 'Author 2 Training Sample',
              3: 'Author 2 Test Sample'}

    # Both embeddings are fitted on training AND test samples together.
    features = df_all.drop('label', axis=1)
    pos_lst = [('Multi-Dimensional Scaling (MDS)',
                mds.fit(features).embedding_),
               ('t-Distributed Stochastic Neighbor Embedding (TSNE)',
                tsne.fit(features).embedding_)]

    # plot
    colors = sns.color_palette('Paired', 4)
    fig = plt.figure(figsize=(16, 7))
    # plt.hold was removed in matplotlib 3.0; call it only where it exists so
    # older installations keep their previous behaviour.
    hold = getattr(plt, 'hold', None)
    if hold is not None:
        hold(True)
    for k, (title, pos) in enumerate(pos_lst, 1):
        ## fig.add_subplot() works in ipython notebook but creates a
        ## mysterious 3rd axes in python...
        ax = plt.subplot(1, 2, k)
        ax.set_title(title)
        for i, color in enumerate(colors):
            samples = pos[(df_all.label == i).values, :]
            ax.scatter(samples[:, 0], samples[:, 1], c=color,
                       edgecolor='none', label=legend[i])
        ax.legend()
    if hold is not None:
        hold(False)
    # The file name is the stats-table name after its first underscore.
    plt.savefig('../figs/' +
                self.__PG_STATS_TBL__[self.__PG_STATS_TBL__.find("_") + 1:] +
                'fold' + str(fold) + '.png', dpi=300, transparent=True)
    plt.close(fig)
def view_2d_embedding(self, reference=None):
    """Save a 2-D MDS embedding of the structures as an SVG file.

    Without ``reference`` the stored pairwise matrix ``self._rmsd`` is
    embedded and points are coloured by DBSCAN cluster
    (``embedding_<name>.svg``). With ``reference`` a combined distance matrix
    over ``self._cgs``, ``reference`` and ``self._reference_cg`` is computed
    with ``ftms.cg_rmsd`` and embedded (``embedding1_<name>.svg``).
    """
    # http://baoilleach.blogspot.co.at/2014/01/convert-distance-matrix-to-2d.html
    if reference is not None:
        n_cgs = len(self._cgs)
        n_ref = len(reference)
        total = n_cgs + n_ref + 1

        def pick(idx):
            # Index layout: own structures, then references, then the single
            # reference structure in the last slot.
            if idx < n_cgs:
                return self._cgs[idx]
            if idx < n_cgs + n_ref:
                return reference[idx - n_cgs]
            assert idx == n_cgs + n_ref
            return self._reference_cg

        # Create a huge (symmetric) distance matrix.
        alldists = np.zeros((total, total))
        for i, j in it.combinations(range(len(alldists)), 2):
            alldists[i, j] = alldists[j, i] = ftms.cg_rmsd(pick(i), pick(j))

        # Then calculate the 2D coordinates for our embedding.
        embedder = MDS(n_components=2, dissimilarity="precomputed",
                       random_state=6)
        coords = embedder.fit(alldists).embedding_

        # References in green, own structures as a blue path, the single
        # reference structure in red.
        plt.plot(coords[n_cgs:n_cgs + n_ref, 0],
                 coords[n_cgs:n_cgs + n_ref, 1], 's', color="green")
        plt.plot(coords[:n_cgs, 0], coords[:n_cgs, 1], '-o', color="blue")
        plt.plot([coords[-1, 0]], [coords[-1, 1]], 's', color="red")
        plt.savefig("embedding1_{}.svg".format(self._cgs[0].name))
        plt.clf()
        plt.close()
        return

    # First cluster all structures based on pairwise RMSD.
    db = self._cluster_dbscan()
    labels = db.labels_
    is_core = np.zeros_like(db.labels_, dtype=bool)
    is_core[db.core_sample_indices_] = True
    unique_labels = set(labels)

    # Then calculate the 2D coordinates for our embedding.
    embedder = MDS(n_components=2, dissimilarity="precomputed",
                   random_state=6)
    coords = embedder.fit(self._rmsd).embedding_
    xs, ys = coords[:, 0], coords[:, 1]

    # Now plot: a path through all points, then per-cluster markers.
    plt.plot(xs, ys, '-', color="blue")
    colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = 'k'
        in_cluster = (labels == k)
        core = in_cluster & is_core
        border = in_cluster & ~is_core
        plt.plot(xs[core], ys[core], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=6)
        plt.plot(xs[border], ys[border], 'o', markerfacecolor=col,
                 markeredgecolor=col, markersize=1)
    plt.savefig("embedding_{}.svg".format(self._cgs[0].name))
    plt.clf()
    plt.close()
def calculate_and_cluster():
    """Tag the data files, plot an MDS embedding and a Ward dendrogram.

    Populates the module globals ``names``, ``data_list``, ``data_tag_map``,
    ``matrix_list`` and ``data_tagged_list``, renames ``<n>.txt`` to
    ``<n>_Tagged.txt`` / ``<n>_Untagged.txt``, calls
    ``tag_nearest_neighbour`` for every index that ``get_tagged`` reports as
    untagged, and shows two figures. Returns None.
    """
    global names
    global data_list
    global data_tag_map
    global matrix_list
    global data_tagged_list
    data_list = {}
    data_tag_map = {}
    data_tagged_list = {}
    matrix_list = []

    # Parse the CSV file; each row is joined into one string.
    # NOTE(review): binary mode is the Python 2 csv idiom; Python 3 needs
    # text mode (``newline=''``) here.
    with open('../../../data/sets/complete_set.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for counter, row in enumerate(reader):
            data_list[counter] = ''.join(row)

    # Key each entry by the last space-separated token of its row.
    for index in range(len(data_list)):
        key = data_list[index].split(" ")[-1]
        data_tag_map[key] = get_tag_set(key)

    od = OrderedDict(sorted(data_tag_map.items()))

    names = []
    for counter, value in enumerate(od.values()):
        is_tagged = len(value) == 1
        tag = 'Tagged' if is_tagged else 'Untagged'
        names.append(str(counter) + "_" + tag)
        data_tagged_list[str(counter)] = is_tagged
        # Rename the file for later use in colour co-ordination.
        rename_file(str(counter) + '.txt', str(counter) + '_' + tag + '.txt')

    # Generate matrix from file and symmetrize it to ensure it is valid.
    X = genfromtxt('matrix.csv', delimiter=',')
    X = symmetrize(X)

    # Put matrix in a list for checking.
    matrix_list = X.tolist()
    for x in range(len(matrix_list)):
        if not get_tagged(str(x)):
            tag_nearest_neighbour(x)

    # Check symmetry (prints the same text on Python 2 and 3).
    print("Symmetric? " + str((X.transpose() == X).all()))

    # n_components=2: points are plotted in a plane.
    # dissimilarity="precomputed": X already is a distance matrix.
    # random_state is fixed so the plot is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    # A single fit suffices; a separate fit() before fit_transform() only
    # repeats the same work.
    pos = mds.fit_transform(X.astype(np.float64))  # (n_samples, n_components)
    xs, ys = pos[:, 0], pos[:, 1]

    # Wide figure so the legend below the axes has room.
    plt.figure(figsize=(15, 8))
    plt.subplot(211)

    # Every point gets its tag as label, so the legend can be built from it.
    for x, y, name in zip(xs, ys, names):
        tag = name.split('_', 1)[1]
        plt.scatter(x, y, s=100, c=get_colour_tag(tag), label=tag)

    # Collapse duplicate labels: one legend entry per distinct tag.
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = OrderedDict(zip(labels, handles))
    plt.legend(list(by_label.values()), list(by_label.keys()),
               loc='lower center', ncol=4, bbox_to_anchor=(0.5, -0.6))
    plt.show()

    # Create a dendrogram.
    # NOTE(review): ward() is given the square matrix itself; confirm whether
    # a condensed distance matrix was intended.
    linkage_matrix = ward(X)
    # match dendrogram to that returned by R's hclust()
    dendrogram(linkage_matrix, orientation="right")
    plt.tight_layout()
    plt.show()
def vis_MDS(self, dmatrix, ven_names, fout=None):
    """
    Draw a 2-D MDS scatter plot of the top 30 venues, grouped by category.

    :param dmatrix: distance matrix
    :type dmatrix: numpy.ndarray
    :param ven_names: names of venues in matrix
    :type ven_names: list
    :param fout: save graph to file; the plot is shown when this is None
    :type fout: str
    :return: None
    :rtype: None
    """
    # NOTE(review): the extent of the style context is assumed to be the
    # whole function body -- confirm against the original layout.
    with plt.style.context('ggplot'):
        # setup plot figure
        fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEEE'))
        ax.grid(color='white', linestyle='solid', linewidth=2)
        fig = plt.gcf()
        fig.set_dpi(100)
        fig.set_size_inches((8.0, 8.0), forward=True)
        plt.subplots_adjust(left=0.10, bottom=0.10, right=0.95, top=0.95)
        plt.gca().grid(True)
        plt.axis([-1.0, 1.0, -1.0, 1.0])
        plt.title('MDS: top 30 Venues')

        # get MDS coordinates for venues
        embedder = MDS(2, verbose=0, n_jobs=-1, dissimilarity='precomputed')
        embedder.fit(dmatrix)
        points = embedder.embedding_
        # third column holds the venue category code
        points = np.c_[points, np.zeros(len(points))]

        # Manually curated groups: (legend label, category code, row indices).
        # TODO: use Foursquare's categories json to get higher-level
        # categories for venues
        groups = [
            ('Airports', 9, [0, 1, 5, 6, 11, 12, 28]),
            ('Conventions', 2, [15, 18, 19, 29]),
            ('Theme Parks', 4, [2, 8, 17]),
            ('Stadiums', 5, [3, 4, 9, 10, 14, 24]),
            ('Other', 0, [7, 13, 16, 20, 21, 22, 23, 25, 26, 27]),
        ]
        for _, code, rows in groups:
            points[rows, 2] = code

        colors = pd.tools.plotting._get_standard_colors(5, color_type='random')
        handles = []
        for color, (_, _, rows) in zip(colors, groups):
            subset = points[rows]
            handles.append(plt.scatter(subset[:, 0], subset[:, 1], marker='o',
                                       color=color, s=70, edgecolor='black',
                                       linewidth=0.5))

        # make legend
        legend = plt.legend(tuple(handles),
                            tuple(title for title, _, _ in groups),
                            scatterpoints=1, loc='lower left', ncol=2,
                            fontsize=8)
        frame = legend.get_frame()
        frame.set_facecolor('#cccccc')
        frame.set_edgecolor('#909090')

        # make labels as annotations
        for label, x, y in zip(ven_names, points[:, 0], points[:, 1]):
            plt.annotate(label, xy=(x, y), xytext=(0, 5),
                         textcoords='offset points', ha='center',
                         va='bottom', size='xx-small')

        # adjust tick labels
        for which in ('major', 'minor'):
            plt.tick_params(axis='both', which=which, labelsize=6,
                            color='gray')

        # turn off ticks
        ax = plt.gca()
        for axis in (ax.xaxis, ax.yaxis):
            for t in axis.get_major_ticks():
                t.tick1On = False
                t.tick2On = False

        if fout is not None:
            fig.savefig(fout)
        else:
            plt.show()
def vis_super_MDS(self, dmatrix, fout=None):
    """
    Draw a 2-D MDS scatter plot of many venues (1200), coloured by category.

    :param dmatrix: distance matrix
    :type dmatrix: numpy.ndarray
    :param fout: save graph to file; the plot is shown when this is None
    :type fout: str
    :return: None
    :rtype: None
    """
    # NOTE(review): the extent of the style context is assumed to be the
    # whole function body -- confirm against the original layout.
    with plt.style.context('ggplot'):
        # setup plot figure
        fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEEE'))
        ax.grid(color='white', linestyle='solid', linewidth=2)
        fig = plt.gcf()
        fig.set_dpi(100)
        fig.set_size_inches((8.0, 8.0), forward=True)
        plt.subplots_adjust(left=0.10, bottom=0.10, right=0.95, top=0.95)
        plt.gca().grid(True)
        plt.axis([-1.0, 1.0, -1.0, 1.0])
        plt.title('MDS: top 1200 Venues')

        # get MDS coordinates for venues
        embedder = MDS(2, verbose=0, n_jobs=-1, dissimilarity='precomputed')
        embedder.fit(dmatrix)
        points = embedder.embedding_

        # third column holds the venue category number
        # TODO: higher-level categories from Foursquare's categories json
        points = np.c_[points, np.zeros(len(points))]
        categories = sq.get_categories(9)
        cat2num = {cat: ind for ind, cat in enumerate(categories)}
        for i in range(len(points)):
            points[i, 2] = cat2num[self.vens[i].cat_name]

        # plot the points
        plt.scatter(points[:, 0], points[:, 1], marker='o',
                    facecolor=points[:, 2], s=30, cmap=plt.get_cmap('jet'),
                    edgecolor='black', linewidth=0.5, alpha=0.6)

        # TODO: make labels as mouse-overs or something like that

        # adjust tick labels
        for which in ('major', 'minor'):
            plt.tick_params(axis='both', which=which, labelsize=6,
                            color='gray')

        # turn off ticks
        ax = plt.gca()
        for axis in (ax.xaxis, ax.yaxis):
            for t in axis.get_major_ticks():
                t.tick1On = False
                t.tick2On = False

        if fout is not None:
            fig.savefig(fout)
        else:
            plt.show()
def get_twodim_reps(reps, seed, distance=euclidean_distances):
    """Return a 2-D MDS embedding of ``reps``.

    ``reps`` is cast to float64, ``distance`` turns it into a pairwise
    matrix, and that matrix is embedded as a precomputed dissimilarity with
    ``seed`` as the random state.
    """
    pairwise = distance(reps.astype(np.float64))
    embedder = MDS(n_components=2, dissimilarity="precomputed",
                   random_state=seed)
    embedder.fit(pairwise)
    return embedder.embedding_