def test_umap_transform_embedding_stability():
    """Check that ``transform`` leaves the fitted ``embedding_`` untouched.

    Issue #217 describes how using transform to embed new data with a
    trained UMAP transformer causes the fitted embedding matrix to change
    when the new data has the same number of rows as the original
    training data.
    """
    data = iris.data[iris_selection]
    fitter = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit(data)
    original_embedding = fitter.embedding_.copy()

    # The important point is that the new data has the same number of rows
    # as the original fit data.
    new_data = np.random.random(data.shape)
    # Only the side effect on ``fitter.embedding_`` matters here, so the
    # transformed output is deliberately discarded.
    fitter.transform(new_data)

    assert_array_equal(
        original_embedding,
        fitter.embedding_,
        "Transforming new data changed the original embeddings",
    )

    # Example from issue #217: fit on 5 of 10 columns, then transform a
    # different 5-column matrix with the same number of rows.
    a = np.random.random((1000, 10))
    b = np.random.random((1000, 5))

    model = UMAP()
    u1 = model.fit_transform(a[:, :5])
    u1_orig = u1.copy()
    assert_array_equal(
        u1_orig,
        model.embedding_,
        "fit_transform output differs from the stored embedding",
    )

    model.transform(b)
    assert_array_equal(
        u1_orig,
        model.embedding_,
        "Transforming new data changed the original embeddings",
    )
def metrics(model, data_iterator):
    """Embed, project, cluster and segment every item of ``data_iterator``.

    For each ``(data, labels)`` pair the audio is loaded and embedded with
    the module-level ``encoder``, projected with UMAP, clustered with
    HDBSCAN and passed to ``segment_ads``. Two diagnostic figures (the
    projection scatter and the cluster-label trace) are saved into the
    directory returned by ``segment_ads``.

    Args:
        model: Not used by this function; kept for interface compatibility.
        data_iterator: Iterable yielding ``(data, labels)`` pairs. ``data``
            is passed to ``MP3_META`` and ``load_audio`` and is split on
            ``'/'`` to build output file names, so it is presumably a file
            path -- TODO confirm against the caller.

    Returns:
        None.

    NOTE(review): ``ads_pred``, ``ads_actual``, ``total_duration`` and
    ``pred_ads_duration`` are filled in but never returned or used. When
    the UMAP projection fails the item is skipped after
    ``total_duration`` was already appended, so that list can be longer
    than the others.
    """
    umap_proj = UMAP(metric='euclidean', n_neighbors=200, low_memory=True)
    hdb_clusterer = hdbscan.HDBSCAN(
        min_samples=100,
        min_cluster_size=100,
    )

    ads_pred = []
    ads_actual = []
    total_duration = []
    pred_ads_duration = []

    for data, labels in tqdm(data_iterator):
        aud_len = MP3_META(data).info.length
        total_duration.append(aud_len)

        aud_data = load_audio(data)
        embeds, (aud_splits, _) = encoder.embed(aud_data, group=False)
        print(data, "Embed done")

        # Best effort: an item whose projection fails is reported and skipped.
        try:
            projs = umap_proj.fit_transform(embeds)
            print(data, "Created Projections")
        except Exception as e:
            print(e)
            continue

        clusters = hdb_clusterer.fit_predict(projs)
        print(data, "Created Clusters")

        ad_dir, ads = segment_ads(aud_data, aud_splits, data, clusters)
        pred_ads_duration.append(len(ads) * 10)
        ads_pred.append(len(ads))
        ads_actual.append(labels)
        print(data, "Done segmenting ads")

        file_name = data.split('/')[-1]

        # Colour the points by cluster label; without ``c`` matplotlib
        # ignores ``cmap`` and draws every point in a single colour.
        plt.scatter(projs[:, 0], projs[:, 1], c=clusters, cmap='Spectral')
        plt.title(str(Counter(clusters)))
        plt.savefig('{}/{}_umap.jpg'.format(ad_dir, file_name))
        plt.close()

        plt.plot(clusters)
        plt.savefig('{}/{}_hdb_labels.jpg'.format(ad_dir, file_name))
        plt.close()
# Feature matrix and target values of the loaded dataset
X = dataset.data
y = dataset.target

# Build the shape graph with KeplerMapper
mapper = KeplerMapper(verbose=1)
lens = mapper.fit_transform(X, projection=[0])
graph = mapper.map(lens, X, nr_cubes=6, overlap_perc=0.2)

# Wrap the graph in a DyNeuGraph
dG = DyNeuGraph(G=graph, y=y)

# Register the lens and the networkx layout functions as custom layouts
dG.add_custom_layout(lens, name='lens')
for layout_name, layout_func in (
    ('nx.spring', nx.spring_layout),
    ('nx.kamada_kawai', nx.kamada_kawai_layout),
    ('nx.spectral', nx.spectral_layout),
    ('nx.circular', nx.circular_layout),
):
    dG.add_custom_layout(layout_func, name=layout_name)

# Set up the 2-component projections; UMAP is initialised from the PCA output
pca = PCA(2, random_state=1)
tsne = TSNE(2, init='pca', random_state=1)
umap = UMAP(n_components=2, init=pca.fit_transform(X))

# Register each projection of X as a custom layout
dG.add_custom_layout(pca.fit_transform(X), name='PCA')
dG.add_custom_layout(tsne.fit_transform(X), name='TSNE')
dG.add_custom_layout(umap.fit_transform(X, y=None), name='UMAP')

# Render the result
dG.visualize(static=True, show=True)
# Keep only the rows that belong to sessions 4-5
mask_sessions = df.chunks.add(1).isin([4, 5])
X = X[mask_sessions]
y = y.loc[mask_sessions, :]
target = target[mask_sessions]

# KeplerMapper instance used to build the shape graph
mapper = KeplerMapper(verbose=1)

# Projection setup: UMAP is initialised from the 2-component PCA output
pca = PCA(2, random_state=1)
umap = UMAP(n_components=2, init=pca.fit_transform(X))

# The lens is the supervised UMAP embedding (fitted with y=target)
lens = mapper.fit_transform(
    umap.fit_transform(X, y=target),
    projection=[0, 1],
)

# Generate the shape graph from the lens
graph = mapper.map(
    lens,
    X=X,
    cover=Cover(20, 0.5),
    clusterer=optimize_dbscan(X, k=3, p=100.0),
)

# Wrap the graph in a DyNeuGraph
dG = DyNeuGraph(G=graph, y=y)

# Register the lens and the networkx layout functions as custom layouts
dG.add_custom_layout(lens, name='lens')
for layout_name, layout_func in (
    ('nx.spring', nx.spring_layout),
    ('nx.kamada_kawai', nx.kamada_kawai_layout),
    ('nx.spectral', nx.spectral_layout),
):
    dG.add_custom_layout(layout_func, name=layout_name)
class ClusteringWidget(QSplitter):
    """Splitter widget that embeds, clusters and displays the spectra of a map.

    The left side holds the cluster image, the embedding scatter plot, the
    raw spectrum plot and the cluster-mean plot. The right side holds the
    parameter tree, the "Compute clusters" / "Save clusters" buttons and the
    list of maps.

    The embedding is computed with UMAP or PCA, depending on the
    ``'Embedding'`` parameter, and is then clustered with KMeans.
    """

    def __init__(self, headermodel, selectionmodel):
        """Build the child widgets and connect their signals.

        Args:
            headermodel: Item model whose items carry the map headers; it is
                also set as the model of the map list view.
            selectionmodel: Selection model shared with the map list view;
                its ``selectionChanged`` signal drives ``updateMap`` and
                ``updateRoiMask``.
        """
        super().__init__()
        self.headermodel = headermodel
        self.selectionmodel = selectionmodel
        # init some values
        self.selectMapidx = 0
        self.embedding = None
        self.labels = None
        self.mean_spectra = None

        # split between cluster image and scatter plot
        self.image_and_scatter = QSplitter()
        # split between image&scatter and spec plot, vertical split
        self.leftsplitter = QSplitter()
        self.leftsplitter.setOrientation(Qt.Vertical)
        # split between params, buttons and map list, vertical split
        self.rightsplitter = QSplitter()
        self.rightsplitter.setOrientation(Qt.Vertical)

        self.clusterImage = MapViewWidget()
        self.clusterScatterPlot = ScatterPlotWidget()
        self.rawSpecPlot = SpectraPlotWidget()
        self.clusterMeanPlot = ClusterSpectraWidget()

        # ParameterTree
        self.parametertree = ClusteringParameters()
        self.parameter = self.parametertree.parameter

        # buttons layout
        self.buttons = QWidget()
        self.buttonlayout = QGridLayout()
        self.buttons.setLayout(self.buttonlayout)
        # set up buttons
        self.fontSize = 12
        font = QFont("Helvetica [Cronyx]", self.fontSize)
        self.computeBtn = QPushButton()
        self.computeBtn.setText('Compute clusters')
        self.computeBtn.setFont(font)
        self.saveBtn = QPushButton()
        self.saveBtn.setText('Save clusters')
        self.saveBtn.setFont(font)
        # add all buttons
        self.buttonlayout.addWidget(self.computeBtn)
        self.buttonlayout.addWidget(self.saveBtn)

        # Headers listview
        self.headerlistview = QListView()
        self.headerlistview.setModel(headermodel)
        # This might do weird things in the map view?
        self.headerlistview.setSelectionModel(selectionmodel)
        self.headerlistview.setSelectionMode(QListView.SingleSelection)
        # add title to list view
        self.mapListWidget = QWidget()
        self.listLayout = QVBoxLayout()
        self.mapListWidget.setLayout(self.listLayout)
        mapListTitle = QLabel('Maps list')
        mapListTitle.setFont(font)
        self.listLayout.addWidget(mapListTitle)
        self.listLayout.addWidget(self.headerlistview)

        # assemble widgets
        self.image_and_scatter.addWidget(self.clusterImage)
        self.image_and_scatter.addWidget(self.clusterScatterPlot)
        self.leftsplitter.addWidget(self.image_and_scatter)
        self.leftsplitter.addWidget(self.rawSpecPlot)
        self.leftsplitter.addWidget(self.clusterMeanPlot)
        self.leftsplitter.setSizes([200, 50, 50])
        self.rightsplitter.addWidget(self.parametertree)
        self.rightsplitter.addWidget(self.buttons)
        self.rightsplitter.addWidget(self.mapListWidget)
        self.rightsplitter.setSizes([300, 50, 50])
        self.addWidget(self.leftsplitter)
        self.addWidget(self.rightsplitter)
        self.setSizes([500, 100])

        # setup ROI item
        sideLen = 10
        self.roi = PolyLineROI(
            positions=[[0, 0], [sideLen, 0], [sideLen, sideLen], [0, sideLen]],
            closed=True)
        self.roi.hide()
        self.roiInitState = self.roi.getState()

        # set up mask item
        self.maskItem = ImageItem(np.ones((1, 1)),
                                  axisOrder="row-major",
                                  autoLevels=True,
                                  opacity=0.3)
        self.maskItem.hide()
        # set up select mask item
        self.selectMaskItem = ImageItem(np.ones((1, 1)),
                                        axisOrder="row-major",
                                        autoLevels=True,
                                        opacity=0.3,
                                        lut=np.array([[0, 0, 0], [255, 0, 0]]))
        self.selectMaskItem.hide()
        self.clusterImage.view.addItem(self.roi)
        self.clusterImage.view.addItem(self.maskItem)
        self.clusterImage.view.addItem(self.selectMaskItem)

        # Connect signals
        self.computeBtn.clicked.connect(self.computeEmbedding)
        self.saveBtn.clicked.connect(self.saveCluster)
        self.clusterImage.sigShowSpectra.connect(self.rawSpecPlot.showSpectra)
        self.clusterImage.sigShowSpectra.connect(
            self.clusterScatterPlot.clickFromImage)
        self.clusterScatterPlot.sigScatterRawInd.connect(
            self.rawSpecPlot.showSpectra)
        self.clusterScatterPlot.sigScatterClicked.connect(self.showClusterMean)
        self.clusterScatterPlot.sigScatterRawInd.connect(self.setImageCross)
        self.parametertree.sigParamChanged.connect(self.updateClusterParams)
        self.selectionmodel.selectionChanged.connect(self.updateMap)
        self.selectionmodel.selectionChanged.connect(self.updateRoiMask)

    def computeEmbedding(self):
        """Compute the UMAP or PCA embedding of the current map, then cluster.

        Does nothing if no map is selected. Aborts with an error message
        when the ``'Wavenumber Range'`` parameter yields no usable value or
        an odd number of values. On success the embedding is stored on
        ``self.embedding`` and on the current model item, and
        ``computeCluster`` is called.
        """
        # get current map idx
        if not self.isMapOpen():
            return
        msg.showMessage('Compute embedding.')

        # Select wavenumber region
        wavROIList = []
        for entry in self.parameter['Wavenumber Range'].split(','):
            try:
                wavROIList.append(val2ind(int(entry), self.wavenumbers))
            except Exception:
                # Entries that cannot be parsed or looked up are skipped.
                continue

        if not wavROIList:
            # Without any valid entry the dataset would have zero features,
            # which the embedding estimators cannot handle.
            msg.logMessage('"Wavenumber Range" contains no valid values',
                           msg.ERROR)
            MsgBox('Clustering computation aborted.', 'error')
            return
        if len(wavROIList) % 2 != 0:
            msg.logMessage('"Wavenumber Range" values must be in pairs',
                           msg.ERROR)
            MsgBox('Clustering computation aborted.', 'error')
            return

        # Consecutive sorted values form inclusive (start, stop) index pairs.
        wavROIList = sorted(wavROIList)
        wavROIidx = []
        for start, stop in zip(wavROIList[::2], wavROIList[1::2]):
            wavROIidx.extend(range(start, stop + 1))

        self.wavenumbers_select = self.wavenumbers[wavROIidx]
        self.N_w = len(self.wavenumbers_select)

        # get current dataset
        if self.selectedPixels is None:
            n_spectra = len(self.data)
            self.dataset = np.zeros((n_spectra, self.N_w))
            for i in range(n_spectra):
                self.dataset[i, :] = self.data[i][wavROIidx]
        else:
            n_spectra = len(self.selectedPixels)
            self.dataset = np.zeros((n_spectra, self.N_w))
            for i in range(n_spectra):  # i: ith selected pixel
                row_col = tuple(self.selectedPixels[i])
                self.dataset[i, :] = self.data[self.rc2ind[row_col]][wavROIidx]

        # get parameters and compute embedding
        n_components = self.parameter['Components']
        if self.parameter['Embedding'] == 'UMAP':
            n_neighbors = self.parameter['Neighbors']
            metric = self.parameter['Metric']
            min_dist = np.clip(self.parameter['Min Dist'], 0, 1)
            self.umap = UMAP(n_neighbors=n_neighbors,
                             min_dist=min_dist,
                             n_components=n_components,
                             metric=metric,
                             random_state=0)
            self.embedding = self.umap.fit_transform(self.dataset)
        elif self.parameter['Embedding'] == 'PCA':
            # normalize and mean center
            if self.parameter['Normalization'] == 'L1':
                data_norm = Normalizer(norm='l1').fit_transform(self.dataset)
            elif self.parameter['Normalization'] == 'L2':
                data_norm = Normalizer(norm='l2').fit_transform(self.dataset)
            else:
                data_norm = self.dataset
            # subtract mean
            data_centered = StandardScaler(
                with_std=False).fit_transform(data_norm)
            # Do PCA
            self.PCA = PCA(n_components=n_components)
            self.PCA.fit(data_centered)
            self.embedding = self.PCA.transform(data_centered)

        # save embedding to standardModelItem
        self.item.embedding = self.embedding
        # update cluster map
        self.computeCluster()

    def computeCluster(self):
        """Cluster the current embedding with KMeans and refresh all views.

        Does nothing when no embedding exists. Labels are stored 1-based in
        ``self.labels``; the value 0 in ``self.cluster_map`` marks pixels
        that were not part of the clustered selection.
        """
        # check if embeddings exist
        if self.embedding is None:
            return
        msg.showMessage('Compute clusters.')
        # get num of clusters
        n_clusters = self.parameter['Clusters']
        # set colorLUT
        self.colorLUT = cm.get_cmap('viridis',
                                    n_clusters + 1).colors[:, :3] * 255
        # compute cluster
        cluster_object = KMeans(n_clusters=n_clusters,
                                random_state=0).fit(self.embedding)
        self.labels = cluster_object.labels_ + 1

        # update cluster image
        n_rows, n_cols = self.imgShape[0], self.imgShape[1]
        if self.selectedPixels is None:  # full map
            self.cluster_map = self.labels.reshape(n_rows, n_cols)
        elif self.selectedPixels.size == 0:
            self.cluster_map = np.zeros((n_rows, n_cols), dtype=int)
        else:
            self.cluster_map = np.zeros((n_rows, n_cols), dtype=int)
            self.cluster_map[self.selectedPixels[:, 0],
                             self.selectedPixels[:, 1]] = self.labels
        # The image is stored flipped vertically; setImageCross indexes it
        # with the matching flipped row.
        self.cluster_map = np.flipud(self.cluster_map)
        self.clusterImage.setImage(self.cluster_map, levels=[0, n_clusters])
        self.clusterImage._image = self.cluster_map
        self.clusterImage.rc2ind = self.rc2ind
        self.clusterImage.row, self.clusterImage.col = n_rows, n_cols
        self.clusterImage.txt.setPos(self.clusterImage.col, 0)
        self.clusterImage.cross.show()

        # update cluster mean
        mean_spectra = []
        self.dfGroups = []
        if self.selectedPixels is None:
            n_spectra = len(self.data)
            self.dataList = np.zeros((n_spectra, len(self.wavenumbers)))
            dataIdx = np.arange(n_spectra)
            for i in range(n_spectra):
                self.dataList[i] = self.data[i]
        else:
            n_spectra = len(self.selectedPixels)
            self.dataList = np.zeros((n_spectra, len(self.wavenumbers)))
            dataIdx = np.zeros(n_spectra, dtype=int)
            for i in range(n_spectra):  # i: ith selected pixel
                row_col = tuple(self.selectedPixels[i])
                dataIdx[i] = self.rc2ind[row_col]
                self.dataList[i] = self.data[dataIdx[i]]

        for ii in range(1, n_clusters + 1):
            sel = (self.labels == ii)
            # save each group spectra to a dataFrame
            self.dfGroups.append(
                pd.DataFrame(self.dataList[sel],
                             columns=self.wavenumbers.tolist(),
                             index=dataIdx[sel]))
            # The mean uses only the selected wavenumber region (self.dataset),
            # while the saved groups hold the full spectra (self.dataList).
            this_mean = np.mean(self.dataset[sel, :], axis=0)
            mean_spectra.append(this_mean)
        self.mean_spectra = np.vstack(mean_spectra)
        self.clusterMeanPlot.setColors(self.colorLUT)
        self.clusterMeanPlot._data = self.mean_spectra
        self.clusterMeanPlot.wavenumbers = self.wavenumbers_select
        self.clusterMeanPlot.plotClusterSpectra()
        # update scatter plot
        self.updateScatterPlot()

    def saveCluster(self):
        """Write one CSV file per cluster next to the current map file.

        Files are named ``<map file name without extension>_cluster<k>.csv``
        with ``k`` starting at 1. Nothing is written before clusters have
        been computed.
        """
        if hasattr(self, 'cluster_map') and hasattr(self, 'mean_spectra'):
            filePath = self.pathList[self.selectMapidx]
            # get dirname and old filename
            dirName = os.path.dirname(filePath)
            # splitext drops the extension whatever its length; the former
            # fixed [:-3] slice was only right for two-letter extensions.
            baseName = os.path.splitext(os.path.basename(filePath))[0]
            n_clusters = self.parameter['Clusters']
            for i in range(n_clusters):
                # save dataFrames to csv file
                csvName = baseName + f'_cluster{i+1}.csv'
                newFilePath = os.path.join(dirName, csvName)
                self.dfGroups[i].to_csv(newFilePath)
            MsgBox(
                f'Cluster spectra groups were successfully saved at: {newFilePath}!'
            )

    def updateScatterPlot(self):
        """Redraw the scatter plot from the embedding components chosen by
        the ``'X Component'`` and ``'Y Component'`` parameters (1-based)."""
        if (self.embedding is None) or (self.labels is None):
            return
        # get scatter x, y values
        x_col = self.parameter['X Component'] - 1
        y_col = self.parameter['Y Component'] - 1
        self.clusterScatterPlot.scatterData = self.embedding[:, [x_col, y_col]]
        # get colormapings
        brushes = [mkBrush(self.colorLUT[x, :]) for x in self.labels]
        # make plots
        if hasattr(self, 'scatterPlot'):
            self.clusterScatterPlot.plotItem.clearPlots()
        self.scatterPlot = self.clusterScatterPlot.plotItem.plot(
            self.clusterScatterPlot.scatterData,
            pen=None,
            symbol='o',
            symbolBrush=brushes)
        self.clusterScatterPlot.getViewBox().autoRange(padding=0.1)
        self.clusterScatterPlot.getNN()

    def updateClusterParams(self, name):
        """Recompute whatever depends on the changed parameter ``name``."""
        if name == 'Components':
            self.computeEmbedding()
        elif name == 'Clusters':
            self.computeCluster()
        elif name in ['X Component', 'Y Component']:
            self.updateScatterPlot()

    def updateMap(self):
        """React to a selection change in the map list.

        Recomputes the embedding when the selected item already has an
        ``embedding`` attribute; otherwise resets the image and plots.
        """
        # get current map idx
        if not self.selectionmodel.selectedIndexes():  # no map is open
            return
        self.selectMapidx = self.selectionmodel.selectedIndexes()[0].row()
        # get current item
        self.item = self.headermodel.item(self.selectMapidx)
        if hasattr(self.item, 'embedding'):
            # compute embedding
            self.computeEmbedding()
        else:
            # reset cluster image and plots
            self.cleanUp()

    def showClusterMean(self, i):
        """Highlight the mean-spectrum curve of the cluster of point ``i``."""
        if self.mean_spectra is None:
            return
        self.clusterMeanPlot.curveHighLight(self.labels[i] - 1)

    def setImageCross(self, ind):
        """Move the cross and info text of the cluster image to pixel ``ind``."""
        row, col = self.ind2rc[ind]
        # update cross
        self.clusterImage.cross.setData([col + 0.5],
                                        [self.imgShape[0] - row - 0.5])
        # update text; the stored image is flipped vertically, hence the
        # mirrored row index
        val = self.clusterImage._image[self.imgShape[0] - row - 1, col]
        self.clusterImage.txt.setHtml(
            toHtml(f'Point: #{ind}', size=8) + toHtml(f'X: {col}', size=8) +
            toHtml(f'Y: {row}', size=8) + toHtml(f'Val: {val:d}', size=8))

    def cleanUp(self):
        """Reset the cluster image and clear the scatter and spectra plots.

        Returns early when nothing is selected and the header model is empty.
        """
        # NOTE(review): this attribute is spelled ``selectMapIdx`` here and in
        # updateRoiMask, while the rest of the class uses ``selectMapidx``;
        # they are two distinct attributes.
        if self.selectionmodel.hasSelection():
            self.selectMapIdx = self.selectionmodel.selectedIndexes()[0].row()
        elif self.headermodel.rowCount() > 0:
            self.selectMapIdx = 0
        else:
            return

        if hasattr(self, 'imgShapes') and (self.selectMapIdx < len(
                self.imgShapes)):
            shape = self.imgShapes[self.selectMapIdx]
            img = np.zeros((shape[0], shape[1]))
            self.clusterImage.setImage(img=img)
        if hasattr(self, 'scatterPlot'):
            self.clusterScatterPlot.plotItem.clearPlots()
            self.clusterScatterPlot.scatterData = None
        self.rawSpecPlot.clearAll()
        self.rawSpecPlot._data = None
        self.clusterMeanPlot.clearAll()
        self.clusterMeanPlot._data = None

    def updateRoiMask(self):
        """Sync the ROI, auto-mask and select-mask overlays with the states
        stored on the currently selected model item."""
        if self.selectionmodel.hasSelection():
            self.selectMapIdx = self.selectionmodel.selectedIndexes()[0].row()
        elif self.headermodel.rowCount() > 0:
            self.selectMapIdx = 0
        else:
            return

        # update roi
        try:
            roiState = self.headermodel.item(self.selectMapIdx).roiState
            if roiState[0]:  # roi on
                self.roi.show()
            else:
                self.roi.hide()
            # update roi state; always unblock signals, even if setState fails
            self.roi.blockSignals(True)
            try:
                self.roi.setState(roiState[1])
            finally:
                self.roi.blockSignals(False)
        except Exception:
            self.roi.hide()

        # update automask (best effort: the overlay is left as is on failure)
        try:
            maskState = self.headermodel.item(self.selectMapIdx).maskState
            self.maskItem.setImage(maskState[1])
            if maskState[0]:  # automask on
                self.maskItem.show()
            else:
                self.maskItem.hide()
        except Exception:
            pass

        # update selectMask (best effort: the overlay is left as is on failure)
        try:
            selectMaskState = self.headermodel.item(
                self.selectMapIdx).selectState
            self.selectMaskItem.setImage(selectMaskState[1])
            if selectMaskState[0]:  # selectmask on
                self.selectMaskItem.show()
            else:
                self.selectMaskItem.hide()
        except Exception:
            pass

    def setHeader(self, field: str):
        """Collect the per-map metadata and spectra of every header.

        Args:
            field: Name of the event field used to fetch each header's data
                event.
        """
        self.headers = [
            self.headermodel.item(i).header
            for i in range(self.headermodel.rowCount())
        ]
        self.field = field
        self.wavenumberList = []
        self.imgShapes = []
        self.rc2indList = []
        self.ind2rcList = []
        self.pathList = []
        self.dataSets = []

        # get wavenumbers, imgShapes, rc2ind
        for header in self.headers:
            dataEvent = next(header.events(fields=[field]))
            self.wavenumberList.append(dataEvent['wavenumbers'])
            self.imgShapes.append(dataEvent['imgShape'])
            self.rc2indList.append(dataEvent['rc_index'])
            self.ind2rcList.append(dataEvent['index_rc'])
            self.pathList.append(dataEvent['path'])
            # get raw spectra
            data = None
            try:  # spectra datasets
                data = header.meta_array('spectra')
            except IndexError:
                msg.logMessage(
                    f'Header object contained no frames with field {field}.',
                    msg.ERROR)
            # NOTE(review): when loading fails nothing is appended, so
            # self.dataSets can end up shorter than the other per-map lists.
            if data is not None:
                self.dataSets.append(data)
        self.cleanUp()

    def isMapOpen(self):
        """Load the state of the selected map onto the widget.

        Returns:
            bool: ``False`` when no map is selected, otherwise ``True`` after
            the per-map attributes (item, wavenumbers, index maps, image
            shape, data, selected pixels) have been refreshed.
        """
        if not self.selectionmodel.selectedIndexes():  # no map is open
            return False

        self.selectMapidx = self.selectionmodel.selectedIndexes()[0].row()
        # get current data
        self.item = self.headermodel.item(self.selectMapidx)
        self.selectedPixels = self.item.selectedPixels
        self.clusterScatterPlot.selectedPixels = self.selectedPixels
        self.currentHeader = self.headers[self.selectMapidx]
        self.wavenumbers = self.wavenumberList[self.selectMapidx]
        self.rc2ind = self.rc2indList[self.selectMapidx]
        self.ind2rc = self.ind2rcList[self.selectMapidx]
        self.clusterScatterPlot.ind2rc = self.ind2rc
        self.clusterScatterPlot.rc2ind = self.rc2ind
        self.imgShape = self.imgShapes[self.selectMapidx]
        self.data = self.dataSets[self.selectMapidx]
        self.rawSpecPlot.setHeader(self.currentHeader, 'spectra')
        if self.selectedPixels is not None:
            # Two-way lookup between a selected pixel's (row, col) and its
            # position within the selection.
            self.clusterScatterPlot.selPx_rc2ind = {
                tuple(px): i
                for i, px in enumerate(self.selectedPixels)
            }
            self.clusterScatterPlot.selPx_ind2rc = {
                i: tuple(px)
                for i, px in enumerate(self.selectedPixels)
            }
        return True
# NOTE(review): the next line is the tail of a call that starts before this
# excerpt; it is kept exactly as found.
projection=[0, 1])

# Generate the shape graph from the lens
graph = mapper.map(
    lens,
    X=X,
    cover=Cover(20, 0.5),
    clusterer=optimize_dbscan(X, k=3, p=100.0),
)

# Convert to a DyNeuGraph
dG = DyNeuGraph(G=graph, y=y)

# Define some custom_layouts: the lens itself plus networkx layout functions
dG.add_custom_layout(lens, name='lens')
dG.add_custom_layout(nx.spring_layout, name='nx.spring')
dG.add_custom_layout(nx.kamada_kawai_layout, name='nx.kamada_kawai')
dG.add_custom_layout(nx.spectral_layout, name='nx.spectral')
dG.add_custom_layout(nx.circular_layout, name='nx.circular')

# Configure some projections; UMAP is initialised from the PCA output
pca = PCA(2, random_state=1)
tsne = TSNE(2, init='pca', random_state=1)
umap = UMAP(n_components=2, init=pca.fit_transform(X))

# Add projections as custom_layouts; UMAP is added twice, once fitted with
# y=None and once fitted with y=target
dG.add_custom_layout(pca.fit_transform(X), name='PCA')
dG.add_custom_layout(tsne.fit_transform(X), name='TSNE')
dG.add_custom_layout(umap.fit_transform(X, y=None), name='UMAP')
dG.add_custom_layout(umap.fit_transform(X, y=target), name='Supervised UMAP')

# Visualize
dG.visualize(static=True, show=True)