def update_graph(amt, hrs):
    """Re-project the transactions selected by the two range sliders.

    Rows of the module-level ``gdf`` frame are kept when ``Time`` lies in
    ``[hrs[0], hrs[1]]`` and ``Amount`` lies in ``[amt[0], amt[1]]`` (both
    bounds inclusive). Columns ``V1`` through ``V28`` of the kept rows are
    embedded with a freshly constructed ``cuml.UMAP()``.

    Args:
        amt: Indexable pair ``(low, high)`` bounding ``Amount``.
        hrs: Indexable pair ``(low, high)`` bounding ``Time``.

    Returns:
        A ``(figure, alert)`` tuple: the plotly scatter of the embedding
        coloured by rounded amount, and a ``dbc.Alert`` reporting how many
        rows were projected and how long it took.
    """
    started = time.time()

    # Row selection: both slider ranges are inclusive at each end.
    in_time_range = (gdf.Time >= hrs[0]) & (gdf.Time <= hrs[1])
    in_amount_range = (gdf.Amount >= amt[0]) & (gdf.Amount <= amt[1])
    selected = gdf.loc[in_time_range & in_amount_range]

    # Fit a new UMAP model on the V1..V28 feature columns of the selection.
    feature_matrix = selected.loc[:, "V1":"V28"].values
    projected = cuml.UMAP().fit_transform(feature_matrix)

    # plotly is handed NumPy arrays, so convert via cupy.asnumpy first.
    points = cp.asnumpy(projected)
    rounded_amounts = cp.asnumpy(selected.Amount.values.round(2))

    figure = px.scatter(
        x=points[:, 0],
        y=points[:, 1],
        color=rounded_amounts,
        labels={"color": "Amount ($)"},
        title="UMAP projection of credit card transactions",
    )

    elapsed = time.time() - started
    summary = f"Projected {points.shape[0]} transactions in {elapsed:.2f}s."
    return figure, dbc.Alert(summary, color="success", dismissable=True)
def get_UMAP_prjs(input_data, cpu=True, **kwargs):
    """Return the UMAP projections of `input_data`.

    `**kwargs` is forwarded unchanged to the UMAP constructor. With
    `cpu=True` the reducer is `umap.UMAP`; otherwise it is `cuml.UMAP`.

    Note that the warning filter installed here is process-wide: it stays
    active after the function returns.
    """
    # silence NumbaPerformanceWarning
    warnings.filterwarnings("ignore", category=NumbaPerformanceWarning)

    if cpu:
        reducer = umap.UMAP(**kwargs)
    else:
        reducer = cuml.UMAP(**kwargs)
    return reducer.fit_transform(input_data)
def getMapper(self, X, y=None, **kwargs):
    """Return the cached UMAP mapper, fitting it on `X` on first use.

    When `self._mapper` is already set it is returned untouched and `X` is
    ignored. Otherwise a `cuml.UMAP` is built from the instance's `init`,
    `n_neighbors`, `n_components`, `n_epochs` and `min_dist` settings
    (NumPy output), fitted on `self.getDataFrame(X)`, cached and returned.

    `y` and `**kwargs` are accepted but not used by this method.
    """
    if self._mapper is not None:
        return self._mapper

    fit_started = time.time()
    print(f"Computing embedding, input shape = {X.shape}")
    input_data: cudf.DataFrame = self.getDataFrame(X)

    umap_settings = dict(
        init=self.init,
        n_neighbors=self.n_neighbors,
        n_components=self.n_components,
        n_epochs=self.n_epochs,
        min_dist=self.min_dist,
        output_type="numpy",
    )
    self._mapper = cuml.UMAP(**umap_settings)
    self._mapper.fit(input_data)

    # NOTE(review): `self._embedding_` is not assigned anywhere in this
    # method; presumably it is a property defined elsewhere on the class
    # (e.g. exposing the fitted mapper's embedding) - confirm.
    print(
        f"Completed umap fit in time {time.time() - fit_started} sec, embedding shape = {self._embedding_.shape}"
    )
    return self._mapper
def cuml_umap(config, feature):
    """Embed `feature` with cuML's UMAP and return the result as a NumPy array.

    Args:
        config: Mapping providing the UMAP settings `n_components`,
            `n_neighbors`, `n_epochs`, `min_dist`, `spread`,
            `negative_sample_rate`, `init` and `repulsion_strength`.
        feature: 2-D data with a `.shape`; its first axis is the number of
            rows to embed.

    Returns:
        An array of shape `(feature.shape[0], config['n_components'])`
        allocated with `np.zeros` and filled with the embedding.
    """
    # Import RAPIDS
    import cudf
    import cuml

    print("INSIDE CUML_UMAP")
    print(feature.shape)

    n_rows = feature.shape[0]
    result = np.zeros((n_rows, config['n_components']))

    # TODO: if memory ever becomes a problem, try a float32 frame and check
    # whether the smaller dtype changes the embeddings:
    # gpu_frame = cudf.DataFrame(feature, dtype='float32')
    gpu_frame = cudf.DataFrame(feature)

    reducer = cuml.UMAP(
        n_components=config['n_components'],
        n_neighbors=config['n_neighbors'],
        n_epochs=config['n_epochs'],
        min_dist=config['min_dist'],
        spread=config['spread'],
        negative_sample_rate=config['negative_sample_rate'],
        init=config['init'],
        repulsion_strength=config['repulsion_strength'],
        output_type='numpy',
    )
    gpu_embedding = reducer.fit_transform(gpu_frame)

    # Copy into the pre-allocated buffer so the returned array keeps the
    # dtype chosen by np.zeros rather than whatever cuML produced.
    result[:, 0:config['n_components']] = gpu_embedding
    return result
def prep_trumap_inputs(
        interpret_session: InterpretationSession,
        sid_idx: torch.Tensor,
        topk_sim_full_embeds: List,
        target_text: str,
        target_pred_ind: int) -> Tuple[List[Tuple], Tuple, Tuple]:
    """Project the embeddings to 3-D with UMAP and package them for plotting.

    The rows of `topk_sim_full_embeds` are embedded together, then the whole
    matrix is divided by its overall norm (no axis is passed to
    `np.linalg.norm`). The last three rows are then removed from the end: in
    pop order they are the max-truth point, the max-falsehood point and the
    target-sentence point. The remaining rows are paired with `sid_idx`.

    Returns:
        `(trumap_spectrum, target_token_tup, umap_bounds_tup)` where
        `trumap_spectrum` is a list of `(statement text, label, xyz)` tuples,
        `target_token_tup` is `(target_text, target_pred_ind, xyz)` and
        `umap_bounds_tup` is `(max_falsehood_xyz, max_truth_xyz)`.
    """
    embed_rows = [tuple(k) for k in topk_sim_full_embeds]
    full_embed_df = cudf.DataFrame(embed_rows)

    reducer = cuml.UMAP(n_neighbors=5,
                        n_components=3,
                        n_epochs=500,
                        min_dist=0.1)
    projected = reducer.fit_transform(full_embed_df).as_matrix()
    normed_points = (projected / np.linalg.norm(projected)).tolist()

    # The three reference points sit at the tail; pop order matters.
    max_truth_umap = normed_points.pop()
    max_falsehood_umap = normed_points.pop()
    target_sent_umap = normed_points.pop()

    trumap_spectrum = [
        (interpret_session.stmt_embed_dict['stext'][stmt_idx],
         interpret_session.stmt_embed_dict['labels'][stmt_idx],
         tuple(point))
        for stmt_idx, point in zip(sid_idx.tolist(), normed_points)
    ]

    target_token_tup = (target_text, target_pred_ind, tuple(target_sent_umap))
    umap_bounds_tup = (tuple(max_falsehood_umap), tuple(max_truth_umap))
    return trumap_spectrum, target_token_tup, umap_bounds_tup
def reduce_to_3D(data, labels, dimReductionMethod, trainedEmbeddingModel=None):
    """Embed `data` into three dimensions with PCA or UMAP.

    Args:
        data: Input samples; passed to the model's `fit`/`transform` as `X`.
        labels: Targets handed to `cuml.UMAP.fit` as `y` (unused for PCA and
            when a pre-trained model is supplied).
        dimReductionMethod: `'PCA'` or `'UMAP'`. Only consulted to pick a
            model when `trainedEmbeddingModel` is None; always used in the
            timing message.
        trainedEmbeddingModel: Optional already-fitted model exposing
            `transform(X=...)`. When given, no fitting is done.

    Returns:
        `(embeddedData, embeddingModel)`: the transformed data and the model
        that produced it (the supplied one, or the newly fitted one).

    Raises:
        ValueError: if no pre-trained model is given and
            `dimReductionMethod` is not `'PCA'` or `'UMAP'`.
    """
    startTime = time.time()
    preTrainedStr = ''

    # A t-SNE path (cuml.TSNE with 2 components plus a zero-filled third
    # column) was previously disabled here and is intentionally not offered.

    if trainedEmbeddingModel is not None:
        preTrainedStr = 'pre-trained '
        embeddingModel = trainedEmbeddingModel
    elif dimReductionMethod == 'PCA':
        embeddingModel = cuml.PCA(copy=True,
                                  n_components=3,
                                  random_state=0,
                                  svd_solver='full',
                                  verbose=True,
                                  whiten=False).fit(X=data)
    elif dimReductionMethod == 'UMAP':
        embeddingModel = cuml.UMAP(n_components=3).fit(X=data, y=labels)
    else:
        # The previous `assert ('...')` asserted a non-empty string, which is
        # always true, so unknown methods slipped through to a confusing
        # UnboundLocalError further down.
        raise ValueError(
            'unable to find embedding model match to user query: '
            f'{dimReductionMethod!r}')

    embeddedData = embeddingModel.transform(X=data)

    elapsedTime = time.time() - startTime
    print(
        f'{embeddedData.shape} via {preTrainedStr}{dimReductionMethod} -- completed in: {elapsedTime:.3f} seconds'
    )
    return embeddedData, embeddingModel
# Registries of default-constructed cuML estimators, keyed by class name.
# Every estimator below is instantiated once, when this module is imported.
cluster_models = dict(KMeans=cuml.KMeans())
decomposition_models = dict(
    PCA=cuml.PCA(),
    TruncatedSVD=cuml.TruncatedSVD(),
)
# NOTE(review): the `_xfail` suffix suggests these are expected to fail the
# tests that consume them - confirm against the test that uses this dict.
decomposition_models_xfail = dict(
    GaussianRandomProjection=cuml.GaussianRandomProjection(),
    SparseRandomProjection=cuml.SparseRandomProjection())
neighbor_models = dict(NearestNeighbors=cuml.NearestNeighbors())
dbscan_model = dict(DBSCAN=cuml.DBSCAN())
umap_model = dict(UMAP=cuml.UMAP())


def unit_param(*args, **kwargs):
    """Build a `pytest.param` from the arguments, marked `pytest.mark.unit`."""
    return pytest.param(*args, **kwargs, marks=pytest.mark.unit)


def quality_param(*args, **kwargs):
    """Build a `pytest.param` from the arguments, marked `pytest.mark.quality`."""
    return pytest.param(*args, **kwargs, marks=pytest.mark.quality)


def stress_param(*args, **kwargs):
    """Build a `pytest.param` from the arguments, marked `pytest.mark.stress`."""
    return pytest.param(*args, **kwargs, marks=pytest.mark.stress)


def pickle_save_load(tmpdir, model):
# Registries mapping a model name to a zero-argument factory. Each lambda
# constructs a new default estimator every time it is called, so nothing is
# instantiated at import time.
decomposition_models = {
    "PCA": lambda: cuml.PCA(),
    "TruncatedSVD": lambda: cuml.TruncatedSVD(),
}
# NOTE(review): the `_xfail` suffix suggests these are expected to fail the
# tests that consume them - confirm against the test that uses this dict.
decomposition_models_xfail = {
    "GaussianRandomProjection": lambda: cuml.GaussianRandomProjection(),
    "SparseRandomProjection": lambda: cuml.SparseRandomProjection()
}
neighbor_models = {"NearestNeighbors": lambda: cuml.NearestNeighbors()}
dbscan_model = {"DBSCAN": lambda: cuml.DBSCAN()}
umap_model = {"UMAP": lambda: cuml.UMAP()}
rf_models = {
    "rfc": lambda: cuml.RandomForestClassifier(),
    "rfr": lambda: cuml.RandomForestRegressor()
}
# Merge of the individual registries; on a duplicate key the later registry
# in this list wins (dict-unpacking order).
all_models = {
    **regression_models,
    **solver_models,
    **cluster_models,
    **decomposition_models,
    **decomposition_models_xfail,
    **neighbor_models,
    **dbscan_model,
    **umap_model,
def plot_comparison(checkpoint_id,
                    num_progressions=5,
                    num_classes=20,
                    num_samples=10000,
                    algorithm='umap'):
    """Plot 2-D projections of the features at several training checkpoints.

    Checkpoints are read from ``pcl_cifar10_<checkpoint_id[0]>``. For each
    selected checkpoint the module-level ``model`` is loaded with the saved
    ``state_dict``, features are computed with ``compute_features``, and the
    projection is drawn twice: coloured by class (top row) and coloured by
    the DBSCAN cluster assignment (bottom row). The figure is written to
    ``imgs/<algorithm>_<checkpoint_id[0]>/comparison``.

    Args:
        checkpoint_id: Indexable whose first element names the run.
        num_progressions: Number of checkpoints to plot (>= 1). The last
            checkpoint is always included; the others are evenly strided.
        num_classes: Only samples whose class is below this value are kept.
        num_samples: Upper bound on the number of samples projected.
        algorithm: ``'umap'`` (cuml.UMAP) or ``'tsne'`` (cudaTSNE).

    Raises:
        ValueError: if ``algorithm`` is not supported or
            ``num_progressions`` is smaller than 1.
        AssertionError: if fewer than ``num_progressions`` checkpoints exist.
    """
    # Validate up front: previously an unknown algorithm only surfaced as a
    # NameError on `y` after a checkpoint had been loaded and processed.
    if algorithm not in ('umap', 'tsne'):
        raise ValueError(
            "algorithm must be 'umap' or 'tsne', got {!r}".format(algorithm))
    if num_progressions < 1:
        raise ValueError('num_progressions must be at least 1')

    checkpoints = os.listdir('pcl_cifar10_{}'.format(checkpoint_id[0]))
    checkpoints.sort()
    assert num_progressions <= len(
        checkpoints), 'Not enough checkpoints saved.'

    if num_progressions > 1:
        # Evenly strided subset, then force the final checkpoint in last.
        stride = len(checkpoints) // (num_progressions - 1)
        checkpoints = (checkpoints[::stride][:num_progressions - 1] +
                       [checkpoints[-1]])
    else:
        # A single panel: the stride above would divide by zero.
        checkpoints = [checkpoints[-1]]

    fig, axes = plt.subplots(2, len(checkpoints), figsize=(60, 30))
    print(checkpoints)

    for i, checkpoint_file in enumerate(checkpoints):
        epoch = checkpoint_file[11:15]  # just the 4-digit checkpoint epoch
        print('epoch: {}'.format(epoch))

        checkpoint = torch.load('pcl_cifar10_{}/{}'.format(
            checkpoint_id[0], checkpoint_file))
        model.load_state_dict(checkpoint['state_dict'])

        features, classes = compute_features(eval_loader,
                                             model,
                                             low_dim=low_dim,
                                             gpu=gpu)
        # account for the few samples that are computed twice
        features[torch.norm(features, dim=1) > 1.5] /= 2
        features = features.numpy()

        # Keep only the first `num_classes` classes, then cap the sample count.
        restricted_classes = np.array(
            [cls for cls in classes if cls < num_classes])
        features = features[np.array(classes) < num_classes]
        features = features[:num_samples]
        restricted_classes = restricted_classes[:num_samples]

        if algorithm == 'umap':
            reducer = cuml.UMAP(n_neighbors=60,
                                min_dist=0.1,
                                n_components=2,
                                n_epochs=1000)
            y = reducer.fit_transform(features)
        else:  # 'tsne' (validated above)
            tsne = cudaTSNE(n_components=2,
                            perplexity=50,
                            learning_rate=600,
                            verbose=1,
                            n_iter=2500,
                            metric='euclidean')
            y = tsne.fit_transform(features)

        # Top row: projection coloured by the class labels.
        axes.flat[i].scatter(y[:, 0],
                             y[:, 1],
                             c=restricted_classes,
                             cmap='Spectral',
                             s=3)

        with torch.no_grad():
            results = run_dbscan(features,
                                 minPts=200,
                                 minSamples=0,
                                 temperature=0.2,
                                 eps=0.3)
        # remember to turn this back to a list
        im2cluster = results['im2cluster'][0].tolist()

        # Bottom row: same projection coloured by cluster assignment.
        # restricting num_classes does not work here
        axes.flat[i + num_progressions].scatter(y[:, 0],
                                                y[:, 1],
                                                c=im2cluster,
                                                cmap='Spectral',
                                                s=3)

        axes.flat[i].set_title('Cifar Classes, epoch: {}'.format(epoch))
        axes.flat[i + num_progressions].set_title(
            'Clustering Classes, epoch: {}'.format(epoch))

    fig.suptitle('{}_{} Comparison'.format(algorithm, checkpoint_id[0]),
                 fontsize=50)

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs('imgs/{}_{}'.format(algorithm, checkpoint_id[0]),
                exist_ok=True)
    save_path = 'imgs/{}_{}/comparison'.format(algorithm, checkpoint_id[0])
    fig.savefig(save_path)
    print('Figure saved to : {}'.format(save_path))
# NOTE(review): this print is presumably the tail of a branch whose header
# is not visible here - confirm its nesting against the surrounding code.
print('PCA has been skipped')

# K-Means with a fixed cluster count, on the GPU (cuML) or CPU (scikit-learn)
# implementation depending on `enable_gpu`; the fit is timed for the report.
task_start_time = datetime.now()
n_clusters = 7
if enable_gpu:
    kmeans_float = cuml.KMeans(n_clusters=n_clusters)
else:
    kmeans_float = sklearn.cluster.KMeans(n_clusters=n_clusters)
kmeans_float.fit(df_fingerprints)
print('Runtime Kmeans time (hh:mm:ss.ms) {}'.format(datetime.now() -
                                                    task_start_time))

# UMAP
task_start_time = datetime.now()
if enable_gpu:
    umap = cuml.UMAP(n_neighbors=100, a=1.0, b=1.0, learning_rate=1.0)
else:
    # NOTE(review): this rebinds the name `umap` from the module to the
    # reducer instance. If this code runs inside a function, `umap` is a
    # local there and this line raises UnboundLocalError on the CPU path -
    # confirm the enclosing scope and consider renaming the variable.
    umap = umap.UMAP()
Xt = umap.fit_transform(df_fingerprints)
print('Runtime UMAP time (hh:mm:ss.ms) {}'.format(datetime.now() -
                                                  task_start_time))

# Attach the 2-D coordinates and the cluster labels to the fingerprint frame.
# The GPU branch reads the embedding by column label (Xt[0], Xt[1]) and uses
# add_column; the CPU branch slices a 2-D array and assigns columns directly.
if enable_gpu:
    df_fingerprints.add_column('x', Xt[0].to_array())
    df_fingerprints.add_column('y', Xt[1].to_array())
    df_fingerprints.add_column('cluster', kmeans_float.labels_)
else:
    df_fingerprints['x'] = Xt[:, 0]
    df_fingerprints['y'] = Xt[:, 1]
    df_fingerprints['cluster'] = kmeans_float.labels_
# Bring the k-NN results into CuPy through DLPack and flatten them to 1-D so
# they can serve as the `data` and `indices` arrays of a CSR matrix.
distances = cupy.ravel(cupy.fromDlpack(dist_mlarr.to_dlpack()))
indices = cupy.ravel(cupy.fromDlpack(ind_mlarr.to_dlpack()))
print(
    f"Computed KNN graph, distances shape = {distances.shape}, indices shape = {indices.shape}, distances[0:5]= {distances[0:5]}, indices[0:5]= {indices[0:5]}"
)

# After ravel() `indices` holds one entry per (sample, neighbor) pair, so its
# length is the number of stored values, not the number of samples. Taking it
# as the sample count made both `rowptr` and the matrix shape too large by a
# factor of `n_neighbors`.
# Assumes `n_neighbors` is the k used for the k-NN query (the row-pointer
# stride already relied on that) - confirm against the code that built it.
n_nonzero = indices.shape[0]
n_samples = n_nonzero // n_neighbors
rowptr = cupy.arange(0, n_nonzero + 1, n_neighbors)
knn_graph = cupyx.scipy.sparse.csr_matrix((distances, indices, rowptr),
                                          shape=(n_samples, n_samples))
print(f"Completed KNN, graph shape = {knn_graph.shape}")

# Embed using the precomputed graph instead of letting UMAP build its own.
reducer = cuml.UMAP(n_neighbors=15,
                    n_components=3,
                    n_epochs=500,
                    min_dist=0.1,
                    output_type="numpy")
embedding = reducer.fit_transform(data, knn_graph=knn_graph)
print(f"Completed embedding, shape = {embedding.shape}")

# Disabled datashader rendering of the embedding, kept for reference:
# df = embedding.to_pandas()
# df.columns = ["x", "y"]
# df['class'] = pd.Series([str(x) for x in target.to_array()], dtype="category")
#
# cvs = ds.Canvas(plot_width=400, plot_height=400)
# agg = cvs.points(df, 'x', 'y', ds.count_cat('class'))
# img = tf.shade(agg, color_key=color_key, how='eq_hist')
#
# utils.export_image(img, filename='fashion-mnist', background='black')
#
# Registries mapping a model name to a zero-argument factory. Each lambda
# constructs a new default estimator every time it is called.
# NOTE(review): the `_xfail` suffix suggests these are expected to fail the
# tests that consume them - confirm against the test that uses this dict.
decomposition_models_xfail = {
    "GaussianRandomProjection": lambda: cuml.GaussianRandomProjection(),
    "SparseRandomProjection": lambda: cuml.SparseRandomProjection()
}

neighbor_models = {
    "NearestNeighbors": lambda: cuml.NearestNeighbors()
}

dbscan_model = {
    "DBSCAN": lambda: cuml.DBSCAN()
}

umap_model = {
    "UMAP": lambda: cuml.UMAP()
}

# Random forests are split into separate classifier and regressor registries.
rf_classification_model = {
    "rfc": lambda: cuml.RandomForestClassifier()
}

rf_regression_model = {
    "rfr": lambda: cuml.RandomForestRegressor()
}


def pickle_save_load(tmpdir, func_create_model, func_assert):
    # `func_create_model` is called with no arguments and must return a
    # `(model, X_test)` pair.
    model, X_test = func_create_model()
    pickle_file = tmpdir.join('cu_model.pickle')
def __init__(self, df, n_clusters, chembl_ids, enable_gpu=True, pca_model=False):
    """Create the Dash app, prepare the data and wire up all callbacks.

    Args:
        df: Data frame to display. It is modified in place: `chembl_id`
            and `id` columns are added to it.
        n_clusters: Stored on the instance as `self.n_clusters`.
        chembl_ids: Identifiers written to the `chembl_id` column and used
            to fetch the molecule properties.
        enable_gpu: When true the reducer is `cuml.UMAP`, otherwise
            `umap.UMAP`.
        pca_model: Stored on the instance as `self.pca`.
    """
    self.app = dash.Dash(
        __name__, external_stylesheets=external_stylesheets)
    self.df = df
    self.n_clusters = n_clusters
    self.chembl_ids = chembl_ids
    self.enable_gpu = enable_gpu
    self.pca = pca_model

    # Fetch relevant properties from database.
    self.prop_df = self.create_dataframe_molecule_properties(chembl_ids)

    # `self.df` is the caller's `df` object, so these columns are added to
    # it directly and the snapshot taken below already contains them.
    self.df['chembl_id'] = chembl_ids
    self.df['id'] = self.df.index
    self.orig_df = df.copy()

    # initialize UMAP
    if enable_gpu:
        self.umap = cuml.UMAP(n_neighbors=100,
                              a=1.0,
                              b=1.0,
                              learning_rate=1.0)
    else:
        self.umap = umap.UMAP()

    # Construct the UI
    self.app.layout = self.constuct_layout()

    # Each `self.app.callback(...)` call returns a decorator that is applied
    # immediately to the bound handler named in the trailing parentheses.

    # Register callbacks for selection inside main figure
    self.app.callback(
        [Output('selected_clusters', 'value'),
         Output('selected_point_cnt', 'children')],
        [Input('main-figure', 'clickData'),
         Input('main-figure', 'selectedData'),
         Input('bt_recluster_clusters', 'n_clicks'),
         Input('bt_recluster_points', 'n_clicks'),
         Input('northstar_cluster', 'children')],
        [State("selected_clusters", "value")]) (self.handle_data_selection)

    # Register callbacks for buttons for reclustering selected data
    self.app.callback(
        [Output('main-figure', 'figure'),
         Output('northstar_cluster', 'children'),
         Output('north_star_clusterid_map', 'children')],
        [Input('bt_recluster_clusters', 'n_clicks'),
         Input('bt_recluster_points', 'n_clicks'),
         Input('bt_north_star', 'n_clicks'),
         Input('hidden_northstar', 'value'),
         Input('sl_prop_gradient', 'value'),
         Input('sl_nclusters', 'value'),],
        [State("selected_clusters", "value"),
         State("main-figure", "selectedData"),
         State('north_star', 'value'),]) (self.handle_re_cluster)

    # Register callbacks for selection inside main figure to update module details
    self.app.callback(
        [Output('tb_selected_molecules', 'children'),
         Output('sl_mol_props', 'options'),
         Output('current_page', 'children'),
         Output('total_page', 'children'),
         Output('section_molecule_details', 'style')],
        [Input('main-figure', 'selectedData'),
         Input('sl_mol_props', 'value'),
         Input('sl_prop_gradient', 'value'),
         Input('bt_page_prev', 'n_clicks'),
         Input('bt_page_next', 'n_clicks'),
         Input('north_star_clusterid_map', 'children')],
        State('current_page', 'children')) (self.handle_molecule_selection)

    # Reset button handler.
    self.app.callback(
        Output("hidden1", "children"),
        [Input("bt_reset", "n_clicks")]) (self.handle_reset)

    # Pattern-matching input: fires for any component whose id has
    # role 'bt_star_candidate', whatever its index (`ALL`).
    self.app.callback(
        [Output('north_star', 'value'),
         Output('hidden_northstar', 'value')],
        [Input({'role': 'bt_star_candidate', 'index': ALL}, 'n_clicks')],
        State('north_star', 'value')) \
        (self.handle_mark_north_star)
# Embed track audio features with cuML UMAP and save the result as .npy.
#
# Reads data/data.csv, keeps two identifying columns plus ten numeric audio
# features, min-max scales the features, projects them with UMAP on the GPU
# and writes the embedding to result/embeddings.npy.
import numpy as np  # required by np.save below; was missing from the imports
import pandas as pd

import cudf
import cuml

df = pd.read_csv("data/data.csv")
columns = [
    'name', 'artists', 'acousticness', 'danceability', 'energy',
    'instrumentalness', 'key', 'liveness', 'loudness', 'speechiness', 'tempo',
    'valence'
]
df_mod = df[columns]

# First two columns identify the track; the remaining ones are the features.
keys = df_mod.iloc[:, :2].values.tolist()
features = df_mod.iloc[:, 2:].to_numpy()

# Min-max scaling with the minimum and maximum of the WHOLE matrix (no axis
# is given), so all columns share one scale.
# NOTE(review): per-column scaling may have been intended - confirm.
features = (features - features.min()) / (features.max() - features.min())

df = cudf.DataFrame(features)
embed = cuml.UMAP(n_neighbors=20,
                  n_epochs=100,
                  min_dist=0.1,
                  init='spectral').fit_transform(df)

# Move the embedding back to host memory before saving.
np_embed = embed.to_pandas().to_numpy()
np.save("result/embeddings.npy", np_embed)