def init_embedding_from_graph(
    _raw_data, graph, n_components, random_state, metric, _metric_kwds,
    init="spectral"
):
    """Initialize embedding using graph.

    This is for direct embeddings.

    Parameters
    ----------
    _raw_data
        Forwarded unchanged to ``spectral_layout``; only used when
        ``init="spectral"``.
    graph
        Graph of the data. ``graph.shape[0]`` gives the number of rows of
        the generated embedding, and the graph itself is forwarded to
        ``spectral_layout`` for spectral initialization.
    n_components : int
        Number of columns of the generated embedding.
    random_state
        Random number source providing ``uniform`` and ``normal``. When
        ``None`` it is replaced by ``check_random_state(None)``.
    metric, _metric_kwds
        Forwarded to ``spectral_layout`` as ``metric`` and ``metric_kwds``.
    init : str or array-like, optional
        Type of initialization to use, by default "spectral":

        * ``"random"`` -- uniform samples in ``[-10, 10)`` as float32.
        * ``"spectral"`` -- the spectral layout rescaled so that its
          largest absolute coordinate is 10, plus Gaussian noise of scale
          1e-4, as float32.
        * a 2-D array-like of initial coordinates -- used as given, except
          that when it contains duplicate rows Gaussian noise is added
          (scale: 0.001 times the mean of the second column of a ``k=2``
          KDTree distance query on the rows).

    Returns
    -------
    embedding : np.array
        the initialized embedding

    Raises
    ------
    ValueError
        If ``init`` is neither ``"random"``, ``"spectral"`` nor something
        that converts to a 2-D array.
    """
    if random_state is None:
        random_state = check_random_state(None)

    if isinstance(init, str) and init == "random":
        return random_state.uniform(
            low=-10.0, high=10.0, size=(graph.shape[0], n_components)
        ).astype(np.float32)

    if isinstance(init, str) and init == "spectral":
        initialisation = spectral_layout(
            _raw_data,
            graph,
            n_components,
            random_state,
            metric=metric,
            metric_kwds=_metric_kwds,
        )
        expansion = 10.0 / np.abs(initialisation).max()
        # We add a little noise to avoid local minima for optimization to
        # come
        noise = random_state.normal(
            scale=0.0001, size=[graph.shape[0], n_components]
        ).astype(np.float32)
        return (initialisation * expansion).astype(np.float32) + noise

    # Anything else is treated as user-supplied initial coordinates.
    init_data = np.array(init)
    if init_data.ndim != 2:
        # Previously this case fell through every branch and failed with an
        # UnboundLocalError on the final return.
        if isinstance(init, str):
            got = f"the string {init!r}"
        else:
            got = f"an array of shape {init_data.shape}"
        raise ValueError(
            "init must be 'random', 'spectral' or a 2-D array of initial "
            f"coordinates; got {got}"
        )

    has_duplicate_rows = (
        np.unique(init_data, axis=0).shape[0] < init_data.shape[0]
    )
    if not has_duplicate_rows:
        return init_data

    # Duplicate rows are perturbed with noise that is small relative to the
    # mean k=2 query distance, so identical points do not start out exactly
    # on top of each other.
    tree = KDTree(init_data)
    dist, _ = tree.query(init_data, k=2)
    nndist = np.mean(dist[:, 1])
    return init_data + random_state.normal(
        scale=0.001 * nndist, size=init_data.shape
    ).astype(np.float32)
def fit(self, X, y=None, **fit_params):
    """Fit one UMAP model per dataset and jointly optimise aligned embeddings.

    Parameters
    ----------
    X : list, tuple or np.ndarray
        Sequence of datasets; one ``UMAP`` model is fitted per element.
    y : ignored
        Not used by this method.
    **fit_params
        ``relations`` (required): list or tuple with exactly
        ``len(X) - 1`` entries, passed to ``expand_relations``.
        ``window_size`` (optional): overrides
        ``self.alignment_window_size``.

    Returns
    -------
    self
        With ``dict_relations_``, ``n_models_``, ``mappers_`` and
        ``embeddings_`` set.

    Raises
    ------
    ValueError
        If ``relations`` is missing, or if ``n_components`` is a list,
        tuple or array instead of a single value.
    AssertionError
        If ``relations`` or ``X`` has the wrong container type, or their
        lengths are inconsistent.
    """
    if "relations" not in fit_params:
        raise ValueError(
            "Aligned UMAP requires relations between data to be "
            "specified")

    self.dict_relations_ = fit_params["relations"]

    # These checks are raised explicitly (rather than via ``assert``) so
    # they are not stripped under ``python -O``. AssertionError is kept as
    # the exception type so existing callers that catch it keep working.
    if not isinstance(self.dict_relations_, (list, tuple)):
        raise AssertionError("relations must be a list or tuple")
    if not isinstance(X, (list, tuple, np.ndarray)):
        raise AssertionError("X must be a list, tuple or numpy array")
    if len(X) - 1 != len(self.dict_relations_):
        raise AssertionError(
            "relations must contain exactly len(X) - 1 entries")

    # We need n_components to be constant or this won't work
    if isinstance(self.n_components, (list, tuple, np.ndarray)):
        raise ValueError(
            "n_components must be a single integer, and cannot vary")

    self.n_models_ = len(X)

    # Every hyper-parameter except n_components is resolved per model via
    # get_nth_item_or_val.
    self.mappers_ = [
        UMAP(
            n_neighbors=get_nth_item_or_val(self.n_neighbors, n),
            min_dist=get_nth_item_or_val(self.min_dist, n),
            n_epochs=get_nth_item_or_val(self.n_epochs, n),
            repulsion_strength=get_nth_item_or_val(
                self.repulsion_strength, n),
            learning_rate=get_nth_item_or_val(self.learning_rate, n),
            spread=get_nth_item_or_val(self.spread, n),
            negative_sample_rate=get_nth_item_or_val(
                self.negative_sample_rate, n),
            local_connectivity=get_nth_item_or_val(
                self.local_connectivity, n),
            set_op_mix_ratio=get_nth_item_or_val(self.set_op_mix_ratio, n),
            unique=get_nth_item_or_val(self.unique, n),
            n_components=self.n_components,
        ).fit(X[n])
        for n in range(self.n_models_)
    ]

    n_epochs = 200 if self.n_epochs is None else self.n_epochs

    window_size = fit_params.get("window_size", self.alignment_window_size)
    relations = expand_relations(self.dict_relations_, window_size)

    indptr_list = numba.typed.List.empty_list(numba.types.int32[::1])
    indices_list = numba.typed.List.empty_list(numba.types.int32[::1])
    heads = numba.typed.List.empty_list(numba.types.int32[::1])
    tails = numba.typed.List.empty_list(numba.types.int32[::1])
    epochs_per_samples = numba.typed.List.empty_list(
        numba.types.float64[::1])

    for mapper in self.mappers_:
        # Convert to COO once per mapper instead of once per use.
        coo_graph = mapper.graph_.tocoo()
        indptr_list.append(mapper.graph_.indptr)
        indices_list.append(mapper.graph_.indices)
        heads.append(coo_graph.row)
        tails.append(coo_graph.col)
        epochs_per_samples.append(
            make_epochs_per_sample(coo_graph.data, n_epochs))

    regularisation_weights = build_neighborhood_similarities(
        indptr_list,
        indices_list,
        relations,
    )

    # Draw the optimiser seed before anything else consumes the generator,
    # so that routing the spectral initialisation through the same
    # generator does not change the seed obtained for a given
    # ``random_state``.
    random_state = check_random_state(self.random_state)
    rng_state = random_state.randint(
        INT32_MIN, INT32_MAX, 3).astype(np.int64)

    def _scaled_spectral_init(mapper):
        # Spectral layout of one mapper, rescaled so that its largest
        # absolute coordinate is 10, as C-ordered float32. Uses the seeded
        # generator (not the global ``np.random`` module) so the
        # initialisation honours ``self.random_state``.
        layout = spectral_layout(
            mapper._raw_data,
            mapper.graph_,
            self.n_components,
            random_state,
        )
        expansion = 10.0 / np.abs(layout).max()
        return (layout * expansion).astype(np.float32, order="C")

    embeddings = numba.typed.List.empty_list(numba.types.float32[:, ::1])
    embeddings.append(_scaled_spectral_init(self.mappers_[0]))

    for i in range(1, self.n_models_):
        next_embedding = _scaled_spectral_init(self.mappers_[i])

        # Non-negative entries of this relation row are used as anchors:
        # the entry value is paired with its own position.
        # NOTE(review): presumably values index the previous embedding and
        # positions index the current one -- confirm against
        # expand_relations / procrustes_align.
        anchor_data = relations[i][window_size - 1]
        left_anchors = anchor_data[anchor_data >= 0]
        right_anchors = np.where(anchor_data >= 0)[0]

        # Each initial embedding is aligned to the previously aligned one.
        embeddings.append(
            procrustes_align(
                embeddings[-1],
                next_embedding,
                np.vstack([left_anchors, right_anchors]),
            ))

    self.embeddings_ = optimize_layout_aligned_euclidean(
        embeddings,
        embeddings,
        heads,
        tails,
        n_epochs,
        epochs_per_samples,
        regularisation_weights,
        relations,
        rng_state,
        lambda_=self.alignment_regularisation,
    )

    return self