def test_hdbscan_membership_vector():
    # Soft-cluster three query points against a model fit with prediction data
    clusterer = HDBSCAN(prediction_data=True).fit(X)
    vector = membership_vector(clusterer, np.array([[-1.5, -1.0]]))
    assert_array_almost_equal(
        vector, np.array([[0.05705305, 0.05974177, 0.12228153]]))
    vector = membership_vector(clusterer, np.array([[1.5, -1.0]]))
    assert_array_almost_equal(
        vector, np.array([[0.09462176, 0.32061556, 0.10112905]]))
    vector = membership_vector(clusterer, np.array([[0.0, 0.0]]))
    assert_array_almost_equal(
        vector, np.array([[0.03545607, 0.03363318, 0.04643177]]))
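The test above relies on module-level fixtures from its test suite (X, HDBSCAN, membership_vector, assert_array_almost_equal). A minimal sketch of the setup it assumes follows; the blob dataset here is hypothetical, and the exact asserted values depend on the data and library version.

import numpy as np
from numpy.testing import assert_array_almost_equal
from sklearn.datasets import make_blobs
from hdbscan import HDBSCAN, membership_vector

# Hypothetical test fixture: a small 2-D blob dataset
X, y = make_blobs(n_samples=200, random_state=10)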
def transform(self,
              documents: Union[str, List[str]]) -> Tuple[List[int], np.ndarray]:
    """ After having fit a model, use transform to predict new instances

    Arguments:
        documents: A single document or a list of documents to predict on

    Returns:
        predictions: Topic predictions for each document
        probabilities: The topic probability distribution
    """
    if isinstance(documents, str):
        documents = [documents]

    # Embed the documents and project them into the fitted UMAP space
    embeddings = self._extract_embeddings(documents)
    umap_embeddings = self.umap_model.transform(embeddings)

    # Soft cluster memberships plus a hard topic assignment per document
    probabilities = hdbscan.membership_vector(self.cluster_model, umap_embeddings)
    predictions, _ = hdbscan.approximate_predict(self.cluster_model, umap_embeddings)

    if self.mapped_topics:
        predictions = self._map_predictions(predictions)
        probabilities = self._map_probabilities(probabilities)

    if len(documents) == 1:
        probabilities = probabilities.flatten()

    return predictions, probabilities
def classify_tracks(self):
    finder = CategoryFinder()
    finder.parse_categories("Classical", 5)
    sum_pct = 0
    for _id in finder.artists:
        artist = finder.artists[_id]
        recs = []
        for recording in artist['recordings']:
            recs.append(self.data.get_feature_vector(recording)['mfcc'])
        # test_labels, strengths = hd.approximate_predict(self.clusterer, recs)
        memb_vec = hd.membership_vector(self.clusterer, np.array(recs))
        classified = [np.argmax(v) for v in memb_vec]
        # Fraction of this artist's tracks whose strongest membership
        # falls in cluster 0 or 1
        matched = [c for c in classified if c == 0 or c == 1]
        pct = float(len(matched)) / len(classified)
        sum_pct += pct
        print(artist['name'], pct)
    print(sum_pct / len(finder.artists))
def classify_artists(self):
    finder = CategoryFinder()
    finder.parse_categories("Classical", 5)
    aggrs = []
    names = []
    for _id in finder.artists:
        artist = self.data.get_doc(_id)
        artist["aggregates"] = self.data.aggregate_features(artist["recordings"])
        aggrs.append(artist["aggregates"]["median"])
        names.append(artist["name"])
    # test_labels, strengths = hd.approximate_predict(self.clusterer, aggrs)
    memb_vec = hd.membership_vector(self.clusterer, np.array(aggrs))
    classified = [np.argmax(v) for v in memb_vec]
    # Artists whose strongest membership falls in cluster 0 or 1
    matched = [1 for a in classified if a == 0 or a == 1]
    for i, name in enumerate(names):
        # print(name, test_labels[i], strengths[i])
        print(name, classified[i])
    print(len(matched) / float(len(classified)))
def hdbscan_predict(embedding, df_scaled, clusterer, force_predict=True):
    if force_predict:
        # Derive labels from the full soft-membership matrix: the most
        # probable cluster per point and its membership strength
        mem_vec = pd.DataFrame(hdbscan.membership_vector(clusterer, embedding.values))
        test_labels = mem_vec.idxmax(axis=1).to_numpy()
        strengths = mem_vec.max(axis=1).to_numpy()
    else:
        test_labels, strengths = hdbscan.approximate_predict(clusterer, embedding)

    # Membership strengths
    scores = pd.DataFrame(strengths, columns=['score'])

    # Cluster labels
    labels = pd.DataFrame(test_labels, columns=['cluster'])

    # Join strengths, labels, embedding coordinates, and scaled features
    scores = scores.join(labels).join(embedding).join(df_scaled)

    # Diagnostics (no effect on the returned frame):
    # n_clusters = sum(scores['cluster'].unique() != -1)
    # scores['cluster'].value_counts()
    return scores
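A minimal end-to-end sketch of how hdbscan_predict might be driven. The pipeline below, with its synthetic blobs, UMAP projection, and column names, is hypothetical rather than taken from the original code; the one hard requirement is that the clusterer be fit with prediction_data=True, which hdbscan.membership_vector needs.

import hdbscan
import pandas as pd
import umap
from sklearn.datasets import make_blobs

# Hypothetical training data and held-out points (assumed already scaled)
train, _ = make_blobs(n_samples=500, n_features=8, centers=3, random_state=0)
new, _ = make_blobs(n_samples=20, n_features=8, centers=3, random_state=1)
df_scaled = pd.DataFrame(new, columns=[f"f{i}" for i in range(8)])

# Fit UMAP on the training data, then a clusterer with prediction data enabled
mapper = umap.UMAP(n_components=2, random_state=42).fit(train)
clusterer = hdbscan.HDBSCAN(min_cluster_size=15, prediction_data=True).fit(
    mapper.embedding_)

# Project the new points and score them; frames must be index-aligned
# because hdbscan_predict joins on the index
embedding = pd.DataFrame(mapper.transform(new))
scores = hdbscan_predict(embedding, df_scaled, clusterer)
print(scores.head())  # columns: 'score', 'cluster', embedding coords, features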
def transform(self,
              documents: Union[str, List[str]],
              embeddings: np.ndarray = None) -> Tuple[List[int], np.ndarray]:
    """ After having fit a model, use transform to predict new instances

    Arguments:
        documents: A single document or a list of documents to predict on
        embeddings: Pre-trained document embeddings. These can be used
                    instead of the sentence-transformer model.

    Returns:
        predictions: Topic predictions for each document
        probabilities: The topic probability distribution

    Usage:

    ```python
    from bertopic import BERTopic
    from sklearn.datasets import fetch_20newsgroups

    docs = fetch_20newsgroups(subset='all')['data']
    model = BERTopic("distilbert-base-nli-mean-tokens", verbose=True).fit(docs)
    topics = model.transform(docs)
    ```

    If you want to use your own embeddings:

    ```python
    from bertopic import BERTopic
    from sklearn.datasets import fetch_20newsgroups
    from sentence_transformers import SentenceTransformer

    # Create embeddings
    docs = fetch_20newsgroups(subset='all')['data']
    sentence_model = SentenceTransformer("distilbert-base-nli-mean-tokens")
    embeddings = sentence_model.encode(docs, show_progress_bar=True)

    # Create topic model
    model = BERTopic(None, verbose=True).fit(docs, embeddings)
    topics = model.transform(docs, embeddings)
    ```
    """
    if isinstance(documents, str):
        documents = [documents]

    if isinstance(embeddings, np.ndarray):
        # Validate user-provided embeddings against the documents
        check_embeddings_shape(embeddings, documents)
    else:
        embeddings = self._extract_embeddings(documents)

    umap_embeddings = self.umap_model.transform(embeddings)
    probabilities = hdbscan.membership_vector(self.cluster_model, umap_embeddings)
    predictions, _ = hdbscan.approximate_predict(self.cluster_model, umap_embeddings)

    if self.mapped_topics:
        predictions = self._map_predictions(predictions)
        probabilities = self._map_probabilities(probabilities)

    if len(documents) == 1:
        probabilities = probabilities.flatten()

    return predictions, probabilities
def membership_vector(self, X):
    # Thin wrapper around hdbscan's soft-clustering API
    return hdbscan.membership_vector(self.hdbscan, X)
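A self-contained sketch of how such a wrapper might be used. The SoftClusterer class and data below are hypothetical, but the call pattern matches the method above; the wrapped model must be fit with prediction_data=True for hdbscan.membership_vector to work.

import numpy as np
import hdbscan
from sklearn.datasets import make_blobs

class SoftClusterer:
    # Hypothetical owner class for the wrapper method above
    def __init__(self, **kwargs):
        self.hdbscan = hdbscan.HDBSCAN(prediction_data=True, **kwargs)

    def fit(self, X):
        self.hdbscan.fit(X)
        return self

    def membership_vector(self, X):
        return hdbscan.membership_vector(self.hdbscan, X)

X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
model = SoftClusterer(min_cluster_size=15).fit(X)
soft = model.membership_vector(np.array([[0.0, 0.0]]))  # shape (1, n_clusters)
hard = soft.argmax(axis=1)                              # strongest cluster per point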