Example #1
    def get_topics(self) -> Dict[str, Tuple[str, float]]:
        """ Return topics with top n words and their c-TF-IDF score

        Usage:

        ```python
        all_topics = model.get_topics()
        ```
        """
        check_is_fitted(self)
        return self.topics
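As the other examples below show (`get_topic(topic)[:5]` iterated with `word[0]`), each value in the returned mapping is a sequence of `(word, score)` pairs. A minimal usage sketch, assuming `model` has already been fitted:

```python
# Sketch: iterate over all topics; each value is a sequence of
# (word, score) pairs, with `model` assumed to be fitted already
all_topics = model.get_topics()
for topic_id, words in all_topics.items():
    top_word, score = words[0]
    print(f"Topic {topic_id}: {top_word} ({score:.3f})")
```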
Example #2
    def reduce_topics(self,
                      docs: List[str],
                      topics: List[int],
                      probabilities: np.ndarray = None,
                      nr_topics: int = 20) -> Tuple[List[int], np.ndarray]:
        """ Further reduce the number of topics to nr_topics.

        The number of topics is further reduced by calculating the c-TF-IDF matrix
        of the documents and then iteratively merging the least frequent topic with
        its most similar topic, based on their c-TF-IDF representations. The topics,
        their sizes, and their representations are updated accordingly.

        The reason `docs`, `topics`, and `probabilities` are passed as parameters is
        that these values are deliberately not saved within BERTopic. If you have a
        million documents, it is far more efficient to keep them in a dedicated
        database than inside BERTopic.

        Arguments:
            docs: The docs you used when calling either `fit` or `fit_transform`
            topics: The topics that were returned when calling either `fit` or `fit_transform`
            probabilities: The probabilities that were returned when calling either `fit` or `fit_transform`
            nr_topics: The number of topics to reduce to

        Returns:
            new_topics: Updated topics
            new_probabilities: Updated probabilities

        Usage:

        ```python
        from bertopic import BERTopic
        from sklearn.datasets import fetch_20newsgroups

        # Create topics -> Typically over 50 topics
        docs = fetch_20newsgroups(subset='train')['data']
        model = BERTopic()
        topics, probs = model.fit_transform(docs)

        # Further reduce topics
        new_topics, new_probs = model.reduce_topics(docs, topics, probs, nr_topics=30)
        ```
        """
        check_is_fitted(self)
        self.nr_topics = nr_topics
        documents = pd.DataFrame({"Document": docs, "Topic": topics})

        # Reduce number of topics
        self._extract_topics(documents)
        documents = self._reduce_topics(documents)
        new_topics = documents.Topic.to_list()
        new_probabilities = self._map_probabilities(probabilities)

        return new_topics, new_probabilities
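The merge step described in the docstring can be sketched as follows; `find_merge_pair` is a hypothetical helper for illustration, not BERTopic's private `_reduce_topics` implementation:

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def find_merge_pair(c_tf_idf: np.ndarray, sizes: np.ndarray):
    """One reduction step: pair the least frequent topic with its
    most similar topic based on c-TF-IDF similarity."""
    smallest = int(np.argmin(sizes))        # least frequent topic
    sims = cosine_similarity(c_tf_idf)      # topic-by-topic similarity matrix
    sims[smallest, smallest] = -1           # exclude self-similarity
    most_similar = int(np.argmax(sims[smallest]))
    return smallest, most_similar
```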
Example #3
    def get_topic(self, topic: int) -> Union[Dict[str, Tuple[str, float]], bool]:
        """ Return top n words for a specific topic and their c-TF-IDF scores

        Usage:

        ```python
        topic = model.get_topic(12)
        ```
        """
        check_is_fitted(self)
        if self.topics.get(topic):
            return self.topics[topic]
        else:
            return False
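Since a missing topic yields `False` rather than an exception, a guard is useful before unpacking the word/score pairs. A usage sketch, assuming a fitted `model`:

```python
# Sketch: guard against `False` before reading the (word, score) pairs
topic = model.get_topic(12)
if topic:
    for word, score in topic[:5]:   # top five (word, score) pairs
        print(word, score)
else:
    print("Topic 12 was not found in this model")
```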
Example #4
    def update_topics(self,
                      docs: List[str],
                      topics: List[int],
                      n_gram_range: Tuple[int, int] = None,
                      stop_words: str = None,
                      vectorizer: CountVectorizer = None):
        """ Updates the topic representation by recalculating c-TF-IDF with the new
        parameters as defined in this function.

        When you have trained a model and viewed the topics and the words that represent them,
        you might not be satisfied with the representation. Perhaps you forgot to remove
        stop words or you want to try out a different n_gram_range. This function allows you
        to update the topic representations after the topics have been formed.

        Arguments:
            docs: The docs you used when calling either `fit` or `fit_transform`
            topics: The topics that were returned when calling either `fit` or `fit_transform`
            n_gram_range: The n-gram range for the CountVectorizer.
            stop_words: Stopwords that can be used as either a list of strings, or the name of the
                        language as a string. For example: 'english' or ['the', 'and', 'I'].
                        Note that this will not be used if you pass in your own CountVectorizer.
            vectorizer: Pass in your own CountVectorizer from scikit-learn

        Usage:
        ```python
        from bertopic import BERTopic
        from sklearn.datasets import fetch_20newsgroups

        # Create topics
        docs = fetch_20newsgroups(subset='train')['data']
        model = BERTopic(n_gram_range=(1, 1), stop_words=None)
        topics, probs = model.fit_transform(docs)

        # Update topic representation
        model.update_topics(docs, topics, n_gram_range=(2, 3), stop_words="english")
        ```
        """
        check_is_fitted(self)
        if not n_gram_range:
            n_gram_range = self.n_gram_range

        if not stop_words:
            stop_words = self.stop_words

        self.vectorizer = vectorizer or CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words)
        documents = pd.DataFrame({"Document": docs, "Topic": topics})
        self._extract_topics(documents)
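As the docstring notes, a custom `CountVectorizer` takes precedence over `n_gram_range` and `stop_words`. A sketch, reusing `docs` and `topics` from the usage example above (the `min_df` value is only an illustration):

```python
from sklearn.feature_extraction.text import CountVectorizer

# Custom vectorizer: unigrams and bigrams, English stop words, and only
# terms that occur in at least five documents
vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words="english", min_df=5)
model.update_topics(docs, topics, vectorizer=vectorizer)
```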
Example #5
    def get_topic_freq(self, topic: int = None) -> Union[pd.DataFrame, int]:
        """ Return the the size of topics (descending order)

        Usage:

        To extract the frequency of all topics:

        ```python
        frequency = model.get_topic_freq()
        ```

        To get the frequency of a single topic:

        ```python
        frequency = model.get_topic_freq(12)
        ```
        """
        check_is_fitted(self)
        if isinstance(topic, int):
            return self.topic_sizes[topic]
        else:
            return pd.DataFrame(self.topic_sizes.items(),
                                columns=['Topic', 'Count']).sort_values("Count", ascending=False)
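In BERTopic, outlier documents are collected under topic -1, which often dominates the frequency table. A sketch that drops it before reporting, assuming a fitted `model`:

```python
frequency = model.get_topic_freq()
# Topic -1 holds the outlier documents; exclude it from the report
frequency = frequency[frequency.Topic != -1]
print(frequency.head(10))
```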
Example #6
    def visualize_topics(self):
        """ Visualize topics, their sizes, and their corresponding words

        This visualization is highly inspired by LDAvis, a great visualization
        technique typically reserved for LDA.
        """
        check_is_fitted(self)
        if not _HAS_VIZ:
            raise ModuleNotFoundError("In order to use this function you'll need to install "
                                      "additional dependencies;\npip install bertopic[visualization]")

        # Extract topic words and their frequencies
        topic_list = sorted(list(self.topics.keys()))
        frequencies = [self.topic_sizes[topic] for topic in topic_list]
        words = [" | ".join([word[0] for word in self.get_topic(topic)[:5]]) for topic in topic_list]

        # Embed c-TF-IDF into 2D
        embeddings = MinMaxScaler().fit_transform(self.c_tf_idf.toarray())
        embeddings = umap.UMAP(n_neighbors=2, n_components=2, metric='hellinger').fit_transform(embeddings)

        # Visualize with plotly, skipping the first entry (the -1 outlier topic)
        df = pd.DataFrame({"x": embeddings[1:, 0], "y": embeddings[1:, 1],
                           "Topic": topic_list[1:], "Words": words[1:], "Size": frequencies[1:]})
        self._plotly_topic_visualization(df, topic_list)
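Unlike the other methods, this one ships without a usage snippet in its docstring. A minimal sketch, assuming the visualization extras are installed (`pip install bertopic[visualization]`, as the error message above states):

```python
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

docs = fetch_20newsgroups(subset='train')['data']
model = BERTopic().fit(docs)
model.visualize_topics()
```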
Example #7
    def visualize_distribution(self,
                               probabilities: np.ndarray,
                               min_probability: float = 0.015,
                               figsize: tuple = (10, 5),
                               save: bool = False):
        """ Visualize the distribution of topic probabilities

        Arguments:
            probabilities: An array of probability scores
            min_probability: The minimum probability score to visualize.
                             All others are ignored.
            figsize: The size of the figure
            save: Whether to save the resulting graph to probability.png

        Usage:

        Make sure to fit the model beforehand and to only pass in the
        probabilities of a single document:

        ```python
        model.visualize_distribution(probabilities[0])
        ```

        ![](../img/probabilities.png)
        """
        check_is_fitted(self)
        if not _HAS_VIZ:
            raise ModuleNotFoundError("In order to use this function you'll need to install "
                                      "additional dependencies;\npip install bertopic[visualization]")
        if not self.calculate_probabilities:
            raise ValueError("This visualization cannot be used if you have set `calculate_probabilities` to False "
                             "as it uses the topic probabilities.")
        if len(probabilities[probabilities > min_probability]) == 0:
            raise ValueError("None of the supplied probabilities exceeds `min_probability`. "
                             "Lower `min_probability` to prevent this error.")

        # Get the values and indices that equal or exceed the minimum probability
        labels_idx = np.argwhere(probabilities >= min_probability).flatten()
        vals = probabilities[labels_idx].tolist()

        # Create labels
        labels = []
        for idx in labels_idx:
            label = []
            words = self.get_topic(idx)
            if words:
                for word in words[:5]:
                    label.append(word[0])
                label = str(r"$\bf{Topic }$ " +
                            r"$\bf{" + str(idx) + ":}$ " +
                            " ".join(label))
                labels.append(label)
            else:
                # Topic has no representation; drop its probability as well
                vals.remove(probabilities[idx])
        pos = range(len(vals))

        # Create figure
        fig, ax = plt.subplots(figsize=figsize)
        plt.hlines(y=pos, xmin=0, xmax=vals, color='#333F4B', alpha=0.2, linewidth=15)
        plt.hlines(y=np.argmax(vals), xmin=0, xmax=max(vals), color='#333F4B', alpha=1, linewidth=15)

        # Set ticks and labels
        ax.tick_params(axis='both', which='major', labelsize=12)
        ax.set_xlabel('Probability', fontsize=15, fontweight='black', color='#333F4B')
        ax.set_ylabel('')
        plt.yticks(pos, labels)
        fig.text(0, 1, 'Topic Probability Distribution', fontsize=15, fontweight='black', color='#333F4B')

        # Update spine style
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.spines['left'].set_bounds(pos[0], pos[-1])
        ax.spines['bottom'].set_bounds(0, max(vals))
        ax.spines['bottom'].set_position(('axes', -0.02))
        ax.spines['left'].set_position(('axes', 0.02))

        fig.tight_layout()

        if save:
            fig.savefig("probability.png", dpi=300, bbox_inches='tight')
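Because of the `calculate_probabilities` check above, that flag (assumed here to be a constructor argument, mirroring the attribute the function inspects) must be enabled before fitting. A hedged usage sketch:

```python
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

docs = fetch_20newsgroups(subset='train')['data']

# `calculate_probabilities` is assumed to be a constructor flag, mirroring
# the attribute checked in the function above
model = BERTopic(calculate_probabilities=True)
topics, probs = model.fit_transform(docs)

# Visualize the distribution for the first document and save it to probability.png
model.visualize_distribution(probs[0], min_probability=0.01, save=True)
```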
Example #8
    def transform(self,
                  documents: Union[str, List[str]],
                  embeddings: np.ndarray = None) -> Tuple[List[int], np.ndarray]:
        """ After having fit a model, use transform to predict new instances

        Arguments:
            documents: A single document or a list of documents to predict on
            embeddings: Pre-trained document embeddings. These can be used
                        instead of the sentence-transformer model.

        Returns:
            predictions: Topic predictions for each document
            probabilities: The topic probability distribution

        Usage:

        ```python
        from bertopic import BERTopic
        from sklearn.datasets import fetch_20newsgroups

        docs = fetch_20newsgroups(subset='all')['data']
        model = BERTopic("distilbert-base-nli-mean-tokens", verbose=True).fit(docs)
        topics, probs = model.transform(docs)
        ```

        If you want to use your own embeddings:

        ```python
        from bertopic import BERTopic
        from sklearn.datasets import fetch_20newsgroups
        from sentence_transformers import SentenceTransformer

        # Create embeddings
        docs = fetch_20newsgroups(subset='all')['data']
        sentence_model = SentenceTransformer("distilbert-base-nli-mean-tokens")
        embeddings = sentence_model.encode(docs, show_progress_bar=True)

        # Create topic model
        model = BERTopic(None, verbose=True).fit(docs, embeddings)
        topics, probs = model.transform(docs, embeddings)
        ```
        """
        check_is_fitted(self)
        check_embeddings_shape(embeddings, documents)

        if isinstance(documents, str):
            documents = [documents]

        if not isinstance(embeddings, np.ndarray):
            embeddings = self._extract_embeddings(documents)

        umap_embeddings = self.umap_model.transform(embeddings)
        predictions, _ = hdbscan.approximate_predict(self.cluster_model, umap_embeddings)

        if self.calculate_probabilities:
            probabilities = hdbscan.membership_vector(self.cluster_model, umap_embeddings)
            if len(documents) == 1:
                probabilities = probabilities.flatten()
        else:
            probabilities = None

        if self.mapped_topics:
            predictions = self._map_predictions(predictions)
            probabilities = self._map_probabilities(probabilities)

        return predictions, probabilities
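Since a single string is wrapped into a list internally (see the `isinstance` check above), `transform` also works on one unseen document. A sketch, assuming a fitted `model`:

```python
# A single unseen document; the returned prediction is a list with one topic id
prediction, probabilities = model.transform("The space shuttle launch was delayed by weather.")
print(prediction)
```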