Exemplo n.º 1
0
def search_models(tfidf_corpus, min_topics, max_topics, threshold=.333):
    """
    Fits one NMF model per topic count and records quality metrics for each.

    A model is built with ``nmf_model`` for every topic count from
    ``min_topics`` to ``max_topics`` (inclusive).  Displays a progress bar.
    Pressing Ctrl-C stops the search early and returns the metrics of the
    models that were fully evaluated.

    :param tfidf_corpus: The corpus matrix handed to ``nmf_model``; it is also
        row-indexed with a boolean mask and densified through ``.A``
        (presumably a SciPy sparse TF-IDF matrix -- confirm with the caller).
    :param min_topics: Smallest number of topics to try.
    :param max_topics: Largest number of topics to try (inclusive).
    :param threshold: A row of W counts as belonging to a topic when its
        weight, divided by that topic's maximum weight, exceeds this value.
    :return: A tuple of seven equally long lists, one entry per evaluated
        model: ``(costs, H_similarities, W_similarities, tfidf_similarities,
        max_strength, min_strength, avg_strength)``.
    """

    g.debug("Building NMF topics...")
    costs = []
    H_similarities = []
    W_similarities = []
    tfidf_similarities = []
    max_strength = []
    min_strength = []
    avg_strength = []
    metric_lists = (costs, H_similarities, W_similarities, tfidf_similarities,
                    max_strength, min_strength, avg_strength)
    n_models = max_topics - min_topics + 1

    g.progress_bar(0, n_models)
    try:
        for i in range(min_topics, max_topics + 1):

            nmf, W, H = nmf_model(tfidf_corpus,
                                  i,
                                  max_iter=666,
                                  no_output=True)

            costs.append(nmf.reconstruction_err_**2)
            H_similarities.append(
                1 - pairwise_distances(H, metric="cosine", n_jobs=-1).mean())
            W_similarities.append(
                1 - pairwise_distances(W, metric="cosine", n_jobs=-1).mean())

            # Scale every topic column by its own maximum so that a single
            # threshold can be applied to all topics.
            # NOTE(review): an all-zero column presumably yields NaN here
            # (0 / 0); NaN never exceeds the threshold, so such a topic is
            # simply skipped below -- confirm W is a dense NumPy array.
            W_normalized = W / W.max(axis=0)

            # NOTE(review): unlike the H and W metrics above, this stores the
            # mean cosine *distance* (there is no "1 -"), despite the list's
            # name.  Left unchanged because callers may rely on the values.
            topic_distances = []
            for topic_i in range(i):
                members = W_normalized[:, topic_i] > threshold
                if members.any():
                    topic_distances.append(
                        pairwise_distances(tfidf_corpus[members].A,
                                           metric="cosine",
                                           n_jobs=-1).mean())
            # Mean of nothing is NaN; spelled out to avoid NumPy's
            # empty-slice warning when no topic has any member.
            tfidf_similarities.append(
                np.mean(topic_distances) if topic_distances else np.nan)

            # Strength statistics consider the non-zero weights only.
            values = W[W.nonzero()]
            max_strength.append(values.max())
            min_strength.append(values.min())
            avg_strength.append(values.mean())

            g.progress_bar(i - min_topics + 1,
                           n_models,
                           text=f"{nmf.n_iter_} iterations")

    except KeyboardInterrupt:
        # The interrupt can land between any two appends, so the lists may
        # differ in length.  Trim all of them to the shortest one so that
        # index k refers to the same model in every returned list.
        completed = min(len(metric) for metric in metric_lists)
        for metric in metric_lists:
            del metric[completed:]

    return costs, H_similarities, W_similarities, tfidf_similarities, max_strength, min_strength, avg_strength
Exemplo n.º 2
0
def build_word_clouds(corpus_tfidf, corpus_topics, H, word_list, table_name):
    """
    Renders word cloud images for every topic.  Displays a progress bar.

    For each topic an NMF word cloud is written to
    ``../output/<table_name>/nmf/<topic>_nmf_wordcloud.png``.  A TF-IDF word
    cloud is written next to it as ``<topic>_tfidf_wordcloud.png`` unless the
    topic's TF-IDF weights sum to zero (an empty topic).

    :param corpus_tfidf: Passed through to ``get_tfidf_topic_weights``.
    :param corpus_topics: Passed through to ``get_tfidf_topic_weights``.
    :param H: 2-D topic/word weight array; ``H.shape[0]`` is the number of
        topics and ``H[topic, word]`` is used as the word's cloud frequency.
    :param word_list: Maps a word index (second axis of ``H``) to its text.
    :param table_name: Name of the output sub-directory.
    :return: None
    """

    g.debug("Generating topic word clouds...")
    n_topics = H.shape[0]
    completed = 0
    g.progress_bar(completed, n_topics)

    topic_tfidf_weights = get_tfidf_topic_weights(corpus_tfidf, corpus_topics,
                                                  n_topics)
    # Word indices per topic, ordered from the heaviest to the lightest.
    topic_top_tfidf_words_i = np.argsort(topic_tfidf_weights, axis=1)[:, ::-1]
    topic_top_nmf_words_i = np.argsort(H, axis=1)[:, ::-1]

    for topic_i in range(n_topics):

        # nmf wordcloud (zero-weight words are left out)
        wc = WordCloud(background_color="black",
                       max_words=333,
                       width=1000,
                       height=500)
        wc.fit_words({
            word_list[word_i]: H[topic_i, word_i]
            for word_i in topic_top_nmf_words_i[topic_i] if H[topic_i, word_i]
        })
        wc.to_file(f"../output/{table_name}/nmf/{topic_i}_nmf_wordcloud.png")

        # tf-idf wordcloud, skipped for an empty topic...
        if topic_tfidf_weights[topic_i].sum():
            wc = WordCloud(background_color="black",
                           max_words=333,
                           width=1000,
                           height=500)
            wc.fit_words({
                word_list[word_i]: topic_tfidf_weights[topic_i, word_i]
                for word_i in topic_top_tfidf_words_i[topic_i]
                if topic_tfidf_weights[topic_i, word_i]
            })
            wc.to_file(
                f"../output/{table_name}/nmf/{topic_i}_tfidf_wordcloud.png")

        # Advance for every topic, empty or not; otherwise the bar would
        # never reach n_topics when an empty topic is encountered.
        completed += 1
        g.progress_bar(completed, n_topics)

    g.debug(f" -> {n_topics} word clouds generated!", 1)
Exemplo n.º 3
0
def cache_wordclouds(corpus, vocabulary, H, W):
    """
    Writes an NMF and a TF-IDF word cloud image for every topic.

    Images are saved as ``output/wordclouds/NNN_nmf.png`` and
    ``output/wordclouds/NNN_tfidf.png``, where ``NNN`` is the topic index
    left-padded with zeros to three characters.  Displays a progress bar that
    advances once per image written.

    :param corpus: Object whose ``tfidf_corpus`` attribute is handed to
        ``get_tfidf_topic_weights``.
    :param vocabulary: Word list passed to ``build_word_cloud``.
    :param H: Topic weight array; ``H.shape[0]`` is the number of topics and
        ``H[topic]`` feeds the NMF word cloud.
    :param W: Passed through to ``get_tfidf_topic_weights``.
    :return: None
    """

    n_topics = H.shape[0]
    g.debug(f"Caching word clouds for {n_topics} topics...")

    tfidf_by_topic = get_tfidf_topic_weights(corpus.tfidf_corpus, W)

    # Two images per topic, hence two progress steps per loop iteration.
    n_steps = n_topics * 2
    g.progress_bar(0, n_steps)

    for topic_i in range(n_topics):
        prefix = str(topic_i).rjust(3, '0')

        # First image: the cloud built from the NMF weights.
        nmf_cloud = build_word_cloud(H[topic_i], vocabulary)
        nmf_cloud.to_file(f"output/wordclouds/{prefix}_nmf.png")
        g.progress_bar(2 * topic_i + 1, n_steps)

        # Second image: the TF-IDF cloud, or a placeholder when the topic
        # carries no TF-IDF weight at all.
        topic_weights = tfidf_by_topic[topic_i]
        if not topic_weights.sum():
            tfidf_cloud = build_word_cloud([1], ["This topic was empty"])
        else:
            tfidf_cloud = build_word_cloud(topic_weights, vocabulary)
        tfidf_cloud.to_file(f"output/wordclouds/{prefix}_tfidf.png")
        g.progress_bar(2 * topic_i + 2, n_steps)

    g.debug(" -> Done", 1)
Exemplo n.º 4
0
def sumarize_corpus(corpus, vectorizer, n_sentences=10):
    """
    Produces a summary for every document of a corpus, showing progress.

    Each document is summarized on its own by ``summarize_doc``; the progress
    bar is advanced after every document.
    :param corpus: Sized iterable holding the documents to summarize.
    :param vectorizer: Forwarded unchanged to ``summarize_doc``.
    :param n_sentences: Forwarded unchanged to ``summarize_doc`` (number of
        sentences per summary).
    :return: A list with one summary per document, in corpus order.
    """

    g.debug("Summarizing documents...")

    total = len(corpus)
    results = []

    # Counting from 1 makes the position double as the number of finished docs.
    for position, document in enumerate(corpus, start=1):
        results.append(summarize_doc(document, vectorizer, n_sentences))
        g.progress_bar(position, total, 1)

    g.debug(f" -> {len(results)} documents summarized!", 1)
    return results