예제 #1
0
    def test_tdf_search_corpus(self):
        """Searching the corpus for "movie" leaves five rows above a 0.2 similarity cutoff."""
        from pewanalytics.text import TextDataFrame

        frame = TextDataFrame(self.df, "text")
        hits = frame.search_corpus("movie")
        # Keep only the rows whose similarity column exceeds the cutoff.
        above_cutoff = hits[hits["search_cosine_similarity"] > 0.2]
        self.assertEqual(len(above_cutoff), 5)
예제 #2
0
    def test_tdf_extract_corpus_fragments(self):
        """Fragment extraction over the first 100 rows returns exactly one fragment, 's .'."""
        from pewanalytics.text import TextDataFrame

        first_hundred = self.df[:100]
        frame = TextDataFrame(first_hundred, "text")
        extraction_options = {
            "scan_top_n_matches_per_doc": 1,
            "min_fragment_length": 3,
        }
        found = frame.extract_corpus_fragments(**extraction_options)
        self.assertEqual(len(found), 1)
        self.assertEqual(found[0], "s .")
예제 #3
0
    def test_tdf_kmeans_clusters(self):
        """With k=2, two clusters are reported and each one's first term is "alien" or "husband"."""
        from pewanalytics.text import TextDataFrame

        frame = TextDataFrame(self.df, "text", min_df=50, max_df=0.5)
        frame.kmeans_clusters(k=2)
        top_terms = frame.top_cluster_terms("kmeans")
        self.assertEqual(len(top_terms.keys()), 2)
        # Either candidate term is accepted as the leading term of either cluster.
        for cluster_id in (1, 0):
            self.assertIn(top_terms[cluster_id][0], ["alien", "husband"])
예제 #4
0
    def test_tdf_hdbscan_clusters(self):
        """HDBSCAN with min_cluster_size=10 reports three cluster keys with known leading terms."""
        from pewanalytics.text import TextDataFrame

        frame = TextDataFrame(self.df, "text", min_df=50, max_df=0.5)
        frame.hdbscan_clusters(min_cluster_size=10)
        top_terms = frame.top_cluster_terms("hdbscan")
        self.assertEqual(len(top_terms.keys()), 3)
        # (cluster key, expected first term) pairs, checked in this order.
        expected_leaders = ((-1, "mike"), (18, "disney"), (11, "jackie"))
        for cluster_id, leading_term in expected_leaders:
            self.assertEqual(top_terms[cluster_id][0], leading_term)
예제 #5
0
    def test_tdf_pca_components(self):
        """Check the first ten characters of the top document for each of five PCA components."""
        from pewanalytics.text import TextDataFrame

        frame = TextDataFrame(self.df, "text", min_df=50, max_df=0.5)
        frame.pca_components(k=5)
        top_docs = frame.get_top_documents(component_prefix="pca", top_n=2)

        def opening(component):
            # First ten characters of the first document listed for the component.
            return top_docs[component][0][:10]

        self.assertEqual(opening("pca_0"), "there must")
        self.assertEqual(opening("pca_1"), "plot : a d")
        self.assertEqual(opening("pca_2"), "with the s")
        # Two different openings are accepted for the fourth component.
        self.assertIn(opening("pca_3"), ["every once", " * * * * *"])
        self.assertEqual(opening("pca_4"), "when i fir")
예제 #6
0
    def test_make_document_cooccurrence_matrix(self):
        """Check the document co-occurrence matrix with and without normalization.

        In both modes the matrix length must equal the length of ``self.df``.
        Without normalization the largest cell must exceed 1.0; with
        normalization the largest cell must be exactly 1.0.
        """
        from pewanalytics.text import TextDataFrame

        tdf = TextDataFrame(self.df, "text", min_df=50, max_df=0.5)

        # Dedicated assert methods are used so a failure reports the two
        # compared values instead of a bare "False is not true".
        mat = tdf.make_document_cooccurrence_matrix(normalize=False)
        self.assertEqual(len(mat), len(self.df))
        self.assertGreater(mat.max().max(), 1.0)

        mat = tdf.make_document_cooccurrence_matrix(normalize=True)
        self.assertEqual(len(mat), len(self.df))
        self.assertEqual(mat.max().max(), 1.0)
예제 #7
0
    def test_tdf_lsa_components(self):
        """Check the top documents reported for each of five LSA components."""
        from pewanalytics.text import TextDataFrame

        frame = TextDataFrame(self.df, "text", min_df=50, max_df=0.5)
        frame.lsa_components(k=5)
        top_docs = frame.get_top_documents(component_prefix="lsa", top_n=2)

        def opening(component):
            # First ten characters of the first document listed for the component.
            return top_docs[component][0][:10]

        self.assertEqual(opening("lsa_0"), " * * * the")
        self.assertEqual(opening("lsa_1"), "susan gran")
        # The third component is expected to list no documents at all.
        self.assertEqual(len(top_docs["lsa_2"]), 0)
        self.assertIn(opening("lsa_3"), ["as a devou", "every once"])
        self.assertEqual(opening("lsa_4"), "when i fir")
예제 #8
0
    def test_compute_hdbscan_clusters(self):
        """compute_hdbscan_clusters yields 2000 labels drawn from 23 distinct values."""
        from pewanalytics.stats.clustering import compute_hdbscan_clusters
        from pewanalytics.text import TextDataFrame

        frame = TextDataFrame(self.df, "text", min_df=50, max_df=0.5)
        labels = compute_hdbscan_clusters(frame.tfidf, min_cluster_size=10)
        self.assertEqual(len(labels), 2000)
        distinct_labels = set(labels)
        self.assertEqual(len(distinct_labels), 23)
예제 #9
0
    def test_mutual_info_bar_plot(self):
        """Smoke-test ``mutual_info_bar_plot`` on a mutual-information table.

        Adds a binary ``outcome`` column to ``self.df`` (1 where ``sentiment``
        is ``"pos"``), computes mutual information against it, and draws the
        top 20 rows filtered and plotted by ``pct_term_pos_neg_ratio``.
        """
        from pewanalytics.text import TextDataFrame
        from pewanalytics.stats.mutual_info import mutual_info_bar_plot

        self.df["outcome"] = (self.df["sentiment"] == "pos").astype(int)
        tdf = TextDataFrame(self.df, "text", min_df=50, max_df=0.5)
        mutual_info = tdf.mutual_info("outcome")
        plot = mutual_info_bar_plot(
            mutual_info,
            filter_col="pct_term_pos_neg_ratio",
            top_n=20,
            x_col="pct_term_pos_neg_ratio",
        )
        # TODO: figure out how to get a unique representation of the plot so
        # its content can be verified.
        # NOTE(review): assumes the helper returns the plot object rather than
        # None - confirm against the pewanalytics documentation. This replaces
        # an always-true assertion that could never fail.
        self.assertIsNotNone(plot)
예제 #10
0
    def test_tdf_find_related_keywords(self):
        """The 25 keywords related to "disney" include six expected film-related terms."""
        from pewanalytics.text import TextDataFrame

        # Vectorizer settings forwarded unchanged to TextDataFrame.
        vectorizer_options = dict(
            min_df=10,
            max_df=0.95,
            use_idf=False,
            binary=True,
            sublinear_tf=False,
            smooth_idf=False,
            norm=None,
        )
        frame = TextDataFrame(self.df, "text", **vectorizer_options)
        related = frame.find_related_keywords("disney", n=25)
        expected_terms = (
            "animation",
            "mulan",
            "mermaid",
            "hercules",
            "tarzan",
            "pixar",
        )
        for expected in expected_terms:
            self.assertIn(expected, related)
예제 #11
0
    def test_mutual_info(self):
        """compute_mutual_info returns a non-None result for every option combination.

        The same four keyword variations (no weights, unit weights,
        ``normalize=False``, ``l=1``) are run first against the TF-IDF matrix
        as stored on the TextDataFrame and then against its ``todense()`` form.
        """
        from pewanalytics.text import TextDataFrame
        from pewanalytics.stats.mutual_info import compute_mutual_info

        self.df["outcome"] = (self.df["sentiment"] == "pos").astype(int)
        frame = TextDataFrame(self.df, "text", min_df=50, max_df=0.5)
        frame.corpus["weight"] = 1.0

        for densify in (False, True):
            keyword_variations = (
                {"weights": None},
                {"weights": frame.corpus["weight"]},
                {"normalize": False},
                {"l": 1},
            )
            for extra in keyword_variations:
                # A fresh dense copy is built for every dense call.
                features = frame.tfidf.todense() if densify else frame.tfidf
                outcome = compute_mutual_info(frame.corpus["outcome"],
                                              features,
                                              **extra)
                self.assertIsNotNone(outcome)
예제 #12
0
    def test_correspondence_analysis(self):
        """Check the first and last rows returned by ``correspondence_analysis``.

        The input is the dense TF-IDF matrix wrapped in a DataFrame whose
        columns are the vectorizer's feature names. The ``mca_1`` value and
        ``node`` label of the first and last result rows are compared against
        fixed expectations.
        """
        from pewanalytics.stats.dimensionality_reduction import correspondence_analysis
        from pewanalytics.text import TextDataFrame

        tdf = TextDataFrame(self.df, "text", min_df=50, max_df=0.5)

        # scikit-learn deprecated get_feature_names() in 1.0 and removed it in
        # 1.2 in favour of get_feature_names_out(). Prefer the new accessor and
        # fall back to the old one so the test runs on either side of the change.
        # NOTE(review): assumes tdf.vectorizer is a scikit-learn vectorizer - confirm.
        vectorizer = tdf.vectorizer
        if hasattr(vectorizer, "get_feature_names_out"):
            feature_names = vectorizer.get_feature_names_out()
        else:
            feature_names = vectorizer.get_feature_names()

        matrix = pd.DataFrame(tdf.tfidf.todense(), columns=feature_names)
        mca = correspondence_analysis(matrix)
        self.assertAlmostEqual(mca["mca_1"].values[0], 0.59554, 4)
        self.assertEqual(mca["node"].values[0], "over")
        self.assertAlmostEqual(mca["mca_1"].values[-1], -0.4274, 4)
        self.assertEqual(mca["node"].values[-1], "red")
예제 #13
0
    def test_compute_kmeans_clusters(self):
        """compute_kmeans_clusters labels 2000 rows into two clusters, with or without a score."""
        from pewanalytics.stats.clustering import compute_kmeans_clusters
        from pewanalytics.text import TextDataFrame

        frame = TextDataFrame(self.df, "text", min_df=50, max_df=0.5)

        # Labels only.
        labels = compute_kmeans_clusters(frame.tfidf, k=2, return_score=False)
        self.assertEqual(len(labels), 2000)
        self.assertEqual(len(set(labels)), 2)

        # Labels plus score: the call returns a pair and the score must be positive.
        scored = compute_kmeans_clusters(frame.tfidf, k=2, return_score=True)
        labels, score = scored
        self.assertEqual(len(labels), 2000)
        self.assertEqual(len(set(labels)), 2)
        self.assertGreater(score, 0)
예제 #14
0
    def test_make_word_cooccurrence_matrix(self):
        """Check the word co-occurrence matrix with and without normalization.

        A reference vocabulary is built with a ``CountVectorizer`` (unigrams,
        English stop words, ``min_df=10``, ``max_df=0.5``) fitted on the text
        column. In both modes the matrix length must equal the vocabulary size.
        Without normalization the largest cell must exceed 1.0; with
        normalization it must be exactly 1.0.
        """
        from pewanalytics.text import TextDataFrame

        tdf = TextDataFrame(self.df, "text", min_df=50, max_df=0.5)

        from sklearn.feature_extraction.text import CountVectorizer

        cv = CountVectorizer(ngram_range=(1, 1),
                             stop_words="english",
                             min_df=10,
                             max_df=0.5)
        cv.fit_transform(self.df["text"])
        # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
        # 1.2; use get_feature_names_out() when present and fall back otherwise.
        # Only the vocabulary size is used, so list vs. array does not matter.
        if hasattr(cv, "get_feature_names_out"):
            vocab = cv.get_feature_names_out()
        else:
            vocab = cv.get_feature_names()

        # Dedicated assert methods report both compared values on failure.
        mat = tdf.make_word_cooccurrence_matrix(normalize=False,
                                                min_frequency=10,
                                                max_frequency=0.5)
        self.assertEqual(len(mat), len(vocab))
        self.assertGreater(mat.max().max(), 1.0)

        mat = tdf.make_word_cooccurrence_matrix(normalize=True,
                                                min_frequency=10,
                                                max_frequency=0.5)
        self.assertEqual(len(mat), len(vocab))
        self.assertEqual(mat.max().max(), 1.0)
예제 #15
0
    def test_tdf_match_text_to_corpus(self):
        """Match "books"/"reading" against six short texts under four settings.

        Each case lists ``min_similarity``, ``allow_multiple`` and the expected
        ``match_text`` column, in corpus order; ``None`` marks a row expected
        to have no match.
        """
        from pewanalytics.text import TextDataFrame

        corpus_texts = [
            "I read books",
            "I like reading",
            "I read books",
            "reading is nice",
            "reading",
            "books",
        ]
        frame = TextDataFrame(
            pd.DataFrame([{"text": text} for text in corpus_texts]),
            "text",
        )

        cases = [
            (0.1, True,
             ["books", "reading", "books", "reading", "reading", "books"]),
            (0.5, True,
             ["books", "reading", "books", None, "reading", "books"]),
            (0.6, True,
             ["books", None, "books", None, "reading", "books"]),
            (0.5, False,
             [None, None, None, None, "reading", "books"]),
        ]
        for threshold, multiple, expected in cases:
            matched = frame.match_text_to_corpus(["books", "reading"],
                                                 min_similarity=threshold,
                                                 allow_multiple=multiple)
            observed = list(matched["match_text"].values)
            self.assertEqual(observed, expected)
예제 #16
0
    def test_tdf_find_duplicates(self):
        """Count duplicates found with exact matching, then with partial matching.

        Texts are first cut to their first 1000 characters and searched with
        ``allow_partial=False``. Then a random subset additionally loses its
        last 400 characters and the search is repeated with a lower TF-IDF
        threshold and ``allow_partial=True``.
        """
        from pewanalytics.text import TextDataFrame

        def first_thousand(text):
            return text[:1000]

        def maybe_drop_tail(text):
            # One random draw per text; a draw above 0.5 removes the last 400 characters.
            if random.random() > 0.5:
                return text[:-400]
            return text

        self.df["text"] = self.df["text"].map(first_thousand)
        exact_frame = TextDataFrame(self.df, "text")
        exact_dupes = exact_frame.find_duplicates(tfidf_threshold=0.8,
                                                  fuzzy_ratio_threshold=80,
                                                  allow_partial=False)
        self.assertEqual(len(exact_dupes), 6)

        # NOTE(review): no seed is set inside this method, so the exact count
        # asserted below presumably relies on the random state being fixed
        # elsewhere (e.g. in setUp) - confirm, otherwise this check can be flaky.
        self.df["text"] = self.df["text"].map(maybe_drop_tail)
        partial_frame = TextDataFrame(self.df, "text")
        partial_dupes = partial_frame.find_duplicates(tfidf_threshold=0.6,
                                                      fuzzy_ratio_threshold=80,
                                                      allow_partial=True)
        self.assertEqual(len(partial_dupes), 7)
예제 #17
0
    def test_get_lsa(self):
        """Check the shapes and per-column means of both outputs of ``get_lsa`` with k=5."""
        from pewanalytics.stats.dimensionality_reduction import get_lsa
        from pewanalytics.text import TextDataFrame

        frame = TextDataFrame(self.df, "text", min_df=50, max_df=0.5)
        components, results = get_lsa(frame.tfidf, k=5)
        mean_by_component = components.mean().to_dict()
        mean_by_result = results.mean().to_dict()

        self.assertEqual(components.shape[0], 2075)
        self.assertEqual(components.shape[1], 5)
        self.assertEqual(results.shape[0], 2000)
        self.assertEqual(results.shape[1], 6)

        # Expected column means, compared to two decimal places.
        expected_component_means = (
            ("lsa_0", 0.0174),
            ("lsa_1", -0.0002),
            ("lsa_2", -0.0030),
            ("lsa_3", -0.0011),
            ("lsa_4", -0.0002),
        )
        expected_result_means = (
            ("lsa_0", 0.3025),
            ("lsa_1", 0.001),
            ("lsa_2", -0.0034),
            ("lsa_3", -0.0022),
            ("lsa_4", -0.0),
        )
        for column, expected in expected_component_means:
            self.assertAlmostEqual(mean_by_component[column], expected, 2)
        for column, expected in expected_result_means:
            self.assertAlmostEqual(mean_by_result[column], expected, 2)
예제 #18
0
    def test_tdf_mutual_info(self):
        """Validate ``TextDataFrame.mutual_info`` against hand-computed values.

        A marker token ``always_pos`` is appended to every positive text.
        The ``MI1`` score for "games" is checked, raw and normalized, against
        values derived from its known counts. The top-ranked positive and
        negative terms and their summary statistics are then compared with
        fixed expectations.
        """
        from pewanalytics.text import TextDataFrame

        def tag_positive(row):
            # Positive rows get the marker token appended; others are unchanged.
            if row["outcome"]:
                return "{} always_pos".format(row["text"])
            return row["text"]

        self.df["outcome"] = (self.df["sentiment"] == "pos").astype(int)
        self.df["text"] = self.df.apply(tag_positive, axis=1)
        vectorizer_options = dict(
            min_df=50,
            max_df=0.5,
            use_idf=False,
            binary=True,
            sublinear_tf=False,
            smooth_idf=False,
            norm=None,
        )
        tdf = TextDataFrame(self.df, "text", **vectorizer_options)

        # "games" occurs 24 times in the pos class and 26 times in the neg
        # class (50 in total); the corpus has 2000 documents, 1000 of them pos.
        p_term_and_pos = 24.0 / 2000.0
        p_term = 50.0 / 2000.0
        p_pos = 1000.0 / 2000.0

        # Raw score, computed two algebraically equivalent ways.
        raw_scores = tdf.mutual_info("outcome", normalize=False)
        expected_raw = math.log(p_term_and_pos / (p_term * p_pos), 2)
        expected_raw_alt = (math.log(p_term_and_pos, 2) - math.log(p_term, 2) -
                            math.log(p_pos, 2))
        self.assertAlmostEqual(raw_scores.loc["games"]["MI1"], expected_raw, 4)
        self.assertAlmostEqual(raw_scores.loc["games"]["MI1"],
                               expected_raw_alt, 4)

        # Normalized score, again in two equivalent forms.
        mutual_info = tdf.mutual_info("outcome", normalize=True)
        expected_norm = expected_raw / (-1 * math.log(p_term_and_pos, 2))
        expected_norm_alt = (math.log(p_term * p_pos, 2) /
                             math.log(p_term_and_pos, 2)) - 1.0
        self.assertAlmostEqual(mutual_info.loc["games"]["MI1"], expected_norm,
                               4)
        self.assertAlmostEqual(mutual_info.loc["games"]["MI1"],
                               expected_norm_alt, 4)

        top_positive = mutual_info.sort_values("MI1", ascending=False)[:10]
        top_negative = mutual_info.sort_values("MI0", ascending=False)[:10]

        # The injected marker must rank first with a normalized score of 1.0.
        self.assertEqual(top_positive.index[0], "always_pos")
        self.assertEqual(top_positive.iloc[0]["MI1"], 1.0)

        self.assertEqual(top_positive.index[1], "outstanding")
        outstanding_row = top_positive.iloc[1]
        expected_outstanding = (
            ("MI1", 0.178374),
            ("MI0", -0.319942),
            ("total", 68.0),
            ("total_pos_with_term", 63.0),
            ("total_neg_with_term", 5.0),
            ("total_pos_neg_with_term_diff", 58.0),
            ("pct_pos_with_term", 0.063),
            ("pct_neg_with_term", 0.005),
            ("pct_pos_neg_with_term_diff", 0.058),
            ("pct_pos_neg_with_term_ratio", 12.6),
            ("pct_term_pos", 0.926471),
            ("pct_term_neg", 0.073529),
            ("pct_term_pos_neg_diff", 0.852941),
            ("pct_term_pos_neg_ratio", 12.6),
        )
        for column, expected in expected_outstanding:
            self.assertAlmostEqual(outstanding_row[column], expected, 4)

        self.assertEqual(top_negative.index[0], "bad")
        bad_row = top_negative.iloc[0]
        expected_bad = (
            ("MI1", -0.195836),
            ("MI0", 0.209830),
            ("total", 773.0),
            ("total_pos_with_term", 259.0),
            ("total_neg_with_term", 514.0),
            ("total_pos_neg_with_term_diff", -255.0),
            ("pct_pos_with_term", 0.259),
            ("pct_neg_with_term", 0.514),
            ("pct_pos_neg_with_term_diff", -0.255),
            ("pct_pos_neg_with_term_ratio", 0.503891),
            ("pct_term_pos", 0.335058),
            ("pct_term_neg", 0.664942),
            ("pct_term_pos_neg_diff", -0.329884),
            ("pct_term_pos_neg_ratio", 0.503891),
        )
        for column, expected in expected_bad:
            self.assertAlmostEqual(bad_row[column], expected, 4)