Exemplo n.º 1
0
 def compare_unigram_probabilities_intersection(self) -> None:
     """Compare unigram probabilities restricted to the shared vocabulary."""
     gaga_probs = unigram_probabilities(
         gen_sentences(os.path.join("test_data", "ladygaga.txt")),
         lower=True)
     lebron_probs = unigram_probabilities(
         gen_sentences(os.path.join("test_data", "KingJames.txt")),
         lower=True)
     # intersection_only=True keeps only words present in both corpora.
     diff = unigram_probabilities_difference(gaga_probs, lebron_probs,
                                             intersection_only=True)
     self.assertAlmostEqual(0.01179526361706696, diff["#"])
     self.assertAlmostEqual(0.011647311103199175, diff["the"])
     self.assertAlmostEqual(0.005001338537094464, diff["love"])
Exemplo n.º 2
0
 def compare_unigram_probabilities(self) -> None:
     """Compare unigram probabilities over the union of both vocabularies."""
     gaga_probs = unigram_probabilities(
         gen_sentences(os.path.join("test_data", "ladygaga.txt")),
         lower=True)
     lebron_probs = unigram_probabilities(
         gen_sentences(os.path.join("test_data", "KingJames.txt")),
         lower=True)
     # intersection_only=False also keeps words unique to one corpus
     # (e.g. "chromatica" appears only in the Lady Gaga tweets).
     diff = unigram_probabilities_difference(gaga_probs, lebron_probs,
                                             intersection_only=False)
     self.assertAlmostEqual(0.01179526361706696, diff["#"])
     self.assertAlmostEqual(0.011647311103199175, diff["the"])
     self.assertAlmostEqual(0.008741258741258742, diff["chromatica"])
Exemplo n.º 3
0
 def compare_bigram_probabilities(self) -> None:
     """Compare probabilities of words following "i" across two corpora."""
     gaga_path = os.path.join("test_data", "ladygaga.txt")
     lizzo_path = os.path.join("test_data", "lizzo.txt")
     bigram_diff = bigram_probabilities_difference(
         bigram_probabilities(gen_sentences(gaga_path), lower=True),
         bigram_probabilities(gen_sentences(lizzo_path), lower=True),
         "i",
         intersection_only=False,
     )
     self.assertAlmostEqual(0.08695652173913043, bigram_diff["want"])
     self.assertAlmostEqual(0.043478260869565216, bigram_diff["wish"])
Exemplo n.º 4
0
 def compare_bigram_probabilities_intersection(self) -> None:
     """Compare "i"-context bigram probabilities over shared continuations only."""
     lizzo_path = os.path.join("test_data", "lizzo.txt")
     gaga_path = os.path.join("test_data", "ladygaga.txt")
     bigram_diff = bigram_probabilities_difference(
         bigram_probabilities(gen_sentences(lizzo_path), lower=True),
         bigram_probabilities(gen_sentences(gaga_path), lower=True),
         "i",
         intersection_only=True,
     )
     self.assertAlmostEqual(0.014624505928853754, bigram_diff["hope"])
     self.assertAlmostEqual(0.011462450592885365, bigram_diff["love"])
Exemplo n.º 5
0
def explore_mini_corpus():
    """Explore and compare the documents in the provided mini-corpus."""

    def _load(filename):
        # Materialize the sentence generator so the list can be reused.
        return list(gen_sentences(os.path.join("test_data", filename)))

    # Brown samples by genre
    news_sents = _load("brown-news.txt")
    humor_sents = _load("brown-humor.txt")
    sci_fi_sents = _load("brown-science_fiction.txt")
    romance_sents = _load("brown-romance.txt")

    # Tweets
    ariana_sents = _load("ArianaGrande.txt")
    cristiano_sents = _load("Cristiano.txt")
    kingjames_sents = _load("KingJames.txt")
    gaga_sents = _load("ladygaga.txt")
    lizzo_sents = _load("lizzo.txt")

    # Examples: ten most probable unigrams per author.
    print("Lady Gaga")
    gaga_top = Counter(unigram_probabilities(gaga_sents)).most_common(10)
    for word, prob in gaga_top:
        print(f"{word}: {prob:.3f}")
    print()
    print("\nLebron James")
    lebron_top = Counter(unigram_probabilities(kingjames_sents)).most_common(10)
    for word, prob in lebron_top:
        print(f"{word}: {prob:.3f}")
    print()

    print("Difference between Lebron and Gaga unigram probabilities")
    unigram_diff = unigram_probabilities_difference(
        unigram_probabilities(gaga_sents, lower=True),
        unigram_probabilities(kingjames_sents, lower=True),
        intersection_only=False,
    )
    for word, val in Counter(unigram_diff).most_common(20):
        print(f"{word}: {val}")
    print()
    print(
        'Difference between probabilities of word following "I" for Gaga and Lizzo'
    )
    bigram_diff = bigram_probabilities_difference(
        bigram_probabilities(lizzo_sents, lower=True),
        bigram_probabilities(gaga_sents, lower=True),
        "i",
        intersection_only=True,
    )
    for word, val in Counter(bigram_diff).most_common(20):
        print(f"{word}: {val}")
Exemplo n.º 6
0
    def test_all(self) -> None:
        """Test all of gen_sentences."""
        path_1 = os.path.join("test_data", "hw1_tokenized_text_1.txt")
        path_2 = os.path.join("test_data", "hw1_tokenized_text_2.txt")

        # Type: gen_sentences must return a generator.
        self.assertEqual(GeneratorType, type(gen_sentences(path_1)))

        # Basic: two sentences of pre-tokenized text, then exhaustion.
        sentences = gen_sentences(path_1)
        self.assertEqual(
            ["Tokenized", "text", "is", "easy", "to", "work", "with", "."],
            next(sentences))
        self.assertEqual(
            ["Writing", "a", "tokenizer", "is", "a", "pain", "."],
            next(sentences))
        with self.assertRaises(StopIteration):
            next(sentences)

        # Advanced: punctuation-heavy sentences.
        sentences = gen_sentences(path_2)
        self.assertEqual(["Hello", ",", "world", "!"], next(sentences))
        # Between these sentences, the file contains a line with a single
        # space, which should be skipped over.
        self.assertEqual(["This", "is", "a", "normal", "sentence", "."],
                         next(sentences))
        self.assertEqual(
            ['"', "I", "don't", "like", "it", "when", "there's", "too",
             "much", "punctuation", "!", '"', ",", "they", "exclaimed", "."],
            next(sentences))
        with self.assertRaises(StopIteration):
            next(sentences)
Exemplo n.º 7
0
 def test_count_unigrams_type(self) -> None:
     """Every key of the unigram count mapping should be a string."""
     sentences = gen_sentences(
         os.path.join("test_data", "hw1_tokenized_text_1.txt"))
     unigram_counts = count_unigrams(sentences)
     for key in unigram_counts:
         self.assertEqual(str, type(key))
Exemplo n.º 8
0
 def test_type_unigram(self) -> None:
     """gen_unigrams should return a generator that yields strings."""
     path = os.path.join("test_data", "hw1_tokenized_text_1.txt")
     unigram_gen = gen_unigrams(gen_sentences(path))
     self.assertEqual(GeneratorType, type(unigram_gen))
     self.assertEqual(str, type(next(unigram_gen)))
Exemplo n.º 9
0
 def test_trigram_probabilities_type(self) -> None:
     """trigram_probabilities should produce a defaultdict."""
     path = os.path.join("test_data", "hw1_tokenized_text_3.txt")
     probs = trigram_probabilities(gen_sentences(path), lower=True)
     self.assertEqual(defaultdict, type(probs))
Exemplo n.º 10
0
 def test_bigram_frequency_dist(self) -> None:
     """Spot-check bigram frequency counts with original casing kept."""
     path = os.path.join("test_data", "hw1_tokenized_text_3.txt")
     dist = bigram_freq_dist(gen_sentences(path))
     self.assertEqual(2, dist["The"]["cat"])
     self.assertEqual(3, dist["The"]["dog"])
     self.assertEqual(1, dist["the"]["cat"])
     # Sentence-final period is followed by the end token.
     self.assertEqual(7, dist["."][END_TOKEN0])
Exemplo n.º 11
0
 def test_trigram_frequency_dist(self) -> None:
     """Spot-check trigram frequency counts keyed by bigram context."""
     path = os.path.join("test_data", "hw1_tokenized_text_3.txt")
     dist = trigram_freq_dist(gen_sentences(path))
     self.assertEqual(2, dist[("The", "dog")]["drank"])
     self.assertEqual(1, dist[("squirrel", "ate")]["peanuts"])
     # Sentence boundaries: every sentence contributes one end trigram
     # and one start trigram.
     self.assertEqual(7, dist[(".", END_TOKEN0)][END_TOKEN1])
     self.assertEqual(7, dist[(START_TOKEN1, START_TOKEN0)]["The"])
     self.assertEqual(3, sum(dist[("The", "dog")].values()))
Exemplo n.º 12
0
 def test_count_trigrams_type(self) -> None:
     """Test trigrams are tuples with 3 strings"""
     gen = gen_sentences(
         os.path.join("test_data", "hw1_tokenized_text_1.txt"))
     counts = count_trigrams(gen)
     for k in counts:
         # Each key is the trigram itself: a tuple of exactly 3 tokens.
         self.assertEqual(tuple, type(k))
         self.assertEqual(3, len(k))
         # The docstring promises string elements; check them too.
         for token in k:
             self.assertEqual(str, type(token))
Exemplo n.º 13
0
 def test_count_trigrams_lower(self) -> None:
     """Test count trigrams with lowercasing."""
     path = os.path.join("test_data", "hw1_tokenized_text_3.txt")
     trigrams = count_trigrams(gen_sentences(path), lower=True)
     # With lower=True the capitalized form must never appear as a key.
     self.assertEqual(0, trigrams[(START_TOKEN1, START_TOKEN0, "The")])
     self.assertEqual(7, trigrams[(START_TOKEN1, START_TOKEN0, "the")])
Exemplo n.º 14
0
 def test_count_trigrams(self) -> None:
     """Test count trigrams with casing."""
     path = os.path.join("test_data", "hw1_tokenized_text_3.txt")
     trigrams = count_trigrams(gen_sentences(path))
     self.assertEqual(2, trigrams[("The", "dog", "drank")])
     self.assertEqual(1, trigrams[("squirrel", "ate", "peanuts")])
     # Boundary trigrams: one per sentence on each side.
     self.assertEqual(7, trigrams[(".", END_TOKEN0, END_TOKEN1)])
     self.assertEqual(7, trigrams[(START_TOKEN1, START_TOKEN0, "The")])
Exemplo n.º 15
0
 def test_unigram_probabilities(self) -> None:
     """Spot-check lowercased unigram probabilities."""
     path = os.path.join("test_data", "hw1_tokenized_text_3.txt")
     probs = unigram_probabilities(gen_sentences(path), lower=True)
     self.assertAlmostEqual(0.26315789, probs["the"])
     self.assertAlmostEqual(0.07894736, probs["dog"])
     self.assertAlmostEqual(0.02631578, probs["pizza"])
     # Unseen words default to probability zero.
     self.assertEqual(0, probs["cookies"])
Exemplo n.º 16
0
 def test_trigram_frequency_dist_lower(self) -> None:
     """Trigram frequency distribution with lowercasing applied."""
     path = os.path.join("test_data", "hw1_tokenized_text_3.txt")
     dist = trigram_freq_dist(gen_sentences(path), lower=True)
     # Capitalized contexts should no longer exist.
     self.assertEqual(0, dist[("The", "cat")]["ate"])
     self.assertEqual(2, dist[("the", "dog")]["drank"])
     self.assertEqual(1, dist[("the", "cat")]["ate"])
     self.assertEqual(3, sum(dist[("the", "cat")].values()))
Exemplo n.º 17
0
 def test_count_unigrams_lower(self) -> None:
     """Test count unigrams with lowercase option=True"""
     path = os.path.join("test_data", "hw1_tokenized_text_3.txt")
     unigrams = count_unigrams(gen_sentences(path), lower=True)
     # "The" and "the" fold together under lower=True (7 + 3 = 10).
     self.assertEqual(10, unigrams["the"])
     self.assertEqual(3, unigrams["cat"])
     self.assertEqual(3, unigrams["dog"])
     self.assertEqual(7, unigrams["."])
     self.assertEqual(1, unigrams["pizza"])
Exemplo n.º 18
0
    def test_count_bigrams(self) -> None:
        """Test count bigrams with case"""
        bigrams = count_bigrams(
            gen_sentences(os.path.join("test_data",
                                       "hw1_tokenized_text_3.txt")))

        self.assertEqual(3, bigrams[("The", "dog")])
        self.assertEqual(2, bigrams[("squirrel", "ate")])
        self.assertEqual(2, bigrams[("The", "cat")])
        self.assertEqual(1, bigrams[("the", "cat")])
        self.assertEqual(1, bigrams[("drank", "coffee")])
        # Explicit tuple key, consistent with the other lookups above
        # (the original relied on an implicit tuple in the subscript).
        self.assertEqual(7, bigrams[(".", END_TOKEN0)])
Exemplo n.º 19
0
 def test_trigram_probabilities(self) -> None:
     """Spot-check lowercased trigram probabilities."""
     path = os.path.join("test_data", "hw1_tokenized_text_3.txt")
     probs = trigram_probabilities(gen_sentences(path), lower=True)
     self.assertAlmostEqual(0.5, probs[("dog", "drank")]["coffee"])
     self.assertAlmostEqual(0.4285714285,
                            probs[(START_TOKEN0, "the")]["dog"])
     self.assertAlmostEqual(0.5, probs[("squirrel", "ate")]["peanuts"])
     # Unseen contexts default to probability zero.
     self.assertEqual(0, probs[("cookies", "are")]["good"])
Exemplo n.º 20
0
 def test_bigram_probabilities(self) -> None:
     """Spot-check lowercased bigram probabilities."""
     path = os.path.join("test_data", "hw1_tokenized_text_3.txt")
     probs = bigram_probabilities(gen_sentences(path), lower=True)
     self.assertAlmostEqual(0.3, probs["the"]["dog"])
     # Every sentence in this file starts with "the" after lowercasing.
     self.assertEqual(1, probs[START_TOKEN0]["the"])
     self.assertEqual(1, probs["squirrel"]["ate"])
     # Unseen contexts default to probability zero.
     self.assertEqual(0, probs["cookies"]["are"])
Exemplo n.º 21
0
    def test_count_unigrams(self) -> None:
        """Test count unigrams with case"""
        path = os.path.join("test_data", "hw1_tokenized_text_3.txt")
        unigrams = count_unigrams(gen_sentences(path))

        # Expected counts with casing preserved.
        expected = {
            "The": 7,
            "dog": 3,
            "cat": 3,
            "the": 3,
            ".": 7,
            "pizza": 1,
        }
        for word, count in expected.items():
            self.assertEqual(count, unigrams[word])
Exemplo n.º 22
0
 def test_type_trigrams(self) -> None:
     """gen_trigrams should return a generator that yields tuples."""
     path = os.path.join("test_data", "hw1_tokenized_text_1.txt")
     trigram_gen = gen_trigrams(gen_sentences(path))
     self.assertEqual(GeneratorType, type(trigram_gen))
     self.assertEqual(tuple, type(next(trigram_gen)))
Exemplo n.º 23
0
def debug_functions():
    """Print a small sample of output for every function in hw1.

    Intended as a debugging aid; modify freely.
    """
    # The sentences are consumed several times, so materialize them once.
    news_sents = list(
        gen_sentences(os.path.join("test_data", "brown-news.txt")))

    # N-gram generators ----------------------------------------------------
    print("----------N-gram Generators----------")
    for label, gen_fn in (("Unigrams", gen_unigrams),
                          ("Bigrams", gen_bigrams),
                          ("Trigrams", gen_trigrams)):
        grams = list(gen_fn(news_sents))
        print(f"{label}:\n {grams[:8]}\n")
    print()

    # Counts ----------------------------------------------------------------
    print("----------Counts----------")
    for label, count_fn in (("Unigram", count_unigrams),
                            ("Bigram", count_bigrams),
                            ("Trigram", count_trigrams)):
        print(f"--{label} Counts--")
        counts = count_fn(news_sents)
        for gram in list(counts)[:3]:
            print(f"{gram}: {counts[gram]}")
        print()

    # Frequency distributions ------------------------------------------------
    print("----------Frequency Distributions ----------")
    print("Bigram Frequency Distribution")
    bigram_freq = bigram_freq_dist(news_sents)
    for context in list(bigram_freq)[:2]:
        print(f"\t{context}: ")
        for word, count in Counter(bigram_freq[context]).most_common(3):
            print(f"\t\t{word}: {count}")
    print()
    print("Trigram Frequency Distribution")
    trigram_freq = trigram_freq_dist(news_sents)
    for context in list(trigram_freq)[:2]:
        print(f"\t{context}: ")
        for word, count in Counter(trigram_freq[context]).most_common(3):
            print(f"\t\t{word}: {count}")
    print()

    # Probabilities -----------------------------------------------------------
    print("----------Probabilities----------")
    print("Unigram probabilities")
    uni_probs = unigram_probabilities(news_sents)
    for word in list(uni_probs)[:3]:
        print(f"\t{word}: {uni_probs[word]:.3f}")
    print()

    print("Bigram probabilities")
    bi_probs = bigram_probabilities(news_sents)
    for context in list(bi_probs)[:3]:
        print(f"\t{context}:")
        for word, prob in Counter(bi_probs[context]).most_common(3):
            print(f"\t\t{word}: {prob:.5f}")
    print()

    print("Trigram probabilities")
    tri_probs = trigram_probabilities(news_sents)
    for context in list(tri_probs)[:3]:
        print(f"\t{context}:")
        for word, prob in Counter(tri_probs[context]).most_common(3):
            print(f"\t\t{word}: {prob:.3f}")
    print()