def test_level_1(self):
    """Check level-1 PHOCs built over a five-symbol unigram map.

    Each case pairs a word with the tuple expected from
    ``new_unigram_phoc`` when ``unigram_levels=[1]``: one entry per mapped
    unigram, 1 for the symbols present in the word and 0 otherwise.
    Repeating the symbols ("1234512345") is expected to give the same
    code as a single occurrence of each.
    """
    unigram_map = {"1": 0, "2": 1, "3": 2, "4": 3, "5": 4}
    cases = (
        ("12345", (1, 1, 1, 1, 1)),
        ("34", (0, 0, 1, 1, 0)),
        ("1234512345", (1, 1, 1, 1, 1)),
    )
    for word, expected in cases:
        actual = new_unigram_phoc(
            word, unigram_map=unigram_map, unigram_levels=[1]
        )
        self.assertEqual(actual, expected)
def test_missing_unigram_warning(self):
    """Check that ``ignore_missing=True`` tolerates an unmapped symbol.

    The map has no entry for "3", yet the word "12345" must still be
    encoded; the expected level-1 code has one entry per mapped unigram,
    all set to 1.

    NOTE(review): the test name mentions a warning, but only the returned
    PHOC is asserted here - nothing checks that a warning is emitted.
    """
    # Same mapping as {"1": 0, "2": 1, "4": 2, "5": 3}; "3" is left out
    # on purpose.
    unigram_map = {symbol: index for index, symbol in enumerate("1245")}
    result = new_unigram_phoc(
        "12345",
        unigram_map=unigram_map,
        unigram_levels=[1],
        ignore_missing=True,
    )
    self.assertEqual(result, (1, 1, 1, 1))
def test_missing_unigram_exception(self):
    """Check that an unmapped symbol raises ``KeyError`` by default.

    The map has no entry for "3" and ``ignore_missing`` is not passed, so
    encoding "12345" is expected to fail with ``KeyError``.
    """
    # "3" is deliberately absent from the mapping.
    unigram_map = {"1": 0, "2": 1, "4": 2, "5": 3}
    # Callable form of assertRaises: the call is made by the assertion.
    self.assertRaises(
        KeyError,
        new_unigram_phoc,
        "12345",
        unigram_map=unigram_map,
        unigram_levels=[1],
    )
# Recover the George Washington alphabet: every distinct character that
# appears in some word of the vocabulary, in sorted order.
alphabet = set()
for word in vocabulary:
    # Iterating a string yields its characters, so no intermediate list
    # is needed.
    alphabet.update(word)
alphabet = sorted(alphabet)

# Obtain the different PHOCs and count how many words produce the same
# PHOC code, increasing the PHOC level until every word has its own code.
# NOTE(review): the loop only stops once *every* word has a unique PHOC;
# if `vocabulary` holds repeated words (or words that share a code at
# every level) it would never finish - confirm against the caller.
unigram_map = {c: i for i, c in enumerate(alphabet)}
phoc_levels = 1
done = False
while not done:
    # Number of vocabulary words mapped to each PHOC code at this level.
    phoc_counter = {}
    for word in vocabulary:
        phoc = new_unigram_phoc(word, unigram_map, [phoc_levels])
        phoc_counter[phoc] = phoc_counter.get(phoc, 0) + 1
    # Keep only the PHOC codes produced by exactly one word.
    unique_phocs = [
        phoc for phoc, counter in viewitems(phoc_counter) if counter == 1
    ]
    # Report the level, the number of unique codes and the fraction of the
    # vocabulary they cover. The float() conversion keeps the ratio exact
    # under Python 2 integer division, and the guard avoids a
    # ZeroDivisionError when the vocabulary is empty.
    num_words = len(vocabulary)
    unique_ratio = float(len(unique_phocs)) / num_words if num_words else 0.0
    print(phoc_levels, len(unique_phocs), unique_ratio)
    phoc_levels += 1
    done = len(unique_phocs) == num_words