def test_isomorphic_multiple_possibilities_simple(self):
    """Check isomorphism when the two ``origin`` alternatives look alike.

    Three grammars are built with differently named (and, for the third,
    differently ordered) non-terminals, handed to ``check_isomorphism``
    together, and each is then asserted not to be isomorphic with the
    ``hello_world`` fixture.
    """
    named_rules = {
        "origin": ["<a> world", "<b> world"],
        "a": ["<hello>"],
        "b": ["<world>"],
        "hello": ["hello", "hi", "hey"],
        "world": ["world", "universe"],
    }
    numbered_rules = {
        "origin": ["<1> world", "<2> world"],
        "1": ["<h>"],
        "2": ["<w>"],
        "h": ["hello", "hi", "hey"],
        "w": ["world", "universe"],
    }
    # Same as ``numbered_rules`` but with the roles of <1> and <2> swapped.
    swapped_rules = {
        "origin": ["<1> world", "<2> world"],
        "1": ["<w>"],
        "2": ["<h>"],
        "w": ["world", "universe"],
        "h": ["hello", "hi", "hey"],
    }
    grammars = [
        ContextFreeGrammar.from_string(rules)
        for rules in (named_rules, numbered_rules, swapped_rules)
    ]

    # All three grammars are checked against each other by the helper.
    self.check_isomorphism(*grammars)

    # None of them may match the plain hello-world fixture.
    for grammar in grammars:
        self.assertFalse(grammar.is_isomorphic_with(self.hello_world))
def check_grammar_induction_correctness(
    self,
    expected_grammar: ContextFreeGrammar,
    dataset: List[str] = None,
    words_per_slot=1,
    prune_redundant=True,
    minimal_variables=True,
) -> ContextFreeGrammar:
    """Induce a grammar from ``dataset`` and compare it with ``expected_grammar``.

    When ``dataset`` is ``None`` the strings produced by
    ``expected_grammar.generate_all_string()`` are used instead. The induced
    grammar is printed, must expand to exactly the dataset, must be
    isomorphic with ``expected_grammar`` and must have a non-empty string
    representation.

    Returns the induced grammar so callers can run further assertions.
    """
    examples = expected_grammar.generate_all_string() if dataset is None else dataset

    induction_options = {
        "words_per_slot": words_per_slot,
        "prune_redundant": prune_redundant,
        "minimal_variables": minimal_variables,
    }
    induced_grammar = grammar_induction.induce_grammar_using_template_trees(
        examples, **induction_options
    )
    print(induced_grammar)

    # The induced grammar has to regenerate the very dataset it learned from.
    self.check_grammar_expansion(induced_grammar, examples)

    # Structure must match the expected grammar up to renaming.
    matches_expected = expected_grammar.is_isomorphic_with(induced_grammar)
    self.assertTrue(matches_expected)

    # Rendering to a string must work without raising and yield some text.
    rendered = str(induced_grammar)
    self.assertTrue(len(rendered) > 0)

    return induced_grammar
def setUp(self):
    """Seed ``random`` with 42 and build the two hello-world fixture grammars."""
    random.seed(42)

    small_rules = {
        "origin": ["<hello> <world>"],
        "hello": ["hello", "hi", "hey"],
        "world": ["world", "universe", "earth"],
    }
    full_rules = {
        # NOTE(review): "origin" is a bare string here, while other fixtures
        # pass a list of alternatives - confirm from_string accepts both.
        "origin": "<hello>, <location>!",
        "hello": ["Hello", "Greetings", "Howdy", "Hey"],
        "location": ["world", "solar system", "galaxy", "universe"],
    }

    self.hello_world_small = ContextFreeGrammar.from_string(small_rules)
    self.hello_world_full = ContextFreeGrammar.from_string(full_rules)
def setUp(self) -> None:
    """Seed ``random`` with 123 and build the fixture grammars for this test case."""
    random.seed(123)

    small_rules = {
        "origin": ["<hello> <world>"],
        "hello": ["hello", "hi", "hey"],
        "world": ["world", "universe", "earth"],
    }
    # Two origin templates that share the <location> slot.
    greeting_and_adjective_rules = {
        "origin": ["<hello>, <location>!", "The <location> is <adjective>"],
        "hello": ["Hello", "Greetings", "Howdy", "Hey"],
        "location": ["universe", "earth", "world", "solar system"],
        "adjective": ["pretty", "cool", "amazing"],
    }

    self.hello_world_small = ContextFreeGrammar.from_string(small_rules)
    self.hello_world_and_world_adjective = ContextFreeGrammar.from_string(
        greeting_and_adjective_rules
    )
def test_modifier_removal_small(self):
    """``replace_modifier_variables`` must turn ``#name.modifier#`` into ``#name#``."""
    # (expected result, input containing a modifier)
    cases = (
        ("#a#", "#a.bla#"),
        ("#a#", "#a.title#"),
        ("#b#", "#b.title#"),
        ("#blabla#", "#blabla.title#"),
    )
    for expected, with_modifier in cases:
        self.assertEqual(
            expected, ContextFreeGrammar.replace_modifier_variables(with_modifier)
        )
def test_isomorphic_recursive(self):
    """Isomorphism checks on grammars whose ``origin`` refers to itself.

    Two grammars differing only in one non-terminal name go through
    ``check_isomorphism``; two variants with a changed terminal are asserted
    not to be isomorphic with the first grammar.
    """
    reference = ContextFreeGrammar.from_string(
        {"origin": ["<a>", "a <origin>"], "a": ["world"]}
    )
    renamed = ContextFreeGrammar.from_string(
        {"origin": ["<b>", "a <origin>"], "b": ["world"]}
    )
    # Differs in the terminal produced by the non-recursive branch.
    different_leaf = ContextFreeGrammar.from_string(
        {"origin": ["<b>", "a <origin>"], "b": ["earth"]}
    )
    # Differs in the literal prefix of the recursive branch.
    different_prefix = ContextFreeGrammar.from_string(
        {"origin": ["<b>", "b <origin>"], "b": ["world"]}
    )

    self.check_isomorphism(reference, renamed)
    for conflicting in (different_leaf, different_prefix):
        self.assertFalse(reference.is_isomorphic_with(conflicting))
def test_repeat_2_missing_data(self):
    """Induce a grammar with a repeated slot from a hand-picked subset of sentences.

    The dataset lists only five sentences; the induced grammar must still be
    isomorphic with the expected grammar and must not be recursive.
    """
    expected_rules = {
        "origin": [
            "I like <X> and <X>",
            "<X> are not supposed to be in the zoo",
        ],
        "X": ["cats", "dogs", "geese", "bananas"],
    }
    expected_grammar = ContextFreeGrammar.from_string(expected_rules)

    sentences = [
        "I like cats and dogs",
        "I like bananas and geese",
        "I like geese and cats",
        "bananas are not supposed to be in the zoo",
        "geese are not supposed to be in the zoo",
    ]

    induced = grammar_induction.induce_grammar_using_template_trees(
        sentences,
        words_per_slot=1,
        prune_redundant=True,
        minimal_variables=True,
        max_recalculation=None,
        relative_similarity_threshold=0.01,
    )
    print(induced)

    self.assertTrue(expected_grammar.is_isomorphic_with(induced))
    self.assertFalse(induced.is_recursive())
def test_prohibit_empty_string(self):
    """Induction with ``allow_empty_string=False`` must give the nested expected grammar.

    The expected grammar models the optional adjective with a nested
    non-terminal (``<hill_adj> hill``) and contains no empty alternative.
    """
    sentences = [
        "I saw her on the quiet hill",
        "I saw her on the tall hill",
        "I saw her on the hill",
        "He likes cute cats",
        "He likes nice cats",
        "He likes cats",
    ]
    expected_rules = {
        "origin": ["I saw her on the <hill>", "He likes <cats>"],
        "hill": ["hill", "<hill_adj> hill"],
        "hill_adj": ["quiet", "tall"],
        "cats": ["cats", "<cat_adj> cats"],
        "cat_adj": ["nice", "cute"],
    }
    expected_grammar = ContextFreeGrammar.from_string(expected_rules)

    # NOTE(review): ``log_tree`` is not defined inside this method; presumably
    # it is a module-level name - confirm it exists in the test module.
    induced = grammar_induction.induce_grammar_using_template_trees(
        sentences,
        words_per_slot=1,
        relative_similarity_threshold=0.1,
        allow_empty_string=False,
        log_tree=log_tree,
    )
    print(induced)

    self.assertTrue(expected_grammar.is_isomorphic_with(induced))
def induce_grammar_using_template_trees(
    lines: Collection[str],
    relative_similarity_threshold: float = 1,
    minimal_variables: bool = True,
    words_per_slot: int = 1,
    prune_redundant: bool = True,
    max_recalculation: Optional[int] = None,
    use_best_merge_candidate=True,
    max_depth: Optional[int] = None,
):
    """Induce a ``ContextFreeGrammar`` from ``lines`` via a learned template tree.

    Steps, in order:

    1. Learn a tree from ``lines`` with ``TemplateLatticeLearner``.
    2. Optionally prune redundant abstractions (``prune_redundant``).
    3. Name/simplify the tree and collapse it with the derived slot values.
    4. Recalculate templates and re-simplify until the simplified tree equals
       the previous one, or until ``max_recalculation`` rounds have run
       (``None`` means no limit on the number of rounds).
    5. Collapse once more, optionally reduce the depth to ``max_depth``, and
       build the grammar from the resulting template and slot values.

    ``relative_similarity_threshold`` is forwarded to
    ``_name_and_simplify_tree``; ``minimal_variables``, ``words_per_slot`` and
    ``use_best_merge_candidate`` are forwarded to the learner
    (``minimal_variables`` also to ``recalculate_templates``).

    Returns the grammar created by ``ContextFreeGrammar.from_slot_values``.
    """
    # Step 1: learn the initial tree from the dataset.
    learner = TemplateLatticeLearner(
        minimal_variables=minimal_variables,
        words_per_leaf_slot=words_per_slot,
        use_best_merge_candidate=use_best_merge_candidate,
    )
    tree = learner.learn(lines)

    # Step 2: a child covered by the other children of its parent is not needed.
    if prune_redundant:
        tree = tree.prune_redundant_abstractions()

    # Step 3: first naming / simplification pass.
    slot_values, current_tree = _name_and_simplify_tree(
        tree, relative_similarity_threshold
    )
    current_tree = current_tree.collapse_using_slot_values(slot_values)

    # Step 4: iterate until the simplified tree stops changing.
    candidate_tree = None
    rounds = 0
    while current_tree != candidate_tree:
        within_budget = max_recalculation is None or rounds < max_recalculation
        if not within_budget:
            break
        if candidate_tree is not None:
            # Promote the result of the previous round before recalculating.
            current_tree = candidate_tree
        recalculated = current_tree.recalculate_templates(
            minimal_variables=minimal_variables
        )
        slot_values, candidate_tree = _name_and_simplify_tree(
            recalculated, relative_similarity_threshold
        )
        rounds += 1

    # Step 5: final collapse with the most recently derived slot values.
    final_tree = current_tree.collapse_using_slot_values(slot_values)
    if max_depth is not None:
        final_tree = final_tree.reduce_depth(max_depth)

    final_slot_values = final_tree.get_slot_values()
    return ContextFreeGrammar.from_slot_values(
        final_tree.get_template(),
        final_slot_values,
    )
def test_from_string(self):
    """``from_string`` must parse a plain dict into slots, templates and strings."""
    # Parsed form of "<B>, world": the slot, the comma and the word are separate elements.
    slot_comma_world = Template(
        [
            NamedTemplateSlot("B"),
            TemplateString(","),
            TemplateString("world"),
        ]
    )
    hi_only = Template([TemplateString("hi")])
    hello_only = Template([TemplateString("hello")])

    expected = ContextFreeGrammar(
        {
            NamedTemplateSlot("A"): [slot_comma_world, hi_only],
            NamedTemplateSlot("B"): [hello_only],
        }
    )

    raw_rules = {"A": ["<B>, world", "hi"], "B": ["hello"]}
    parsed = ContextFreeGrammar.from_string(raw_rules)

    self.assertEqual(expected, parsed)
def setUp(self):
    """Seed ``random`` with 123 and build the grammar fixtures shared by the tests."""
    random.seed(123)

    # Grammar with a single, slot-free origin rule.
    simple_rules = {"origin": ["expands only to one texts"]}
    self.simple = ContextFreeGrammar.from_string(simple_rules)

    # Two slots with several alternatives each.
    hello_world_rules = {
        "origin": ["<hello> <world>"],
        "hello": ["hello", "hi", "hey"],
        "world": ["world", "universe"],
    }
    self.hello_world = ContextFreeGrammar.from_string(hello_world_rules)

    # Two single-alternative variants that differ only in slot names.
    single_a_rules = {
        "origin": ["<hello> <world>"],
        "hello": ["hello"],
        "world": ["world"],
    }
    self.hello_world_single_a = ContextFreeGrammar.from_string(single_a_rules)

    single_b_rules = {"origin": ["<a> <b>"], "a": ["hello"], "b": ["world"]}
    self.hello_world_single_b = ContextFreeGrammar.from_string(single_b_rules)
def test_slot_repeat(self):
    """Induction must recover a grammar whose origin uses the same slot twice."""
    repeated_slot_grammar = ContextFreeGrammar.from_string(
        {
            "origin": ["<a> <a>"],
            "a": ["1", "2", "3"],
        }
    )
    induced = self.check_grammar_induction_correctness(
        repeated_slot_grammar,
        words_per_slot=1,
        minimal_variables=False,
    )
    print(induced)
def test_not_isomorphic_same_keys(self):
    """Grammars with identical keys but different terminals must not be isomorphic."""
    word_rules = {
        "origin": ["<a> <world>"],
        "a": ["<hello>"],
        "hello": ["hello", "hi", "hey"],
        "world": ["world", "universe"],
    }
    # Same keys and shape, but every terminal is replaced by a single letter.
    letter_rules = {
        "origin": ["<a> <world>"],
        "a": ["<hello>"],
        "hello": ["a", "b", "c"],
        "world": ["d", "e"],
    }
    word_grammar = ContextFreeGrammar.from_string(word_rules)
    letter_grammar = ContextFreeGrammar.from_string(letter_rules)

    # The check is run in both directions.
    self.assertFalse(word_grammar.is_isomorphic_with(letter_grammar))
    self.assertFalse(letter_grammar.is_isomorphic_with(word_grammar))
def test_isomorphic_advanced(self):
    """Isomorphism between two larger grammars that differ only in naming.

    Each grammar must be isomorphic with itself and with the other (in both
    directions), and neither may be isomorphic with the ``hello_world``
    fixture.
    """
    # "f" and "g" are defined but not listed among the origin alternatives.
    lettered_rules = {
        "origin": ["<a>", "<b>", "<c>", "<d>", "<e>"],
        "a": ["<hello>"],
        "b": ["<world>"],
        "c": ["<world>"],
        "d": ["<world>", "<hello>"],
        "e": ["<hello> <world>", "<hello>"],
        "f": ["<a>", "<hello>"],
        "g": ["<a>"],
        "hello": ["hello", "hi", "hey"],
        "world": ["world", "universe"],
    }
    # "6" and "7" are likewise defined but not listed in origin.
    numbered_rules = {
        "origin": ["<1>", "<2>", "<3>", "<4>", "<5>"],
        "1": ["<h>"],
        "2": ["<w>"],
        "3": ["<w>"],
        "4": ["<w>", "<h>"],
        "5": ["<h> <w>", "<h>"],
        "6": ["<1>", "<h>"],
        "7": ["<1>"],
        "h": ["hello", "hi", "hey"],
        "w": ["world", "universe"],
    }
    lettered = ContextFreeGrammar.from_string(lettered_rules)
    numbered = ContextFreeGrammar.from_string(numbered_rules)

    # Self-comparisons first, then the cross-comparisons in both directions.
    isomorphic_pairs = (
        (lettered, lettered),
        (numbered, numbered),
        (lettered, numbered),
        (numbered, lettered),
    )
    for left, right in isomorphic_pairs:
        self.assertTrue(left.is_isomorphic_with(right))

    for grammar in (lettered, numbered):
        self.assertFalse(grammar.is_isomorphic_with(self.hello_world))
def test_get_depth(self):
    """``get_depth`` must report the expected depth for the fixtures and for a chain."""
    fixture_depths = (
        (1, self.simple),
        (2, self.hello_world),
        (2, self.hello_world_single_a),
        (2, self.hello_world_single_b),
    )
    for expected_depth, grammar in fixture_depths:
        self.assertEqual(expected_depth, grammar.get_depth())

    # origin -> A -> B -> C -> "hi" is expected to have depth 4.
    chain = ContextFreeGrammar.from_string(
        {"origin": ["<A>"], "A": ["<B>"], "B": ["<C>"], "C": ["hi"]}
    )
    self.assertEqual(4, chain.get_depth())
def test_isomorphic_multiple_nt_refs(self):
    """Isomorphism when several non-terminals have identical right-hand sides.

    The first two grammars differ only in naming and key order. The
    conflicting grammar uses ``<1>`` in two origin alternatives where the
    others use two distinct non-terminals, and must not be isomorphic with
    either of them.
    """
    lettered_rules = {
        "origin": ["<a> world", "<c> world", "<b> <world>"],
        "a": ["<hello>"],
        "c": ["<hello>"],
        "b": ["<world>", "<hello>"],
        "hello": ["hello", "hi", "hey", "<world>"],
        "world": ["world", "universe"],
    }
    numbered_rules = {
        "origin": ["<1> world", "<3> world", "<2> <w>"],
        "3": ["<h>"],
        "1": ["<h>"],
        "2": ["<w>", "<h>"],
        "h": ["hello", "hi", "hey", "<w>"],
        "w": ["world", "universe"],
    }
    conflicting_rules = {
        "origin": ["<1> world", "<1> world", "<2> <w>"],
        "1": ["<h>"],
        "2": ["<w>", "<h>"],
        "h": ["hello", "hi", "hey", "<w>"],
        "w": ["world", "universe"],
    }
    lettered = ContextFreeGrammar.from_string(lettered_rules)
    numbered = ContextFreeGrammar.from_string(numbered_rules)
    conflicting = ContextFreeGrammar.from_string(conflicting_rules)

    # Positive checks: with itself and with the renamed variant.
    self.assertTrue(lettered.is_isomorphic_with(lettered))
    self.check_isomorphism(lettered, numbered)

    # Negative checks: first against the conflicting grammar, then the fixture.
    for other in (conflicting, self.hello_world):
        self.assertFalse(lettered.is_isomorphic_with(other))
        self.assertFalse(numbered.is_isomorphic_with(other))
def test_isomorphic_nested(self):
    """Isomorphism for grammars with one level of non-terminal indirection."""
    nested_rules = {
        "origin": ["<a> <world>"],
        "a": ["<hello>"],
        "hello": ["hello", "hi", "hey"],
        "world": ["world", "universe"],
    }
    renamed_rules = {
        "origin": ["<b> <w>"],
        "b": ["<c>"],
        "c": ["hello", "hi", "hey"],
        "w": ["world", "universe"],
    }
    nested = ContextFreeGrammar.from_string(nested_rules)
    renamed = ContextFreeGrammar.from_string(renamed_rules)

    self.check_isomorphism(nested, renamed)

    # The single-alternative fixture must not match the nested grammar.
    self.assertFalse(nested.is_isomorphic_with(self.hello_world_single_a))
def test_hello_world_multiple_origin_options(self):
    """Induction must recover a grammar with two origin templates sharing a slot."""
    two_template_rules = {
        "origin": ["<hello>, <location>!", "The <location> is <adjective>"],
        "hello": ["Hello", "Greetings", "Howdy", "Hey"],
        "location": ["world", "universe", "earth"],
        "adjective": ["pretty", "cool", "awesome"],
    }
    expected_grammar = ContextFreeGrammar.from_string(two_template_rules)

    # The helper generates the dataset from the grammar and runs all checks.
    induced = self.check_grammar_induction_correctness(
        expected_grammar,
        words_per_slot=1,
    )
    print(induced)
def test_hello_world_multiple_deep(self):
    """Induction must recover a grammar with a nested slot and stay non-recursive."""
    # Slot <b> has one alternative ("- <c>") that refers to a further slot.
    nested_rules = {
        "origin": ["<a>, <b>!"],
        "a": ["1", "2", "3"],
        "b": ["4", "5", "6", "- <c>"],
        "c": ["7", "8", "9"],
    }
    expected_grammar = ContextFreeGrammar.from_string(nested_rules)

    induced = self.check_grammar_induction_correctness(
        expected_grammar,
        words_per_slot=2,
        minimal_variables=True,
    )
    print(induced)

    self.assertFalse(induced.is_recursive())
def test_isomorphic_advanced_self(self):
    """A larger grammar with overlapping rules must be isomorphic with itself."""
    # "f" and "g" are defined but not listed among the origin alternatives.
    rules = {
        "origin": ["<a>", "<b>", "<c>", "<d>", "<e>"],
        "a": ["<hello>"],
        "b": ["<world>"],
        "c": ["<world>"],
        "d": ["<hello>", "<world>"],
        "e": ["<hello> <world>", "<hello>"],
        "f": ["<a>", "<hello>"],
        "g": ["<a>"],
        "hello": ["hello", "hi", "hey"],
        "world": ["world", "universe"],
    }
    grammar = ContextFreeGrammar.from_string(rules)

    self.assertTrue(grammar.is_isomorphic_with(grammar))
def test_no_recursion(self):
    """The induced grammar must not be recursive when a slot repeats in a template."""
    source_rules = {
        "origin": ["<hello> <location>!", "<hello> there <hello> kid"],
        "hello": ["hello", "greetings"],
        "location": ["world", "earth"],
    }
    source_grammar = ContextFreeGrammar.from_string(source_rules)

    # Full expansion of the source grammar, in sorted order.
    sentences = sorted(
        generated.to_flat_string() for generated in source_grammar.generate_all()
    )

    induced = grammar_induction.induce_grammar_using_template_trees(
        sentences,
        words_per_slot=1,
        prune_redundant=True,
        minimal_variables=True,
        max_recalculation=None,
    )
    print(induced)

    self.assertFalse(induced.is_recursive())
def test_repeat_2(self):
    """Induce a repeated-slot grammar from its complete, sorted expansion.

    The induced grammar must be isomorphic with the source grammar and must
    not be recursive.
    """
    source_rules = {
        "origin": [
            "I really like <X> and <X>",
            "<X> are not supposed to be in the zoo",
        ],
        "X": ["cats", "dogs", "geese", "bananas"],
    }
    source_grammar = ContextFreeGrammar.from_string(source_rules)

    sentences = sorted(
        generated.to_flat_string() for generated in source_grammar.generate_all()
    )

    induced = grammar_induction.induce_grammar_using_template_trees(
        sentences,
        words_per_slot=1,
        prune_redundant=True,
        minimal_variables=True,
        max_recalculation=None,
    )
    print(induced)

    self.assertTrue(source_grammar.is_isomorphic_with(induced))
    self.assertFalse(induced.is_recursive())
def test_from_text_notebook_example(self):
    """Induce the expected three-slot grammar from five example sentences."""
    sentences = [
        "I like my cat and my dog",
        "I like my dog and my chicken",
        "Alice the cat is jumping",
        "Bob the dog is walking",
        "Cathy the cat is walking",
    ]
    # <C> is shared between both origin templates.
    expected_rules = {
        "origin": ["<G> the <C> is <D>", "I like my <C> and my <C>"],
        "C": ["cat", "chicken", "dog"],
        "G": ["Alice", "Bob", "Cathy"],
        "D": ["jumping", "walking"],
    }
    expected_grammar = ContextFreeGrammar.from_string(expected_rules)

    induced = grammar_induction.induce_grammar_using_template_trees(
        sentences,
        words_per_slot=1,
        relative_similarity_threshold=0.1,
    )

    self.assertTrue(expected_grammar.is_isomorphic_with(induced))
def test_not_joining_empty_string(self):
    """Slots with an empty alternative must stay separate per template.

    The expected grammar keeps two distinct adjective slots (``adj`` and
    ``adj2``), each of which includes the empty string as an alternative.
    """
    sentences = [
        "I saw him on the quiet hill",
        "I saw her on the tall hill",
        "I saw her on the hill",
        "He likes cute cats",
        "He likes nice cats",
        "He likes cats",
    ]
    expected_rules = {
        "origin": ["I saw <him> on the <adj> hill", "He likes <adj2> cats"],
        "him": ["her", "him"],
        "adj": ["", "quiet", "tall"],
        "adj2": ["", "nice", "cute"],
    }
    expected_grammar = ContextFreeGrammar.from_string(expected_rules)

    induced = grammar_induction.induce_grammar_using_template_trees(
        sentences,
        words_per_slot=1,
        relative_similarity_threshold=0.1,
    )
    print(induced)

    self.assertTrue(expected_grammar.is_isomorphic_with(induced))
def test_generate_same_name(self):
    """A slot used twice in one template must be expanded independently each time.

    The grammar ``I like <X> and <X>`` with three values for ``X`` is
    expected to yield all nine ordered pairs. One hundred calls to
    ``generate()`` must each produce one of those nine strings, and
    ``generate_all()`` must produce exactly that set.
    """
    repeated_slot_grammar = ContextFreeGrammar.from_string(
        {"origin": ["I like <X> and <X>"], "X": ["cats", "dogs", "pandas"]}
    )
    possibilities = {
        "I like cats and cats",
        "I like cats and dogs",
        "I like cats and pandas",
        "I like dogs and cats",
        "I like dogs and dogs",
        "I like dogs and pandas",
        "I like pandas and cats",
        "I like pandas and dogs",
        "I like pandas and pandas",
    }

    # assertIn reports the offending string on failure, unlike assertTrue(x in s).
    for _ in range(100):
        self.assertIn(
            repeated_slot_grammar.generate().to_flat_string(), possibilities
        )

    self.assertEqual(
        possibilities,
        {g.to_flat_string() for g in repeated_slot_grammar.generate_all()},
    )
def test_generate_flat(self):
    """A grammar with one alternative per slot must always generate ``hello world``."""
    single_choice_rules = {
        "origin": ["<hello> <world>"],
        "hello": ["hello"],
        "world": ["world"],
    }
    grammar = ContextFreeGrammar.from_string(single_choice_rules)

    generated_text = grammar.generate().to_flat_string()
    self.assertEqual("hello world", generated_text)
def check_grammar_expansion(
    self, grammar: ContextFreeGrammar, expected_expansion: Collection[str]
):
    """Assert that ``grammar`` generates exactly the strings in ``expected_expansion``.

    Both sides are compared as sets, so order and duplicates are ignored.
    """
    generated = grammar.generate_all_string()
    expected_set = set(expected_expansion)
    generated_set = set(generated)
    self.assertEqual(expected_set, generated_set)
def test_isomorphic_repeat(self):
    """A grammar with a duplicated origin alternative must be isomorphic with itself."""
    # "<a>" appears twice among the origin alternatives on purpose.
    rules = {"origin": ["<a>", "<a>", "<b>"], "a": ["<b>"], "b": ["world"]}
    grammar = ContextFreeGrammar.from_string(rules)

    self.assertTrue(grammar.is_isomorphic_with(grammar))