def test_extract_content_small(self):
    """Check ``extract_content`` on empty, single-token and slot-only inputs."""
    # Each row: (expected slot contents, pattern template, template to match).
    cases = [
        ([], "", ""),
        ([""], "[SLOT]", ""),
        (["", "", ""], "[SLOT] [SLOT] [SLOT]", ""),
        (["a"], "[SLOT]", "a"),
        (["[SLOT]"], "[SLOT]", "[SLOT]"),
    ]
    for expected_contents, pattern, matched in cases:
        self.assertEqual(
            _to_templates(expected_contents),
            Template.from_string(pattern).extract_content(
                Template.from_string(matched)),
        )
def calculate_merged_string(string1, string2):
    """Merge two strings as templates and return the first result as text.

    Both strings are parsed with ``[SLOT]`` as the slot token, merged with
    ``Template.merge_templates_wagner_fischer`` (``allow_longer_template``
    disabled), and the first merged template produced is flattened with
    its tokens joined by single spaces.
    """
    first = Template.from_string(string1, slot_token="[SLOT]")
    second = Template.from_string(string2, slot_token="[SLOT]")
    merge_results = Template.merge_templates_wagner_fischer(
        first, second, allow_longer_template=False
    )
    first_merge = next(merge_results)
    return first_merge.to_flat_string(detokenizer=" ".join)
def test_3_line_learner(self):
    """Learn a tree from three two-word lines and compare it to the expected tree."""

    def leaves(strings):
        # One leaf tree per literal string.
        return [TemplateTree(Template.from_string(s)) for s in strings]

    learner = TemplateLatticeLearner(minimal_variables=True)
    dataset = ["hello world", "hi world", "hello universe"]
    template_tree = learner.learn(dataset)

    root_template = Template.from_string("[SLOT]")
    world_branch = TemplateTree(
        Template.from_string("[SLOT] world"),
        leaves(["hello world", "hi world"]),
    )
    hello_branch = TemplateTree(
        Template.from_string("hello [SLOT]"),
        leaves(["hello world", "hello universe"]),
    )
    expected = TemplateTree(root_template, [world_branch, hello_branch])

    print(template_tree_visualiser.render_tree_string(template_tree))
    self.assertEqual(expected, template_tree)
def test_learn_hello_world_tree_larger(self):
    """Learn from the generated dataset, prune, and check top templates and leaves."""
    learner = TemplateLatticeLearner(minimal_variables=True,
                                     words_per_leaf_slot=2)
    generated = self.hello_world_and_world_adjective.generate_all_string()
    dataset = list(generated)
    template_tree = learner.learn(dataset)
    print(template_tree_visualiser.render_tree_string(template_tree))

    pruned_template_tree = template_tree.prune_redundant_abstractions()
    rendered_pruned = template_tree_visualiser.render_tree_string(
        pruned_template_tree)
    print("pruned\n", rendered_pruned)

    # Only two templates in the top
    top_templates = set()
    for child in pruned_template_tree.get_children():
        top_templates.add(child.get_template())
    expected_top_templates = {
        Template.from_string("The [SLOT] is [SLOT]"),
        Template.from_string("[SLOT], [SLOT]!"),
    }
    self.assertEqual(expected_top_templates, top_templates)

    # Every dataset line must show up as a leaf of the pruned tree, and vice versa.
    expected_lines = set(dataset)
    leaf_lines = {
        leaf.get_template().to_flat_string()
        for leaf in pruned_template_tree.get_descendant_leaves()
    }
    self.assertEqual(expected_lines, leaf_lines)
def test_collapse_same_children(self):
    """Collapsing a tree whose children share a template should merge them."""

    def leaf(text):
        return TemplateTree(Template.from_string(text))

    def node(text, children):
        return TemplateTree(
            Template.from_string(text, slot_token="[SLOT]"), children)

    ss1 = leaf("a b c c d")
    ss2 = leaf("c b e e d")
    ss3 = leaf("h h h b f d")
    ss4 = leaf("i i i b g d")
    ss5 = leaf("j k l l d")

    # Uncollapsed tree: two sibling nodes carry the same template.
    us1 = node("[SLOT] b [SLOT] d", [ss1, ss2])
    us2 = node("[SLOT] b [SLOT] d", [ss3, ss4])
    uncollapsed_root = node("[SLOT] d", [us1, us2, ss5])

    # Target tree: the two siblings folded into one node holding all four leaves.
    ts1 = node("[SLOT] b [SLOT] d", [ss1, ss2, ss3, ss4])
    collapsed_expected = node("[SLOT] d", [ts1, ss5])

    collapsed_u = uncollapsed_root.collapse()
    self.assertEqual(collapsed_expected, collapsed_u)
def test_min_empty_sequence_disallow_empty_longer_2(self):
    """With empty slots disallowed, the shared prefix plus one slot is expected."""
    learner = TemplateLatticeLearner(minimal_variables=True,
                                     allow_empty_string=False)
    longer = Template.from_string("x y z a a b c")
    shorter = Template.from_string("x y z b c d")

    candidate = learner._get_best_merge_candidate(longer, shorter)

    expected_merge = Template.from_string("x y z [SLOT]")
    self.assertEqual(expected_merge, candidate.get_merged_template())
def test_2_line_learner(self):
    """Learn from two lines sharing a last word and check the resulting tree."""
    learner = TemplateLatticeLearner(minimal_variables=True)
    dataset = ["hello world", "hi world"]
    template_tree = learner.learn(dataset)

    expected_top_template = Template.from_string("[SLOT] world")
    expected_leaves = []
    for line in dataset:
        expected_leaves.append(TemplateTree(Template.from_string(line)))
    expected = TemplateTree(expected_top_template, expected_leaves)

    print(template_tree_visualiser.render_tree_string(template_tree))
    # Check the root template on its own first, then the whole tree.
    self.assertEqual(expected_top_template, template_tree.get_template())
    self.assertEqual(expected, template_tree)
def test_extract_content_all_ambiguous_2(self):
    """Check all extractions, and the preferred one, for 'a [SLOT] a [SLOT]'."""
    pattern = Template.from_string("a [SLOT] a [SLOT]", slot_token="[SLOT]")
    target = Template.from_string("a a a a")

    every_assignment = {
        _to_templates(["", "a a"]),
        _to_templates(["a a", ""]),
        _to_templates(["a", "a"]),
    }
    self.assertEqual(every_assignment, pattern.extract_content_all(target))

    # With lowest slot length variance should be picked:
    preferred = _to_templates(["a", "a"])
    self.assertEqual(preferred, pattern.extract_content(target))
def test_extract_content_all_ambiguous(self):
    """Check all extractions, and the preferred one, for '[SLOT] [SLOT]'."""
    pattern = Template.from_string("[SLOT] [SLOT]", slot_token="[SLOT]")
    target = Template.from_string("a b")

    every_assignment = {
        _to_templates(["a", "b"]),
        _to_templates(["a b", ""]),
        _to_templates(["", "a b"]),
    }
    self.assertEqual(every_assignment, pattern.extract_content_all(target))

    # With lowest slot length variance should be picked:
    preferred = _to_templates(["a", "b"])
    self.assertEqual(preferred, pattern.extract_content(target))
def test_min_empty_sequence_longer(self):
    """With empty slots allowed, the merge should keep two separate slots."""
    learner = TemplateLatticeLearner(
        minimal_variables=True,
        words_per_leaf_slot=2,
        allow_empty_string=True,
    )
    first = Template.from_string("who sang i want to be with you everywhere")
    second = Template.from_string("who sang i only want to be with you")

    candidate = learner._get_best_merge_candidate(first, second)

    expected_merge = Template.from_string(
        "who sang i [SLOT] want to be with you [SLOT]")
    self.assertEqual(expected_merge, candidate.get_merged_template())
def test_equals_new_leaves(self):
    """Trees rebuilt from scratch should equal the fixture trees on ``self``."""

    def leaf(text):
        return TemplateTree(Template.from_string(text))

    def node(text, children):
        return TemplateTree(
            Template.from_string(text, slot_token="[SLOT]"), children)

    s1 = leaf("a b c d")
    s2 = leaf("a b e d")
    s3 = leaf("a b f d")
    s4 = leaf("g b h d")
    u1 = node("a b [SLOT] d", [s1, s2])
    u2 = node("a b [SLOT] d", [s3, u1])
    # Same shape as u2, but reusing the fixture's own s3 leaf.
    u2_selfs3 = node("a b [SLOT] d", [self.s3, u1])
    u3 = node("[SLOT] b [SLOT] d", [s4, u2])

    fixture_vs_rebuilt = [
        (self.s1, s1),
        (self.s2, s2),
        (self.s3, s3),
        (self.s4, s4),
        (self.u1, u1),
        (self.u2, u2_selfs3),
        (self.u2, u2),
        (self.u3, u3),
    ]
    for fixture_tree, rebuilt_tree in fixture_vs_rebuilt:
        self.assertEqual(fixture_tree, rebuilt_tree)
def test_get_best_merge_candidate_hello_world(self):
    """Two lines with no shared words should merge into a single slot."""
    learner = TemplateLatticeLearner(minimal_variables=True,
                                     words_per_leaf_slot=2)
    short_line = Template.from_string("hello world")
    long_line = Template.from_string("hi solar system")

    candidate = learner._get_best_merge_candidate(short_line, long_line)

    expected_merge = Template.from_string("[SLOT]")
    self.assertEqual(
        expected_merge,
        candidate.get_merged_template(minimal_variables=True),
    )
    self.assertEqual(4, candidate.get_distance())
def test_reoccuring_slot(self):
    """Induce a grammar from two lines and check the one shared, recurring slot."""
    dataset = ["I like cats and dogs", "I like dogs and chickens"]
    grammar = grammar_induction.induce_grammar_using_template_trees(
        dataset,
        relative_similarity_threshold=0.1,
        minimal_variables=True,
    )

    non_terminals = grammar.get_slots()
    self.assertEqual(2, len(non_terminals))
    non_start_slots = [
        slot for slot in non_terminals if slot is not grammar.get_start()
    ]
    word_list_nt = non_start_slots[0]

    # Assert only one top template
    origin_templates = grammar.get_content_for(grammar.get_start())
    self.assertEqual(1, len(origin_templates))

    # Check origin template
    origin_template: Template = origin_templates[0]
    expected_shape = Template.from_string("I like [SLOT] and [SLOT]")
    self.assertTrue(expected_shape.has_same_shape(origin_template))

    # Check top template has only one named slot
    distinct_slots = set(origin_template.get_slots())
    self.assertEqual(1, len(distinct_slots))

    # Check if slot has properly merged values
    slot_values = set()
    for content in grammar.get_content_for(word_list_nt):
        slot_values.add(content.to_flat_string())
    self.assertEqual({"cats", "dogs", "chickens"}, slot_values)
def test_get_best_merge_candidate(self):
    """Check merged template and distance for several template pairs."""
    learner = TemplateLatticeLearner(minimal_variables=True,
                                     words_per_leaf_slot=2)
    template_1 = Template.from_string("The solar system is [SLOT]")
    template_1_point = Template.from_string("The solar system is [SLOT].")
    template_2 = Template.from_string("[SLOT], solar system!")
    template_3 = Template.from_string("The earth is [SLOT]")
    template_3_point = Template.from_string("The earth is [SLOT].")

    def check_merge(left, right, expected_string, expected_distance):
        # Merge one pair and verify both the merged template and its distance.
        candidate = learner._get_best_merge_candidate(left, right)
        self.assertEqual(
            Template.from_string(expected_string),
            candidate.get_merged_template(minimal_variables=True),
        )
        self.assertEqual(expected_distance, candidate.get_distance())

    check_merge(template_1, template_2, "[SLOT] solar system [SLOT]", 3)
    check_merge(template_1, template_3, "The [SLOT] is [SLOT]", 3)

    # With punctuation version
    check_merge(template_1_point, template_2,
                "[SLOT] solar system [SLOT]", 4)
    check_merge(template_1_point, template_3_point,
                "The [SLOT] is [SLOT].", 3)
def test_slot_parsing(self):
    """Parsing 'a [SLOT] c d' should give four elements with one slot at index 1."""
    original_string = "a [SLOT] c d"
    template = Template.from_string(original_string, slot_token="[SLOT]")
    self.assertEqual(4, template.get_number_of_elements())

    # Expected is_slot() truthiness per element position.
    expected_slot_flags = (False, True, False, False)
    for position, should_be_slot in enumerate(expected_slot_flags):
        element_is_slot = template._elements[position].is_slot()
        if should_be_slot:
            self.assertTrue(element_is_slot)
        else:
            self.assertFalse(element_is_slot)
def test_disallow_empty_string_hard(self):
    """Learn two sentence families with empty slots disallowed and check the tree."""

    def leaf(text):
        return TemplateTree(Template.from_string(text))

    def node(text, children):
        return TemplateTree(Template.from_string(text), children)

    dataset = [
        "I saw her on the quiet hill",
        "I saw her on the tall hill",
        "I saw her on the hill",
        "He likes cute cats",
        "He likes nice cats",
        "He likes cats",
    ]
    learner = TemplateLatticeLearner(minimal_variables=True,
                                     allow_empty_string=False)
    template_tree = learner.learn(dataset)

    root_template = Template.from_string("[SLOT]")
    # The slot-free sentence sits beside, not under, the "<adjective> noun" node.
    cats_branch = node("He likes [SLOT]", [
        node("He likes [SLOT] cats", [
            leaf("He likes cute cats"),
            leaf("He likes nice cats"),
        ]),
        leaf("He likes cats"),
    ])
    hill_branch = node("I saw her on the [SLOT]", [
        node("I saw her on the [SLOT] hill", [
            leaf("I saw her on the tall hill"),
            leaf("I saw her on the quiet hill"),
        ]),
        leaf("I saw her on the hill"),
    ])
    expected = TemplateTree(root_template, [cats_branch, hill_branch])

    print(template_tree_visualiser.render_tree_string(template_tree))
    self.assertEqual(expected, template_tree)
def test_equals(self):
    """Exercise ``TemplateTree.__eq__`` against the fixture trees."""
    rebuilt_u1 = TemplateTree(
        Template.from_string("a b [SLOT] d", slot_token="[SLOT]"),
        [self.s1, self.s2],
    )

    # Equal: same template and children, and reflexive comparisons.
    self.assertEqual(rebuilt_u1, self.u1)
    self.assertEqual(rebuilt_u1, rebuilt_u1)
    self.assertEqual(self.t3, self.t3)

    # Not equal to other fixture trees.
    self.assertNotEqual(rebuilt_u1, self.u2)
    self.assertNotEqual(rebuilt_u1, self.t1)
def test_disallow_empty_string_simple_2(self):
    """Learn one sentence family plus an unrelated line, empty slots disallowed."""

    def leaf(text):
        return TemplateTree(Template.from_string(text))

    def node(text, children):
        return TemplateTree(Template.from_string(text), children)

    dataset = [
        "He likes cute cats",
        "He likes nice cats",
        "He likes cats",
        "This is another sentence",
    ]
    learner = TemplateLatticeLearner(minimal_variables=True,
                                     allow_empty_string=False)
    template_tree = learner.learn(dataset)

    root_template = Template.from_string("[SLOT]")
    cats_branch = node("He likes [SLOT]", [
        node("He likes [SLOT] cats", [
            leaf("He likes cute cats"),
            leaf("He likes nice cats"),
        ]),
        leaf("He likes cats"),
    ])
    # The unrelated line hangs directly under the root.
    unrelated_leaf = leaf("This is another sentence")
    expected = TemplateTree(root_template, [cats_branch, unrelated_leaf])

    print(template_tree_visualiser.render_tree_string(template_tree))
    self.assertEqual(expected, template_tree)
def test_disallow_empty_string_simple(self):
    """Check that learning with empty strings disallowed gives the expected tree."""
    learner = TemplateLatticeLearner(minimal_variables=True,
                                     allow_empty_string=False)
    dataset = ["I am a human", "I am a nice human", "I am a bad human"]
    template_tree = learner.learn(dataset)

    root_template = Template.from_string("I am a [SLOT]")
    adjective_leaves = []
    for line in ["I am a nice human", "I am a bad human"]:
        adjective_leaves.append(TemplateTree(Template.from_string(line)))
    adjective_branch = TemplateTree(
        Template.from_string("I am a [SLOT] human"), adjective_leaves)
    # The adjective-free sentence is a direct child of the root.
    plain_leaf = TemplateTree(Template.from_string("I am a human"))
    expected = TemplateTree(root_template, [adjective_branch, plain_leaf])

    print(template_tree_visualiser.render_tree_string(template_tree))
    self.assertEqual(expected, template_tree)
def test_4_line_learner_longer_second(self):
    """Learn from four lines mixing two greetings and two objects; check the tree."""

    def branch(template_string, leaf_strings):
        # A one-level tree: a slotted template over literal leaves.
        return TemplateTree(
            Template.from_string(template_string),
            [TemplateTree(Template.from_string(s)) for s in leaf_strings],
        )

    learner = TemplateLatticeLearner(minimal_variables=True,
                                     words_per_leaf_slot=2)
    dataset = [
        "hello world",
        "hi world",
        "hello solar system",
        "hi solar system",
    ]
    template_tree = learner.learn(dataset)

    root_template = Template.from_string("[SLOT]")
    expected_children = [
        branch("[SLOT] world", ["hello world", "hi world"]),
        branch("[SLOT] solar system",
               ["hello solar system", "hi solar system"]),
        branch("hello [SLOT]", ["hello world", "hello solar system"]),
        branch("hi [SLOT]", ["hi world", "hi solar system"]),
    ]
    expected = TemplateTree(root_template, expected_children)

    print(template_tree_visualiser.render_tree_string(template_tree))
    self.assertEqual(expected, template_tree)
def test_named_slot_parsing(self):
    """Parsing '<A>'/'<B>' tokens should produce named slots in the template."""
    original_string = "a <A> c d <B>"
    template = Template.from_string(original_string)

    expected_elements = [
        TemplateString("a"),
        NamedTemplateSlot("A"),
        TemplateString("c"),
        TemplateString("d"),
        NamedTemplateSlot("B"),
    ]
    self.assertEqual(Template(expected_elements), template)
def test_fill(self):
    """Fill one or two slots via ``SlotAssignment`` and via plain strings."""
    a, b, c = self.a, self.b, self.c
    slot1, slot2 = self.slot1, self.slot2

    def assert_fills_to(expected_string, elements, mapping):
        # Build the template, fill it from the mapping, compare to the parsed text.
        self.assertEqual(
            Template.from_string(expected_string),
            Template(elements).fill(SlotAssignment(mapping)),
        )

    # Single slot in different positions.
    assert_fills_to("b", [slot1], {slot1: Template([b])})
    assert_fills_to("a b", [a, slot1], {slot1: Template([b])})
    assert_fills_to("a b c", [a, slot1, c], {slot1: Template([b])})

    # Two slots, separated and adjacent.
    assert_fills_to(
        "a b c a",
        [a, slot1, c, slot2],
        {slot1: Template([b]), slot2: Template([a])},
    )
    assert_fills_to(
        "a b a c",
        [a, slot1, slot2, c],
        {slot1: Template([b]), slot2: Template([a])},
    )
    # Same assignment with the mapping entries given in the other order.
    assert_fills_to(
        "a b a c",
        [a, slot1, slot2, c],
        {slot2: Template([a]), slot1: Template([b])},
    )

    # Filling positionally from raw strings.
    self.assertEqual(
        Template.from_string("a b a c"),
        Template([a, slot1, slot2, c]).fill_with_strings(["b", "a"]),
    )
def test_covers_string(self):
    """Check which plain strings the template 'a b [SLOT] d' covers."""
    slotted_template_string = "a b [SLOT] d"
    template = Template.from_string(slotted_template_string,
                                    slot_token="[SLOT]")

    # True covers
    covered = (
        "a b c d",
        "a b c e d",
        "a b c e f d",
        "a b c d d",
        "a b d",
        "a b d d d d",
    )
    for text in covered:
        self.assertTrue(template.covers_string(text))

    # Not covers
    not_covered = (
        "a a d d",
        "a b d c",
        "a b c",
        "d",
    )
    for text in not_covered:
        self.assertFalse(template.covers_string(text))
def test_extract_content_one(self):
    """Extract a single slot's content for trailing, leading and middle slots."""
    trailing_slot = Template.from_string("a [SLOT]", slot_token="[SLOT]")
    filled_trailing = Template.from_string("a 1")
    self.assertEqual(_to_templates(["1"]),
                     trailing_slot.extract_content(filled_trailing))

    leading_slot = Template.from_string("[SLOT] c", slot_token="[SLOT]")
    filled_leading = Template.from_string("2 c")
    self.assertEqual(_to_templates(["2"]),
                     leading_slot.extract_content(filled_leading))

    middle_slot = Template.from_string("a [SLOT] c", slot_token="[SLOT]")
    filled_middle = Template.from_string("a 3 c")
    self.assertEqual(_to_templates(["3"]),
                     middle_slot.extract_content(filled_middle))

    # The one-sided patterns applied to the three-token line capture two tokens.
    self.assertEqual(_to_templates(["3 c"]),
                     trailing_slot.extract_content(filled_middle))
    self.assertEqual(_to_templates(["a 3"]),
                     leading_slot.extract_content(filled_middle))
def test_get_slot_content(self):
    """Check ``get_slot_contents_tuples`` for the u1, u2 and t1 fixture trees."""

    def singles(*strings):
        # Set of 1-tuples, each holding the template parsed from one string.
        return {(Template.from_string(s),) for s in strings}

    self.assertEqual(singles("c", "e"), self.u1.get_slot_contents_tuples())
    self.assertEqual(singles("[SLOT]", "f"),
                     self.u2.get_slot_contents_tuples())
    self.assertEqual(singles("c", "e", "f"),
                     self.t1.get_slot_contents_tuples())
def test_covers_slotted(self):
    """Check which slotted templates the template 'a b [SLOT] d' covers."""
    slotted_template_string = "a b [SLOT] d"
    template = Template.from_string(slotted_template_string,
                                    slot_token="[SLOT]")

    def covers_slotted(slotted_string):
        other = Template.from_string(slotted_string, slot_token="[SLOT]")
        return template.covers(other)

    # True covers
    for candidate in ("a b [SLOT] d", "a b [SLOT] [SLOT] d"):
        self.assertTrue(covers_slotted(candidate))

    # Not covers
    rejected = (
        "[SLOT]",
        "a [SLOT] [SLOT] d",
        "[SLOT] [SLOT] [SLOT] [SLOT]",
        "a b c d [SLOT]",
        "a b d [SLOT]",
        "[SLOT] a b c d",
        "[SLOT] a b d",
        "a b [SLOT] d [SLOT]",
    )
    for candidate in rejected:
        self.assertFalse(covers_slotted(candidate))
def test_get_descendent_leaves_slot_content(self):
    """Check ``get_descendent_leaves_slot_content_tuples`` for u1, t1 and u2."""

    def singles(*strings):
        # Set of 1-tuples, each holding the template parsed from one string.
        return {(Template.from_string(s),) for s in strings}

    # Same tests as before without recursion
    self.assertEqual(
        singles("c", "e"),
        self.u1.get_descendent_leaves_slot_content_tuples(),
    )
    self.assertEqual(
        singles("c", "e", "f"),
        self.t1.get_descendent_leaves_slot_content_tuples(),
    )

    # New tests
    self.assertEqual(
        singles("c", "e", "f"),
        self.u2.get_descendent_leaves_slot_content_tuples(),
    )
def _to_templates(strings: List[str]):
    """Parse each string with ``Template.from_string``; return a tuple in input order."""
    return tuple(Template.from_string(text) for text in strings)
def _to_templates(lines: Collection[str]) -> List[Template]:
    """Strip each line and parse it into a Template using ``word_tokenize``.

    Returns the templates as a list, in the order the lines are iterated.
    """
    return [
        Template.from_string(raw_line.strip(), tokenizer=word_tokenize)
        for raw_line in lines
    ]
def test_extract_content_two(self):
    """Extract the contents of two slots separated by fixed tokens."""
    pattern = Template.from_string("a [SLOT] c [SLOT] e", slot_token="[SLOT]")

    one_token_each = Template.from_string("a b c d e")
    self.assertEqual(_to_templates(["b", "d"]),
                     pattern.extract_content(one_token_each))

    # First slot now spans two tokens.
    two_tokens_first = Template.from_string("a b b c d e")
    self.assertEqual(_to_templates(["b b", "d"]),
                     pattern.extract_content(two_tokens_first))