def test_3_line_learner(self):
    """Learn from three two-word lines and compare with the expected tree.

    The expected tree has a "[SLOT]" root with two children,
    "[SLOT] world" and "hello [SLOT]", each holding the two input lines
    that fit it.
    """

    def leaves(lines):
        # One childless tree per literal line.
        return [TemplateTree(Template.from_string(line)) for line in lines]

    corpus = ["hello world", "hi world", "hello universe"]
    template_tree = TemplateLatticeLearner(minimal_variables=True).learn(corpus)

    world_branch = TemplateTree(
        Template.from_string("[SLOT] world"),
        leaves(["hello world", "hi world"]),
    )
    hello_branch = TemplateTree(
        Template.from_string("hello [SLOT]"),
        leaves(["hello world", "hello universe"]),
    )
    expected = TemplateTree(
        Template.from_string("[SLOT]"),
        [world_branch, hello_branch],
    )

    # Printed so the learned tree can be inspected in the test output.
    print(template_tree_visualiser.render_tree_string(template_tree))
    self.assertEqual(expected, template_tree)
def test_extract_content_small(self):
    """Check extract_content on tiny pattern/text pairs.

    Each case is (expected slot contents, pattern string, text string).
    """
    cases = [
        ([], "", ""),
        ([""], "[SLOT]", ""),
        (["", "", ""], "[SLOT] [SLOT] [SLOT]", ""),
        (["a"], "[SLOT]", "a"),
        (["[SLOT]"], "[SLOT]", "[SLOT]"),
    ]
    for expected_contents, pattern, text in cases:
        extracted = Template.from_string(pattern).extract_content(
            Template.from_string(text)
        )
        self.assertEqual(_to_templates(expected_contents), extracted)
def calculate_merged_string(string1, string2):
    """Merge two strings as templates and return the first merge as text.

    Both strings are parsed with "[SLOT]" as the slot token, merged via
    Template.merge_templates_wagner_fischer with allow_longer_template=False,
    and the first yielded merge is flattened with single-space joins.
    """

    def join_with_spaces(tokens):
        return " ".join(tokens)

    left = Template.from_string(string1, slot_token="[SLOT]")
    right = Template.from_string(string2, slot_token="[SLOT]")
    merges = Template.merge_templates_wagner_fischer(
        left, right, allow_longer_template=False
    )
    first_merge = next(merges)
    return first_merge.to_flat_string(detokenizer=join_with_spaces)
def test_collapse_same_children(self):
    """
    Tests if collapsing a tree whose children share a template merges
    those children into a single node.
    """

    def leaf(text):
        return TemplateTree(Template.from_string(text))

    def slotted(text):
        return Template.from_string(text, slot_token="[SLOT]")

    leaf_1 = leaf("a b c c d")
    leaf_2 = leaf("c b e e d")
    leaf_3 = leaf("h h h b f d")
    leaf_4 = leaf("i i i b g d")
    leaf_5 = leaf("j k l l d")

    # Uncollapsed input: two sibling nodes carrying the same template.
    twin_a = TemplateTree(slotted("[SLOT] b [SLOT] d"), [leaf_1, leaf_2])
    twin_b = TemplateTree(slotted("[SLOT] b [SLOT] d"), [leaf_3, leaf_4])
    uncollapsed_root = TemplateTree(
        slotted("[SLOT] d"), [twin_a, twin_b, leaf_5]
    )

    # Expected output: the twins merged into one node with all four leaves.
    merged_twins = TemplateTree(
        slotted("[SLOT] b [SLOT] d"),
        [leaf_1, leaf_2, leaf_3, leaf_4],
    )
    expected_root = TemplateTree(slotted("[SLOT] d"), [merged_twins, leaf_5])

    self.assertEqual(expected_root, uncollapsed_root.collapse())
def test_learn_hello_world_tree_larger(self):
    """Learn from the generated hello-world corpus and inspect the pruned tree.

    After pruning redundant abstractions, the root's children must carry
    exactly two templates, and the leaves must reproduce the dataset.
    """
    render = template_tree_visualiser.render_tree_string

    learner = TemplateLatticeLearner(minimal_variables=True,
                                     words_per_leaf_slot=2)
    dataset = list(
        self.hello_world_and_world_adjective.generate_all_string())
    template_tree = learner.learn(dataset)
    print(render(template_tree))

    pruned_template_tree = template_tree.prune_redundant_abstractions()
    print("pruned\n", render(pruned_template_tree))

    # Only two templates directly below the root
    actual_top = {
        child.get_template()
        for child in pruned_template_tree.get_children()
    }
    expected_top = {
        Template.from_string("The [SLOT] is [SLOT]"),
        Template.from_string("[SLOT], [SLOT]!"),
    }
    self.assertEqual(expected_top, actual_top)

    # Every input line must still be present among the leaves
    leaf_strings = {
        leaf.get_template().to_flat_string()
        for leaf in pruned_template_tree.get_descendant_leaves()
    }
    self.assertEqual(set(dataset), leaf_strings)
def test_get_slot_values_same_slot_name(self):
    """A slot that occurs twice collects the values from both positions."""
    slotted = Template(
        [self.a, self.slot_x, self.a, self.slot_y, self.a, self.slot_x]
    )
    filled = Template([self.a, self.b, self.a, self.b, self.a, self.c])

    expected_values = {
        self.slot_x: {self.bt, self.ct},
        self.slot_y: {self.bt},
    }
    self.assertEqual(expected_values, slotted.get_slot_values([filled]))
def test_min_empty_sequence_disallow_empty_longer_2(self):
    """With empty strings disallowed, only the shared prefix stays literal."""
    longer = Template.from_string("x y z a a b c")
    shorter = Template.from_string("x y z b c d")
    learner = TemplateLatticeLearner(minimal_variables=True,
                                     allow_empty_string=False)

    candidate = learner._get_best_merge_candidate(longer, shorter)

    expected = Template.from_string("x y z [SLOT]")
    self.assertEqual(expected, candidate.get_merged_template())
def test_merge_relative_overlap_values_three_variables_2(self):
    """Merge three overlapping slots at decreasing similarity thresholds.

    The assertions pin three regimes: no merge above 0.6, only ``c``
    replaced by a reference to ``a`` for thresholds in (0.2, 0.6], and
    both ``b`` and ``c`` replaced for thresholds of 0.2 and below.
    """
    contents = _create_contents(10)
    slot_values = SlotValues({
        self.a: set(contents[1:5]),
        self.b: set(contents[0:2]),
        self.c: set(contents[2:6]),
    })

    # No merge by default, nor while the threshold is above 0.6
    self.assertEqual(slot_values, slot_values.merge_slots())
    for threshold in (1, 0.61):
        unchanged = slot_values.merge_slots(
            relative_similarity_threshold=threshold)
        self.assertEqual(slot_values, unchanged)

    # From 0.6 down to just above 0.2, only c is folded into a
    expected_first_merged = SlotValues({
        self.a: set(contents[1:6]),
        self.b: set(contents[0:2]),
        self.c: {Template([self.a])},
    })
    for threshold in (0.6, 0.21):
        partly_merged = slot_values.merge_slots(
            relative_similarity_threshold=threshold)
        self.assertEqual(expected_first_merged, partly_merged)

    # At 0.2 and below, b is folded into a as well
    expected_full_merged = SlotValues({
        self.a: set(contents[0:6]),
        self.b: {Template([self.a])},
        self.c: {Template([self.a])},
    })
    for threshold in (0.2, 0.1):
        fully_merged = slot_values.merge_slots(
            relative_similarity_threshold=threshold)
        self.assertEqual(expected_full_merged, fully_merged)
def _get_any_merge_candidate(self, t1: Template,
                             t2: Template) -> MergeCandidate:
    """
    Unused version of _get_best_merge_candidate, but might be preferred
    for performance gains.

    Only the first merge yielded by the Wagner-Fischer merge is scored,
    rather than every distinct merge.
    """
    longest_non_slots = max(t1.get_number_of_non_slots(),
                            t2.get_number_of_non_slots())
    fewest_slots = min(t1.get_number_of_slots(), t2.get_number_of_slots())

    merges = Template.merge_templates_wagner_fischer(
        t1, t2, minimal_variables=self._minimal_variables)
    first_merge = next(merges)

    distance = _get_distance_of_merged(first_merge, longest_non_slots,
                                       fewest_slots)
    return MergeCandidate(t1, t2, distance)
def test_2_line_learner(self):
    """Two lines sharing their last word yield a single "[SLOT] world" root."""
    dataset = ["hello world", "hi world"]
    template_tree = TemplateLatticeLearner(minimal_variables=True).learn(
        dataset)

    expected_top_template = Template.from_string("[SLOT] world")
    expected_leaves = [
        TemplateTree(Template.from_string(line)) for line in dataset
    ]
    expected = TemplateTree(expected_top_template, expected_leaves)

    print(template_tree_visualiser.render_tree_string(template_tree))
    # Check the root template first, then the whole tree.
    self.assertEqual(expected_top_template, template_tree.get_template())
    self.assertEqual(expected, template_tree)
def test_named_slot_parsing(self):
    """Angle-bracketed names in the input parse to NamedTemplateSlot elements."""
    parsed = Template.from_string("a <A> c d <B>")

    expected_elements = [
        TemplateString("a"),
        NamedTemplateSlot("A"),
        TemplateString("c"),
        TemplateString("d"),
        NamedTemplateSlot("B"),
    ]
    self.assertEqual(Template(expected_elements), parsed)
def test_min_empty_sequence_longer(self):
    """With empty strings allowed, the best merge keeps the long shared run."""
    with_suffix = Template.from_string(
        "who sang i want to be with you everywhere")
    with_infix = Template.from_string(
        "who sang i only want to be with you")
    learner = TemplateLatticeLearner(minimal_variables=True,
                                     words_per_leaf_slot=2,
                                     allow_empty_string=True)

    candidate = learner._get_best_merge_candidate(with_suffix, with_infix)

    expected = Template.from_string(
        "who sang i [SLOT] want to be with you [SLOT]")
    self.assertEqual(expected, candidate.get_merged_template())
def test_extract_content_all_ambiguous_2(self):
    """List every way "a a a a" fits "a [SLOT] a [SLOT]", then the chosen one."""
    pattern = Template.from_string("a [SLOT] a [SLOT]", slot_token="[SLOT]")
    text = Template.from_string("a a a a")

    all_options = {
        _to_templates(["", "a a"]),
        _to_templates(["a a", ""]),
        _to_templates(["a", "a"]),
    }
    self.assertEqual(all_options, pattern.extract_content_all(text))

    # The single-result variant should pick the option with the lowest
    # slot length variance (per the original note; not verifiable here).
    self.assertEqual(_to_templates(["a", "a"]),
                     pattern.extract_content(text))
def test_extract_content_all_ambiguous(self):
    """List every way "a b" fits "[SLOT] [SLOT]", then the chosen one."""
    pattern = Template.from_string("[SLOT] [SLOT]", slot_token="[SLOT]")
    text = Template.from_string("a b")

    all_options = {
        _to_templates(["a", "b"]),
        _to_templates(["a b", ""]),
        _to_templates(["", "a b"]),
    }
    self.assertEqual(all_options, pattern.extract_content_all(text))

    # The single-result variant should pick the option with the lowest
    # slot length variance (per the original note; not verifiable here).
    self.assertEqual(_to_templates(["a", "b"]),
                     pattern.extract_content(text))
def test_equals_new_leaves(self):
    """ Test if Template Trees compare equal to the fixture trees when
    rebuilt from scratch with freshly created leaves """

    def leaf(text):
        return TemplateTree(Template.from_string(text))

    def slotted(text):
        return Template.from_string(text, slot_token="[SLOT]")

    s1 = leaf("a b c d")
    s2 = leaf("a b e d")
    s3 = leaf("a b f d")
    s4 = leaf("g b h d")

    u1 = TemplateTree(slotted("a b [SLOT] d"), [s1, s2])
    u2 = TemplateTree(slotted("a b [SLOT] d"), [s3, u1])
    # Same as u2, but reusing the fixture's own s3 leaf.
    u2_selfs3 = TemplateTree(slotted("a b [SLOT] d"), [self.s3, u1])
    u3 = TemplateTree(slotted("[SLOT] b [SLOT] d"), [s4, u2])

    fixture_and_rebuilt = [
        (self.s1, s1),
        (self.s2, s2),
        (self.s3, s3),
        (self.s4, s4),
        (self.u1, u1),
        (self.u2, u2_selfs3),
        (self.u2, u2),
        (self.u3, u3),
    ]
    for fixture_tree, rebuilt_tree in fixture_and_rebuilt:
        self.assertEqual(fixture_tree, rebuilt_tree)
def test_get_best_merge_candidate_hello_world(self):
    """Templates with no shared words merge to a bare slot at distance 4."""
    hello = Template.from_string("hello world")
    solar = Template.from_string("hi solar system")
    learner = TemplateLatticeLearner(minimal_variables=True,
                                     words_per_leaf_slot=2)

    candidate = learner._get_best_merge_candidate(hello, solar)

    merged = candidate.get_merged_template(minimal_variables=True)
    self.assertEqual(Template.from_string("[SLOT]"), merged)
    self.assertEqual(4, candidate.get_distance())
def setUp(self) -> None:
    """Seed the RNG and build the shared strings, slots and templates."""
    # Fixed seed so anything random in the tests is repeatable.
    random.seed(123)

    # Literal elements "a", "b", "c"
    self.a, self.b, self.c = (TemplateString(text) for text in "abc")

    # Two unnamed slots, then three named ones
    self.slot1, self.slot2 = TemplateSlot(), TemplateSlot()
    self.slot_x, self.slot_y, self.slot_z = (
        NamedTemplateSlot(name) for name in "xyz"
    )

    # Single-element templates wrapping each literal
    self.at, self.bt, self.ct = (
        Template([element]) for element in (self.a, self.b, self.c)
    )
def _contains_slot_as_template(vals: Collection["Template"],
                               slot: TemplateSlot) -> bool:
    """
    Tells whether ``vals`` holds a template made up of only the given slot.
    """
    # Wrap the slot in a one-element template so it can be compared
    # against the members of vals.
    lone_slot_template = Template((slot, ))
    return lone_slot_template in vals
def test_merge_relative_overlap_values(self):
    """Merge a two-value slot into a ten-value slot by relative overlap.

    The assertions pin the boundary at 0.2: thresholds above it leave the
    slot values untouched, thresholds at or below it replace ``b`` by a
    reference to ``a``.
    """
    contents = _create_contents(10)
    slot_values = SlotValues({
        self.a: set(contents),
        self.b: _shuffled_subset(contents, 0, 2),
    })

    # It should not merge by default, or if the threshold is > 0.2
    self.assertEqual(slot_values, slot_values.merge_slots())
    for threshold in (1, 0.9, 0.5):
        unchanged = slot_values.merge_slots(
            relative_similarity_threshold=threshold)
        self.assertEqual(slot_values, unchanged)

    # B should merge into A if the threshold is <= 0.2
    expected_merged = SlotValues({
        self.a: set(contents),
        self.b: {Template([self.a])},
    })
    for threshold in (0.2, 0.1):
        merged = slot_values.merge_slots(
            relative_similarity_threshold=threshold)
        self.assertEqual(expected_merged, merged)
def test_reoccuring_slot(self):
    """Induce a grammar where one word-list slot occurs twice in a template.

    Two sentences of the shape "I like X and Y" should give a start symbol
    plus one further non-terminal holding all three animals.
    """
    dataset = ["I like cats and dogs", "I like dogs and chickens"]
    grammar = grammar_induction.induce_grammar_using_template_trees(
        dataset, relative_similarity_threshold=0.1, minimal_variables=True)

    non_terminals = grammar.get_slots()
    self.assertEqual(2, len(non_terminals))
    # The non-terminal that is not the start symbol
    non_start_slots = [
        slot for slot in non_terminals if slot is not grammar.get_start()
    ]
    word_list_nt = non_start_slots[0]

    # Assert only one top template
    origin_templates = grammar.get_content_for(grammar.get_start())
    self.assertEqual(1, len(origin_templates))

    # Check origin template
    origin_template: Template = origin_templates[0]
    expected_shape = Template.from_string("I like [SLOT] and [SLOT]")
    self.assertTrue(expected_shape.has_same_shape(origin_template))

    # Check top template has only one named slot
    distinct_slots = set(origin_template.get_slots())
    self.assertEqual(1, len(distinct_slots))

    # Check if slot has properly merged values
    word_list = {
        value.to_flat_string()
        for value in grammar.get_content_for(word_list_nt)
    }
    self.assertEqual({"cats", "dogs", "chickens"}, word_list)
def _recalculate_templates(
    self,
    recalculate_cache: Dict["TemplateTree", "TemplateTree"],
    minimal_variables: bool,
) -> "TemplateTree":
    """Rebuild this subtree bottom-up, re-deriving templates from children.

    :param recalculate_cache: maps already processed trees to their rebuilt
        counterparts; it is consulted first and updated before returning
    :param minimal_variables: passed through to Template.merge_all
    :return: a new TemplateTree; nodes without children keep their template
    """
    # Reuse the earlier result when this node was already handled
    if self in recalculate_cache:
        return recalculate_cache[self]

    # Children first, so their rebuilt templates are available below
    rebuilt_children = [
        child._recalculate_templates(recalculate_cache, minimal_variables)
        for child in self._children
    ]

    if rebuilt_children:
        child_templates = [c.get_template() for c in rebuilt_children]
        new_template = Template.merge_all(
            child_templates,
            minimal_variables,
            self.get_template(),
        )
    else:
        # Nothing to merge for a leaf: keep its template as is
        new_template = self._template

    rebuilt = TemplateTree(new_template, rebuilt_children)
    recalculate_cache[self] = rebuilt
    return rebuilt
def test_get_best_merge_candidate(self):
    """Check the merged template and distance for four template pairs."""
    learner = TemplateLatticeLearner(minimal_variables=True,
                                     words_per_leaf_slot=2)
    template_1 = Template.from_string("The solar system is [SLOT]")
    template_1_point = Template.from_string("The solar system is [SLOT].")
    template_2 = Template.from_string("[SLOT], solar system!")
    template_3 = Template.from_string("The earth is [SLOT]")
    template_3_point = Template.from_string("The earth is [SLOT].")

    def check_merge(left, right, expected_merge, expected_distance):
        candidate = learner._get_best_merge_candidate(left, right)
        self.assertEqual(
            Template.from_string(expected_merge),
            candidate.get_merged_template(minimal_variables=True),
        )
        self.assertEqual(expected_distance, candidate.get_distance())

    check_merge(template_1, template_2, "[SLOT] solar system [SLOT]", 3)
    check_merge(template_1, template_3, "The [SLOT] is [SLOT]", 3)

    # With punctuation version
    check_merge(template_1_point, template_2,
                "[SLOT] solar system [SLOT]", 4)
    check_merge(template_1_point, template_3_point,
                "The [SLOT] is [SLOT].", 3)
def test_slot_parsing(self):
    """Only the "[SLOT]" token of "a [SLOT] c d" is parsed as a slot."""
    template = Template.from_string("a [SLOT] c d", slot_token="[SLOT]")
    self.assertEqual(4, template.get_number_of_elements())

    # Expected is_slot() outcome per element position
    slot_expectations = (False, True, False, False)
    for position, should_be_slot in enumerate(slot_expectations):
        check = self.assertTrue if should_be_slot else self.assertFalse
        check(template._elements[position].is_slot())
def test_disallow_empty_string_hard(self):
    """Learn two sentence families with empty slot values disallowed.

    In the expected tree, each sentence without an adjective is a sibling
    of its "... [SLOT] ..." template rather than a child of it.
    """

    def leaf(sentence):
        return TemplateTree(Template.from_string(sentence))

    dataset = [
        "I saw her on the quiet hill",
        "I saw her on the tall hill",
        "I saw her on the hill",
        "He likes cute cats",
        "He likes nice cats",
        "He likes cats",
    ]
    learner = TemplateLatticeLearner(minimal_variables=True,
                                     allow_empty_string=False)
    template_tree = learner.learn(dataset)

    cats_subtree = TemplateTree(
        Template.from_string("He likes [SLOT]"),
        [
            TemplateTree(
                Template.from_string("He likes [SLOT] cats"),
                [leaf("He likes cute cats"), leaf("He likes nice cats")],
            ),
            leaf("He likes cats"),
        ],
    )
    hill_subtree = TemplateTree(
        Template.from_string("I saw her on the [SLOT]"),
        [
            TemplateTree(
                Template.from_string("I saw her on the [SLOT] hill"),
                [
                    leaf("I saw her on the tall hill"),
                    leaf("I saw her on the quiet hill"),
                ],
            ),
            leaf("I saw her on the hill"),
        ],
    )
    expected = TemplateTree(
        Template.from_string("[SLOT]"),
        [cats_subtree, hill_subtree],
    )

    print(template_tree_visualiser.render_tree_string(template_tree))
    self.assertEqual(expected, template_tree)
def test_from_string(self):
    """ContextFreeGrammar.from_string parses a dict of string productions."""
    raw_rules = {"A": ["<B>, world", "hi"], "B": ["hello"]}

    greeting_with_slot = Template(
        [
            NamedTemplateSlot("B"),
            TemplateString(","),
            TemplateString("world"),
        ]
    )
    plain_greeting = Template([TemplateString("hi")])
    expected_grammar = ContextFreeGrammar(
        {
            NamedTemplateSlot("A"): [greeting_with_slot, plain_greeting],
            NamedTemplateSlot("B"): [Template([TemplateString("hello")])],
        }
    )

    parsed_grammar = ContextFreeGrammar.from_string(raw_rules)
    self.assertEqual(expected_grammar, parsed_grammar)
def test_merge_containing_slot(self):
    """Slot ``a`` already holds a reference to ``b`` next to b's own values.

    Merging should record the replacement a -> b and reduce a's values
    to that single reference.
    """
    reference_to_b = Template([self.b])
    slot_values = SlotValues({
        self.a: {reference_to_b, self.e1, self.e2, self.e3},
        self.b: self.e123,
        self.c: self.e456,
    })

    merged = slot_values.merge_slots()

    expected_replacements = hashabledict({self.a: self.b})
    self.assertEqual(expected_replacements, merged.get_replacements())

    expected_values = SlotValues({
        self.a: {Template([self.b])},
        self.b: self.e123,
        self.c: self.e456,
    })
    self.assertEqual(expected_values, merged)
def _get_best_merge_candidate(self, t1: Template,
                              t2: Template) -> MergeCandidate:
    """
    Finds the smallest merge candidate of two templates, which may
    themselves contain slots.

    Every distinct merge yielded by the Wagner-Fischer merge is scored
    with _get_distance_of_merged and wrapped in a MergeCandidate; the
    minimum of those candidates is returned.
    """
    longest_non_slots = max(t1.get_number_of_non_slots(),
                            t2.get_number_of_non_slots())
    fewest_slots = min(t1.get_number_of_slots(), t2.get_number_of_slots())

    # A set drops duplicate merges before scoring
    distinct_merges = set(
        Template.merge_templates_wagner_fischer(
            t1, t2, minimal_variables=self._minimal_variables))

    return min(
        MergeCandidate(
            t1,
            t2,
            _get_distance_of_merged(merged, longest_non_slots,
                                    fewest_slots),
            merged=merged,
        ) for merged in distinct_merges)
def test_get_slot_content_mappings(self):
    """Check slot-to-content mappings for a leaf and three small trees."""
    # A tree without slots has no mappings
    self.assertEqual(set(), self.s1.get_slot_content_mappings())

    slot1 = NamedTemplateSlot("x")
    slot2 = NamedTemplateSlot("y")
    a = TemplateString("a")
    b = TemplateString("b")
    c = TemplateString("c")

    # Simple tree
    simple_leaf = TemplateTree(Template([a, b]), [])
    simple_tree = TemplateTree(Template([a, slot1]), [simple_leaf])
    simple_slot_contents = simple_tree.get_slot_content_mappings()
    self.assertEqual(1, len(simple_slot_contents))
    simple_slot_content = next(iter(simple_slot_contents))
    self.assertTrue(slot1 in simple_slot_content)
    self.assertTrue(slot1 in simple_slot_content.keys())
    self.assertEqual(Template([b]), simple_slot_content[slot1])
    self.assertEqual(
        {SlotAssignment({slot1: Template([b])})},
        simple_slot_contents,
    )

    # Two slot tree
    two_slot_leaf = TemplateTree(Template([a, b, c]), [])
    two_slot_tree = TemplateTree(Template([slot1, b, slot2]),
                                 [two_slot_leaf])
    expected_two_slot = {
        SlotAssignment({slot1: Template([a]), slot2: Template([c])})
    }
    self.assertEqual(expected_two_slot,
                     two_slot_tree.get_slot_content_mappings())

    # Test tree
    u1_slot = self.u1.get_template().get_slots()[0]
    expected_u1 = {
        SlotAssignment({u1_slot: Template([TemplateString("c")])}),
        SlotAssignment({u1_slot: Template([TemplateString("e")])}),
    }
    self.assertEqual(expected_u1, self.u1.get_slot_content_mappings())
def _merge_templates(t1: Template,
                     t2: Template,
                     minimal_variables: bool,
                     allow_empty_string=True) -> Template:
    """Return the first merge that the Wagner-Fischer merge yields.

    ``minimal_variables`` and ``allow_empty_string`` are passed through
    unchanged to Template.merge_templates_wagner_fischer.
    """
    merges = Template.merge_templates_wagner_fischer(
        t1,
        t2,
        minimal_variables=minimal_variables,
        allow_empty_string=allow_empty_string,
    )
    first_merge = next(merges)
    return first_merge
def test_equals(self):
    """ Tests the TemplateTree __eq__ and its negation """
    template = Template.from_string("a b [SLOT] d", slot_token="[SLOT]")
    rebuilt_u1 = TemplateTree(template, [self.s1, self.s2])

    # Equal: same template and children, and any tree with itself
    self.assertEqual(rebuilt_u1, self.u1)
    self.assertEqual(rebuilt_u1, rebuilt_u1)
    self.assertEqual(self.t3, self.t3)

    # Not equal: other fixture trees
    self.assertNotEqual(rebuilt_u1, self.u2)
    self.assertNotEqual(rebuilt_u1, self.t1)