예제 #1
0
    def test_3_line_learner(self):
        """Three overlapping lines are learned as two slotted branches under a single-slot root."""
        lines = ["hello world", "hi world", "hello universe"]
        learned_tree = TemplateLatticeLearner(minimal_variables=True).learn(lines)

        def leaves(strings):
            # One childless tree per literal sentence.
            return [TemplateTree(Template.from_string(s)) for s in strings]

        world_branch = TemplateTree(
            Template.from_string("[SLOT] world"),
            leaves(["hello world", "hi world"]),
        )
        hello_branch = TemplateTree(
            Template.from_string("hello [SLOT]"),
            leaves(["hello world", "hello universe"]),
        )
        expected = TemplateTree(
            Template.from_string("[SLOT]"), [world_branch, hello_branch]
        )

        print(template_tree_visualiser.render_tree_string(learned_tree))
        self.assertEqual(expected, learned_tree)
예제 #2
0
 def test_extract_content_small(self):
     """Check extract_content on empty and single-token templates."""
     # (expected slot contents, slotted pattern, text the pattern is matched against)
     cases = [
         ([], "", ""),
         ([""], "[SLOT]", ""),
         (["", "", ""], "[SLOT] [SLOT] [SLOT]", ""),
         (["a"], "[SLOT]", "a"),
         (["[SLOT]"], "[SLOT]", "[SLOT]"),
     ]
     for expected_contents, pattern, text in cases:
         expected = _to_templates(expected_contents)
         extracted = Template.from_string(pattern).extract_content(
             Template.from_string(text))
         self.assertEqual(expected, extracted)
예제 #3
0
def calculate_merged_string(string1, string2):
    """Merge two strings as templates and return the first merge as a space-joined string."""
    first = Template.from_string(string1, slot_token="[SLOT]")
    second = Template.from_string(string2, slot_token="[SLOT]")
    candidates = Template.merge_templates_wagner_fischer(
        first, second, allow_longer_template=False
    )
    # Only the first merge produced by the generator is used.
    first_merge = next(candidates)
    return first_merge.to_flat_string(detokenizer=lambda x: " ".join(x))
예제 #4
0
    def test_collapse_same_children(self):
        """ Tests if collapsing a tree with children with similar templates will merge correctly """

        def leaf(text):
            return TemplateTree(Template.from_string(text))

        def node(pattern, children):
            return TemplateTree(
                Template.from_string(pattern, slot_token="[SLOT]"), children
            )

        leaf_1 = leaf("a b c c d")
        leaf_2 = leaf("c b e e d")
        leaf_3 = leaf("h h h b f d")
        leaf_4 = leaf("i i i b g d")
        leaf_5 = leaf("j k l l d")

        # Uncollapsed: two sibling nodes carry the same "[SLOT] b [SLOT] d" template.
        duplicate_a = node("[SLOT] b [SLOT] d", [leaf_1, leaf_2])
        duplicate_b = node("[SLOT] b [SLOT] d", [leaf_3, leaf_4])
        uncollapsed_root = node("[SLOT] d", [duplicate_a, duplicate_b, leaf_5])

        # Expected: the duplicates become one node holding all four leaves.
        combined = node("[SLOT] b [SLOT] d", [leaf_1, leaf_2, leaf_3, leaf_4])
        expected_root = node("[SLOT] d", [combined, leaf_5])

        self.assertEqual(expected_root, uncollapsed_root.collapse())
예제 #5
0
    def test_learn_hello_world_tree_larger(self):
        """Learn the larger hello-world dataset and check the pruned tree's top level and leaves."""
        learner = TemplateLatticeLearner(minimal_variables=True,
                                         words_per_leaf_slot=2)
        dataset = list(
            self.hello_world_and_world_adjective.generate_all_string())
        learned_tree = learner.learn(dataset)
        print(template_tree_visualiser.render_tree_string(learned_tree))

        pruned_tree = learned_tree.prune_redundant_abstractions()
        rendered_pruned = template_tree_visualiser.render_tree_string(pruned_tree)
        print("pruned\n", rendered_pruned)

        # The pruned root must have exactly these two templates as children.
        actual_top = {
            child.get_template() for child in pruned_tree.get_children()
        }
        expected_top = {
            Template.from_string("The [SLOT] is [SLOT]"),
            Template.from_string("[SLOT], [SLOT]!"),
        }
        self.assertEqual(expected_top, actual_top)

        # The leaves, flattened to strings, must match the dataset exactly.
        expected_lines = set(dataset)
        leaf_lines = {
            leaf.get_template().to_flat_string()
            for leaf in pruned_tree.get_descendant_leaves()
        }
        self.assertEqual(expected_lines, leaf_lines)
예제 #6
0
    def test_get_slot_values_same_slot_name(self):
        """A slot that occurs twice in a template collects the values from both positions."""
        slotted = Template(
            [self.a, self.slot_x, self.a, self.slot_y, self.a, self.slot_x]
        )
        concrete = Template([self.a, self.b, self.a, self.b, self.a, self.c])

        # slot_x lines up with "b" and "c"; slot_y lines up with "b" only.
        expected_values = {
            self.slot_x: {self.bt, self.ct},
            self.slot_y: {self.bt},
        }
        self.assertEqual(expected_values, slotted.get_slot_values([concrete]))
예제 #7
0
    def test_min_empty_sequence_disallow_empty_longer_2(self):
        """With empty slots disallowed, the differing tails are merged into one trailing slot."""
        longer = Template.from_string("x y z a a b c")
        shorter = Template.from_string("x y z b c d")
        learner = TemplateLatticeLearner(minimal_variables=True,
                                         allow_empty_string=False)

        candidate = learner._get_best_merge_candidate(longer, shorter)

        expected = Template.from_string("x y z [SLOT]")
        self.assertEqual(expected, candidate.get_merged_template())
예제 #8
0
    def test_merge_relative_overlap_values_three_variables_2(self):
        """Three overlapping slots merge step by step as the relative similarity threshold drops."""
        contents = _create_contents(10)
        slot_values = SlotValues({
            self.a: set(contents[1:5]),
            self.b: set(contents[0:2]),
            self.c: set(contents[2:6]),
        })

        # Nothing merges with the default threshold, nor with thresholds above 0.6.
        self.assertEqual(slot_values, slot_values.merge_slots())
        for threshold in (1, 0.61):
            unmerged = slot_values.merge_slots(
                relative_similarity_threshold=threshold)
            self.assertEqual(slot_values, unmerged)

        # From 0.6 down to 0.21 only C is folded into A; B stays untouched.
        expected_first_merged = SlotValues({
            self.a: set(contents[1:6]),
            self.b: set(contents[0:2]),
            self.c: {Template([self.a])},
        })
        for threshold in (0.6, 0.21):
            partly_merged = slot_values.merge_slots(
                relative_similarity_threshold=threshold)
            self.assertEqual(expected_first_merged, partly_merged)

        # At 0.2 and below, B is folded into A as well.
        expected_full_merged = SlotValues({
            self.a: set(contents[0:6]),
            self.b: {Template([self.a])},
            self.c: {Template([self.a])},
        })
        for threshold in (0.2, 0.1):
            fully_merged = slot_values.merge_slots(
                relative_similarity_threshold=threshold)
            self.assertEqual(expected_full_merged, fully_merged)
예제 #9
0
 def _get_any_merge_candidate(self, t1: Template,
                              t2: Template) -> MergeCandidate:
     """ Unused version of _get_best_merge_candidate, but might be preferred for performance gains """
     longest_non_slot_count = max(t1.get_number_of_non_slots(),
                                  t2.get_number_of_non_slots())
     fewest_slots = min(t1.get_number_of_slots(), t2.get_number_of_slots())

     # Only the first merge yielded by the generator is considered.
     merges = Template.merge_templates_wagner_fischer(
         t1, t2, minimal_variables=self._minimal_variables)
     first_merge = next(merges)

     distance = _get_distance_of_merged(first_merge, longest_non_slot_count,
                                        fewest_slots)
     # NOTE(review): unlike _get_best_merge_candidate, the merged template is
     # not passed to MergeCandidate here - confirm that this is intentional.
     return MergeCandidate(t1, t2, distance)
예제 #10
0
    def test_2_line_learner(self):
        """Two lines sharing their last word are learned as one slotted template with two leaves."""
        dataset = ["hello world", "hi world"]
        learned_tree = TemplateLatticeLearner(minimal_variables=True).learn(dataset)

        expected_top_template = Template.from_string("[SLOT] world")
        expected_leaves = [
            TemplateTree(Template.from_string(line)) for line in dataset
        ]
        expected = TemplateTree(expected_top_template, expected_leaves)

        print(template_tree_visualiser.render_tree_string(learned_tree))
        # Check the root template first, then the whole tree.
        self.assertEqual(expected_top_template, learned_tree.get_template())
        self.assertEqual(expected, learned_tree)
예제 #11
0
    def test_named_slot_parsing(self):
        """Angle-bracketed tokens in a string are parsed as named slots."""
        parsed = Template.from_string("a <A> c d <B>")

        expected_elements = [
            TemplateString("a"),
            NamedTemplateSlot("A"),
            TemplateString("c"),
            TemplateString("d"),
            NamedTemplateSlot("B"),
        ]
        self.assertEqual(Template(expected_elements), parsed)
예제 #12
0
    def test_min_empty_sequence_longer(self):
        """With empty slots allowed, an inserted word and a dropped tail each become a slot."""
        learner = TemplateLatticeLearner(minimal_variables=True,
                                         words_per_leaf_slot=2,
                                         allow_empty_string=True)
        with_tail = Template.from_string(
            "who sang i want to be with you everywhere")
        with_insert = Template.from_string(
            "who sang i only want to be with you")

        candidate = learner._get_best_merge_candidate(with_tail, with_insert)

        expected = Template.from_string(
            "who sang i [SLOT] want to be with you [SLOT]")
        self.assertEqual(expected, candidate.get_merged_template())
예제 #13
0
    def test_extract_content_all_ambiguous_2(self):
        """extract_content_all lists every split of "a a a a" over "a [SLOT] a [SLOT]"."""
        pattern = Template.from_string("a [SLOT] a [SLOT]", slot_token="[SLOT]")
        text = Template.from_string("a a a a")

        all_assignments = {
            _to_templates(["", "a a"]),
            _to_templates(["a a", ""]),
            _to_templates(["a", "a"]),
        }
        self.assertEqual(all_assignments, pattern.extract_content_all(text))

        # extract_content should pick the assignment with the lowest slot length variance.
        balanced = _to_templates(["a", "a"])
        self.assertEqual(balanced, pattern.extract_content(text))
예제 #14
0
    def test_extract_content_all_ambiguous(self):
        """extract_content_all lists every split of "a b" over two adjacent slots."""
        pattern = Template.from_string("[SLOT] [SLOT]", slot_token="[SLOT]")
        text = Template.from_string("a b")

        all_assignments = {
            _to_templates(["a", "b"]),
            _to_templates(["a b", ""]),
            _to_templates(["", "a b"]),
        }
        self.assertEqual(all_assignments, pattern.extract_content_all(text))

        # extract_content should pick the assignment with the lowest slot length variance.
        balanced = _to_templates(["a", "b"])
        self.assertEqual(balanced, pattern.extract_content(text))
예제 #15
0
    def test_equals_new_leaves(self):
        """ Test if Template Trees are equal if different leaves are used by constructing new trees from scratch"""

        def leaf(text):
            return TemplateTree(Template.from_string(text))

        def node(pattern, children):
            return TemplateTree(
                Template.from_string(pattern, slot_token="[SLOT]"), children
            )

        s1 = leaf("a b c d")
        s2 = leaf("a b e d")
        s3 = leaf("a b f d")
        s4 = leaf("g b h d")
        u1 = node("a b [SLOT] d", [s1, s2])
        u2 = node("a b [SLOT] d", [s3, u1])
        # Same shape as u2, but reusing the fixture's own s3 leaf object.
        u2_selfs3 = node("a b [SLOT] d", [self.s3, u1])
        u3 = node("[SLOT] b [SLOT] d", [s4, u2])

        # (fixture tree, tree rebuilt from scratch)
        comparisons = [
            (self.s1, s1),
            (self.s2, s2),
            (self.s3, s3),
            (self.s4, s4),
            (self.u1, u1),
            (self.u2, u2_selfs3),
            (self.u2, u2),
            (self.u3, u3),
        ]
        for fixture_tree, rebuilt_tree in comparisons:
            self.assertEqual(fixture_tree, rebuilt_tree)
예제 #16
0
    def test_get_best_merge_candidate_hello_world(self):
        """Two templates without shared words merge into a single slot at distance 4."""
        learner = TemplateLatticeLearner(minimal_variables=True,
                                         words_per_leaf_slot=2)
        hello_world = Template.from_string("hello world")
        hi_solar_system = Template.from_string("hi solar system")

        candidate = learner._get_best_merge_candidate(hello_world,
                                                      hi_solar_system)

        merged = candidate.get_merged_template(minimal_variables=True)
        self.assertEqual(Template.from_string("[SLOT]"), merged)
        self.assertEqual(4, candidate.get_distance())
예제 #17
0
    def setUp(self) -> None:
        """Seed the RNG and create the strings, slots and one-element templates shared by the tests."""
        # Fixed seed so that any use of `random` in the tests is reproducible.
        random.seed(123)

        # Literal template elements.
        self.a = TemplateString("a")
        self.b = TemplateString("b")
        self.c = TemplateString("c")
        # Two unnamed slots and three named slots.
        self.slot1 = TemplateSlot()
        self.slot2 = TemplateSlot()
        self.slot_x = NamedTemplateSlot("x")
        self.slot_y = NamedTemplateSlot("y")
        self.slot_z = NamedTemplateSlot("z")

        # Templates consisting of a single literal element each.
        self.at = Template([self.a])
        self.bt = Template([self.b])
        self.ct = Template([self.c])
예제 #18
0
def _contains_slot_as_template(vals: Collection["Template"],
                               slot: TemplateSlot) -> bool:
    """
    Tells whether `vals` holds a template made up of nothing but the given slot
    """
    # Wrap the slot in a one-element template so it can be compared to the values.
    return Template((slot, )) in vals
예제 #19
0
    def test_merge_relative_overlap_values(self):
        """Slot B merges into A only once the relative similarity threshold is 0.2 or lower."""
        contents = _create_contents(10)
        slot_values = SlotValues({
            self.a: set(contents),
            self.b: _shuffled_subset(contents, 0, 2),
        })

        # No merge with the default threshold or with any threshold above 0.2.
        self.assertEqual(slot_values, slot_values.merge_slots())
        for threshold in (1, 0.9, 0.5):
            unmerged = slot_values.merge_slots(
                relative_similarity_threshold=threshold)
            self.assertEqual(slot_values, unmerged)

        # At 0.2 and below, B's values are replaced by a reference to A.
        expected_merged = SlotValues({
            self.a: set(contents),
            self.b: {Template([self.a])}
        })
        for threshold in (0.2, 0.1):
            merged = slot_values.merge_slots(
                relative_similarity_threshold=threshold)
            self.assertEqual(expected_merged, merged)
예제 #20
0
    def test_reoccuring_slot(self):
        """Both slots of "I like X and Y" end up as one shared non-terminal in the induced grammar."""
        dataset = ["I like cats and dogs", "I like dogs and chickens"]
        grammar = grammar_induction.induce_grammar_using_template_trees(
            dataset, relative_similarity_threshold=0.1, minimal_variables=True)

        # Expect the start symbol plus exactly one other non-terminal.
        non_terminals = grammar.get_slots()
        self.assertEqual(2, len(non_terminals))
        other_non_terminals = [
            slot for slot in non_terminals if slot is not grammar.get_start()
        ]
        word_list_nt = other_non_terminals[0]

        # The start symbol expands to a single template.
        origin_templates = grammar.get_content_for(grammar.get_start())
        self.assertEqual(1, len(origin_templates))

        # That template has the expected shape.
        origin_template: Template = origin_templates[0]
        expected_shape = Template.from_string("I like [SLOT] and [SLOT]")
        self.assertTrue(expected_shape.has_same_shape(origin_template))

        # Both slot positions refer to the same named slot.
        distinct_slots = set(origin_template.get_slots())
        self.assertEqual(1, len(distinct_slots))

        # The shared slot holds the values from both positions.
        slot_words = {
            t.to_flat_string() for t in grammar.get_content_for(word_list_nt)
        }
        self.assertEqual({"cats", "dogs", "chickens"}, slot_words)
예제 #21
0
    def _recalculate_templates(
        self,
        recalculate_cache: Dict["TemplateTree", "TemplateTree"],
        minimal_variables: bool,
    ) -> "TemplateTree":
        """
        Rebuilds this subtree bottom-up, recomputing each inner node's template from its children.

        :param recalculate_cache: maps already rebuilt nodes to their result; read and updated here
        :param minimal_variables: forwarded to Template.merge_all
        :return: the rebuilt tree for this node (the cached one when present)
        """
        # Reuse the earlier result if this node was already rebuilt.
        if self in recalculate_cache:
            return recalculate_cache[self]

        # Rebuild the children first.
        rebuilt_children = []
        for child in self._children:
            rebuilt_children.append(
                child._recalculate_templates(recalculate_cache, minimal_variables)
            )

        if rebuilt_children:
            # Inner node: merge the children's templates, passing the current template along.
            child_templates = [c.get_template() for c in rebuilt_children]
            rebuilt_template = Template.merge_all(
                child_templates,
                minimal_variables,
                self.get_template(),
            )
        else:
            # Leaf: the template is kept as it is.
            rebuilt_template = self._template

        rebuilt = TemplateTree(rebuilt_template, rebuilt_children)
        recalculate_cache[self] = rebuilt
        return rebuilt
예제 #22
0
    def test_get_best_merge_candidate(self):
        """Check the merged template and distance of the best merge for several template pairs."""
        learner = TemplateLatticeLearner(minimal_variables=True,
                                         words_per_leaf_slot=2)
        template_1 = Template.from_string("The solar system is [SLOT]")
        template_1_point = Template.from_string("The solar system is [SLOT].")
        template_2 = Template.from_string("[SLOT], solar system!")

        template_3 = Template.from_string("The earth is [SLOT]")
        template_3_point = Template.from_string("The earth is [SLOT].")

        def check_merge(first, second, expected_pattern, expected_distance):
            candidate = learner._get_best_merge_candidate(first, second)
            self.assertEqual(
                Template.from_string(expected_pattern),
                candidate.get_merged_template(minimal_variables=True),
            )
            self.assertEqual(expected_distance, candidate.get_distance())

        check_merge(template_1, template_2, "[SLOT] solar system [SLOT]", 3)
        check_merge(template_1, template_3, "The [SLOT] is [SLOT]", 3)

        # With punctuation version
        check_merge(template_1_point, template_2,
                    "[SLOT] solar system [SLOT]", 4)
        check_merge(template_1_point, template_3_point,
                    "The [SLOT] is [SLOT].", 3)
예제 #23
0
 def test_slot_parsing(self):
     """Only the "[SLOT]" token of the parsed string becomes a slot element."""
     template = Template.from_string("a [SLOT] c d", slot_token="[SLOT]")
     self.assertEqual(4, template.get_number_of_elements())

     # Expected slot flag for each element, by position.
     for index, should_be_slot in enumerate((False, True, False, False)):
         element_is_slot = template._elements[index].is_slot()
         if should_be_slot:
             self.assertTrue(element_is_slot)
         else:
             self.assertFalse(element_is_slot)
예제 #24
0
 def test_disallow_empty_string_hard(self):
     """With empty slots disallowed, each shorter sentence stays a sibling of the slotted template."""
     dataset = [
         "I saw her on the quiet hill",
         "I saw her on the tall hill",
         "I saw her on the hill",
         "He likes cute cats",
         "He likes nice cats",
         "He likes cats",
     ]
     learner = TemplateLatticeLearner(minimal_variables=True,
                                      allow_empty_string=False)
     learned_tree = learner.learn(dataset)

     def leaf(text):
         return TemplateTree(Template.from_string(text))

     def node(pattern, children):
         return TemplateTree(Template.from_string(pattern), children)

     likes_branch = node(
         "He likes [SLOT]",
         [
             node(
                 "He likes [SLOT] cats",
                 [leaf("He likes cute cats"), leaf("He likes nice cats")],
             ),
             leaf("He likes cats"),
         ],
     )
     saw_branch = node(
         "I saw her on the [SLOT]",
         [
             node(
                 "I saw her on the [SLOT] hill",
                 [
                     leaf("I saw her on the tall hill"),
                     leaf("I saw her on the quiet hill"),
                 ],
             ),
             leaf("I saw her on the hill"),
         ],
     )
     expected = node("[SLOT]", [likes_branch, saw_branch])

     print(template_tree_visualiser.render_tree_string(learned_tree))
     self.assertEqual(expected, learned_tree)
예제 #25
0
 def test_from_string(self):
     """ContextFreeGrammar.from_string turns a dict of rule strings into named slots and templates."""
     input_dict = {"A": ["<B>, world", "hi"], "B": ["hello"]}

     # "<B>, world" is expected to split into the slot B, a comma and "world".
     expansions_of_a = [
         Template([
             NamedTemplateSlot("B"),
             TemplateString(","),
             TemplateString("world"),
         ]),
         Template([TemplateString("hi")]),
     ]
     expansions_of_b = [Template([TemplateString("hello")])]
     expected_output = ContextFreeGrammar({
         NamedTemplateSlot("A"): expansions_of_a,
         NamedTemplateSlot("B"): expansions_of_b,
     })

     self.assertEqual(expected_output,
                      ContextFreeGrammar.from_string(input_dict))
예제 #26
0
    def test_merge_containing_slot(self):
        """When slot A's values include slot B, merging leaves A holding only the reference to B."""
        slot_values = SlotValues({
            self.a: {Template([self.b]), self.e1, self.e2, self.e3},
            self.b: self.e123,
            self.c: self.e456,
        })

        merged = slot_values.merge_slots()

        # A is recorded as replaced by B.
        expected_replacements = hashabledict({self.a: self.b})
        self.assertEqual(expected_replacements, merged.get_replacements())

        # B and C keep their values; A only refers to B.
        expected_values = SlotValues({
            self.a: {Template([self.b])},
            self.b: self.e123,
            self.c: self.e456
        })
        self.assertEqual(expected_values, merged)
예제 #27
0
    def _get_best_merge_candidate(self, t1: Template,
                                  t2: Template) -> MergeCandidate:
        """
        Builds a merge candidate for every distinct Wagner-Fischer merge of the two given
        templates (which can contain slots) and returns the minimal candidate
        """
        longest_non_slot_count = max(t1.get_number_of_non_slots(),
                                     t2.get_number_of_non_slots())
        fewest_slots = min(t1.get_number_of_slots(), t2.get_number_of_slots())

        # Collect the merges in a set so duplicates are only scored once.
        distinct_merges = set(
            Template.merge_templates_wagner_fischer(
                t1, t2, minimal_variables=self._minimal_variables))

        candidates = [
            MergeCandidate(
                t1,
                t2,
                _get_distance_of_merged(merge, longest_non_slot_count,
                                        fewest_slots),
                merged=merge,
            )
            for merge in distinct_merges
        ]
        # The smallest candidate is chosen by MergeCandidate's own ordering.
        return min(candidates)
예제 #28
0
    def test_get_slot_content_mappings(self):
        """get_slot_content_mappings returns one slot assignment per child; a leaf has none."""
        self.assertEqual(set(), self.s1.get_slot_content_mappings())

        slot_x = NamedTemplateSlot("x")
        slot_y = NamedTemplateSlot("y")
        a = TemplateString("a")
        b = TemplateString("b")
        c = TemplateString("c")

        # One slot, one child: "a <x>" over "a b".
        one_slot_child = TemplateTree(Template([a, b]), [])
        one_slot_tree = TemplateTree(Template([a, slot_x]), [one_slot_child])
        one_slot_mappings = one_slot_tree.get_slot_content_mappings()

        self.assertEqual(1, len(one_slot_mappings))
        only_mapping = list(one_slot_mappings)[0]
        self.assertIn(slot_x, only_mapping)
        self.assertIn(slot_x, only_mapping.keys())
        self.assertEqual(Template([b]), only_mapping[slot_x])

        self.assertEqual(
            {SlotAssignment({slot_x: Template([b])})}, one_slot_mappings
        )

        # Two slots, one child: "<x> b <y>" over "a b c".
        two_slot_child = TemplateTree(Template([a, b, c]), [])
        two_slot_tree = TemplateTree(
            Template([slot_x, b, slot_y]), [two_slot_child]
        )
        expected_two_slot = {
            SlotAssignment({slot_x: Template([a]), slot_y: Template([c])})
        }
        self.assertEqual(
            expected_two_slot, two_slot_tree.get_slot_content_mappings()
        )

        # Fixture tree u1: its first slot is filled with "c" and with "e".
        u1_slot = self.u1.get_template().get_slots()[0]
        expected_u1 = {
            SlotAssignment({u1_slot: Template([TemplateString("c")])}),
            SlotAssignment({u1_slot: Template([TemplateString("e")])}),
        }
        self.assertEqual(expected_u1, self.u1.get_slot_content_mappings())
예제 #29
0
def _merge_templates(t1: Template,
                     t2: Template,
                     minimal_variables: bool,
                     allow_empty_string=True) -> Template:
    """Return the first merge that the Wagner-Fischer merger yields for the two templates."""
    merges = Template.merge_templates_wagner_fischer(
        t1,
        t2,
        minimal_variables=minimal_variables,
        allow_empty_string=allow_empty_string,
    )
    # Later merges from the generator are ignored.
    return next(merges)
예제 #30
0
 def test_equals(self):
     """ Tests the TemplateTree __eq__ """
     rebuilt_u1 = TemplateTree(
         Template.from_string("a b [SLOT] d", slot_token="[SLOT]"),
         [self.s1, self.s2],
     )

     # Equal: same structure as the fixture u1, and each tree with itself.
     equal_pairs = (
         (rebuilt_u1, self.u1),
         (rebuilt_u1, rebuilt_u1),
         (self.t3, self.t3),
     )
     for left, right in equal_pairs:
         self.assertEqual(left, right)

     # Not equal: the fixture trees u2 and t1.
     for other in (self.u2, self.t1):
         self.assertNotEqual(rebuilt_u1, other)