예제 #1
0
    def test_permutations(self):
        """
        Check metric consistency for every ordering of a small query set.

        For each permutation of the (query, weight) pairs, a
        ``CompletionSystem`` is built from the queries alone, its metrics are
        computed, ``mks1 <= mks0`` is verified on every element, and
        ``compare_with_trie`` must report no difference.

        :raises Exception: with the last field of every reported difference,
            one per line, when ``compare_with_trie`` returns a non-empty result
        """
        fLOG(
            __file__,
            self._testMethodName,
            OutputPrint=__name__ == "__main__")

        queries = ['actuellement', 'actualité', 'actu']
        weights = [1, 1, 0]
        for per in itertools.permutations(zip(queries, weights)):
            # Only the query text is forwarded; the paired weight is unused.
            trie = CompletionSystem([(None, query) for query, _ in per])
            trie.compute_metrics()
            for n in trie:
                # unittest assertion rather than a bare ``assert`` so the
                # check is not stripped when running with ``python -O``.
                self.assertLessEqual(n.mks1, n.mks0)
            diffs = trie.compare_with_trie()
            if diffs:
                res = [_[-1] for _ in diffs]
                raise Exception("\n".join(res))
예제 #2
0
    def test_build_trie_simple(self):
        """
        Build a completion system from five weighted queries and inspect it.

        The test checks, in order: the number of items, the tuples returned
        by ``tuples()``, lookups through ``find`` (``'b'`` is absent, ``'ab'``
        is present), equality of ``mks0``/``mks1``/``mks2`` on every element
        after ``compute_metrics``, the absence of differences reported by
        ``compare_with_trie``, and the detailed completion information stored
        on the element at index 2.

        :raises Exception: when ``compare_with_trie`` reports differences, or
            when a non-empty prefix key holds more than two completions
        """
        fLOG(
            __file__,
            self._testMethodName,
            OutputPrint=__name__ == "__main__")

        queries = [(1, 'a'), (2, 'ab'), (3, 'abc'), (4, 'abcd'), (5, 'bc')]
        trie = CompletionSystem(queries)
        res = list(trie.items())
        self.assertEqual(len(res), 5)
        res = list(trie.tuples())
        self.assertEqual(
            res, [(1, 'a'), (2, 'ab'), (3, 'abc'), (4, 'abcd'), (5, 'bc')])

        # unittest assertions are used throughout instead of bare ``assert``
        # so the checks survive ``python -O`` and report the offending value.
        node = trie.find('b')
        self.assertIsNone(node)
        node = trie.find('ab')
        self.assertIsNotNone(node)
        self.assertEqual(node.value, 'ab')

        trie.compute_metrics(fLOG=fLOG)
        for el in trie:
            self.assertEqual(el.mks0, el.mks1)
            self.assertEqual(el.mks0, el.mks2)
            s = el.str_mks()
            self.assertIsNotNone(s)
        diffs = trie.compare_with_trie(fLOG=fLOG)
        if diffs:
            res = [_[-1] for _ in diffs]
            raise Exception("\n".join(res))

        # Detailed information attached to the element at index 2.
        r = trie[2]
        self.assertTrue(r._info)
        s = trie[2].str_all_completions()
        self.assertTrue(s)
        self.assertIsInstance(r._info._log_imp, list)
        for k, v in sorted(r._info._completions.items()):
            self.assertIsInstance(v, list)
            # Only the empty key may hold more than two completions.
            if k != '' and len(v) > 2:
                raise Exception(v)
            self.assertTrue(v)
            fLOG(k, v)
            for _ in v:
                fLOG("    ", _.value, ":", _.str_mks())
        self.assertIn("MKS=3 *=3 |'=3 *=3 |\"=3 *=3", s)
        self.assertTrue(trie.to_dict())
예제 #3
0
    def test_completions(self):
        """
        Build a completion system from ``data/sample300.txt`` and validate it.

        Each line of the file, stripped of surrounding spaces, tabs and line
        terminators, becomes one query. ``compare_with_trie`` must report no
        difference and the resulting system must not be empty.

        :raises Exception: with the last field of at most the first three
            differences when ``compare_with_trie`` returns a non-empty result
        """
        fLOG(
            __file__,
            self._testMethodName,
            OutputPrint=__name__ == "__main__")

        this = os.path.abspath(os.path.dirname(__file__))
        data = os.path.join(this, "data", "sample300.txt")
        with open(data, "r", encoding="utf-8") as f:
            # Iterating the file yields the same lines as ``readlines()``
            # without building an intermediate list.
            lines = [line.strip(" \n\r\t") for line in f]

        trie = CompletionSystem([(None, q) for q in lines])
        diffs = trie.compare_with_trie(fLOG=fLOG)
        if diffs:
            # Keep the message short: report three differences at most.
            res = [_[-1] for _ in diffs][:3]
            raise Exception("\n".join(res))
        # unittest assertion: not stripped by ``python -O``.
        self.assertGreater(len(trie), 0)
예제 #4
0
    def test_completions_bug(self):
        """
        Regression test on a set of queries sharing the prefixes ``por``/``pou``.

        The completion set is made of a few ``porch``/``pore``/``pour``
        entries plus ``portes`` followed by each colour. ``compare_with_trie``
        must report no difference, the system must not be empty, and the
        ``mks2`` value of ``"portes blanche"`` is pinned.

        :raises Exception: with the last field of at most the first three
            differences when ``compare_with_trie`` returns a non-empty result
        """
        fLOG(
            __file__,
            self._testMethodName,
            OutputPrint=__name__ == "__main__")

        couleur = ["blanc", "vert", "orange",
                   "rouge", "noir", "noire", "blanche"]
        key = "portes"
        mots = ["porch", "porch rouge", "porch vert",
                "porch orange", "pore", "pour"]
        mots.append(key)
        mots += [key + " " + c for c in couleur]
        ens = CompletionSystem(mots)
        diffs = ens.compare_with_trie(fLOG=fLOG)
        if diffs:
            # Keep the message short: report three differences at most.
            res = [_[-1] for _ in diffs][:3]
            raise Exception("\n".join(res))
        # unittest assertions: not stripped by ``python -O``.
        self.assertGreater(len(ens), 0)
        m = ens.find("portes blanche")
        # Fail with a clear message rather than an AttributeError on ``None``.
        self.assertIsNotNone(m)
        self.assertEqual(m.mks2, 7.8)
예제 #5
0
 def gain_dynamique_moyen_par_mot(queries, weights):
     """
     Return the weighted keystroke gains of *queries* for three metrics.

     A ``CompletionSystem`` is built from the queries, its metrics are
     computed, and for every query the gain ``len(query) - mks`` is
     accumulated for ``mks0``, ``mks1`` and ``mks2``. Each term is weighted
     by the query weight and divided by ``total``, the sum of
     ``weight * len(query)`` over all queries.

     :param queries: sequence of queries (``len`` is taken of each one)
     :param weights: sequence of numeric weights paired with *queries*
         by position
     :return: tuple ``(gain, gain_dyn, gain_dyn2, ave_length)``
     :raises ZeroDivisionError: when *queries* is not empty and ``total``
         is zero
     """
     per = list(zip(weights, queries))
     total = sum(w * len(q) for q, w in zip(queries, weights))
     trie = CompletionSystem([(None, q) for _, q in per])
     trie.compute_metrics()
     # Look every query up once and reuse the node for the three metrics
     # instead of calling ``trie.find`` three times per query.
     found = [(p, w, trie.find(w)) for p, w in per]
     gain = sum((len(w) - node.mks0) * p / total for p, w, node in found)
     gain_dyn = sum((len(w) - node.mks1) * p / total for p, w, node in found)
     gain_dyn2 = sum((len(w) - node.mks2) * p / total for p, w, node in found)
     # NOTE(review): ``total`` is itself the sum of ``len(w) * p``, so this
     # value is 1 up to rounding; confirm whether dividing by the sum of the
     # weights was intended.
     ave_length = sum(len(w) * p / total for p, w in per)
     return gain, gain_dyn, gain_dyn2, ave_length
예제 #6
0
    def test_mks_consistency(self):
        """
        Validate a completion system that grows from one to two titles.

        The list of titles is extended one entry at a time; after each
        addition a new ``CompletionSystem`` is built from the whole list and
        ``compare_with_trie`` must report no difference.

        :raises Exception: with the last field of every reported difference,
            one per line, when ``compare_with_trie`` returns a non-empty result
        """
        fLOG(__file__, self._testMethodName,
             OutputPrint=__name__ == "__main__")

        entries = (
            (None, '"contra el gang del chicharron"',
             '"Contra el gang del chicharron'),
            (None, '"la sequestree"', '"La séquestrée'),
        )
        titles = []
        for entry in entries:
            titles.append(entry)
            system = CompletionSystem(titles)
            mismatches = system.compare_with_trie()
            if mismatches:
                raise Exception("\n".join(item[-1] for item in mismatches))
예제 #7
0
    def test_mks_consistency_port(self):
        """
        Validate two small completion sets built around ``por``/``po`` and ``port``.

        A ``CompletionSystem`` is built for each list of titles in turn and
        ``compare_with_trie`` must report no difference for either of them.

        :raises Exception: with the last field of every reported difference,
            one per line, when ``compare_with_trie`` returns a non-empty result
        """
        fLOG(__file__, self._testMethodName,
             OutputPrint=__name__ == "__main__")

        scenarios = (
            ["por", "por rouge", "por vert",
             "por orange", "port", "port blanc", "port rouge"],
            ["po", "po rouge", "po vert", "po orange",
             "port", "port blanc", "port rouge"],
        )
        for titles in scenarios:
            system = CompletionSystem(titles)
            mismatches = system.compare_with_trie()
            if mismatches:
                raise Exception("\n".join(item[-1] for item in mismatches))
    def test_build_trie_simple(self):
        """
        Exercise ``test_metric`` / ``enumerate_test_metric`` on three small sets.

        1. Weighted completions queried with their own weights: ``mks1``
           equals ``sum_wlen`` and the ``hist["l"]`` histogram is pinned.
        2. The same completions in reverse order, every query weighted 1:
           every query is found, ``mks1`` is lower than ``sum_wlen`` and
           five queries are counted.
        3. Three completions queried together with ``'abcd'``, which is not
           in the set: nothing is found for it and its metrics are pinned,
           while ``'abc'`` carries the same metrics as the element found.
        """
        fLOG(
            __file__,
            self._testMethodName,
            OutputPrint=__name__ == "__main__")

        comp = [(1, 'a'), (2, 'ab'), (3, 'abc'), (4, 'abcd'), (5, 'bc')]
        cset = CompletionSystem(comp)
        cset.compute_metrics(fLOG=fLOG)
        queries = [(q, w) for w, q in comp]
        res = cset.test_metric(queries)
        self.assertEqual(res["mks1"], res["sum_wlen"])
        self.assertEqual(res["hist"]["l"], {1: 1, 2: 7, 3: 3, 4: 4})

        # Same completions, reversed, without explicit weights.
        comp = [q for w, q in comp]
        comp.reverse()
        cset = CompletionSystem(comp)
        cset.compute_metrics(fLOG=fLOG)
        queries = [(q, 1) for q in comp]
        for el, found in cset.enumerate_test_metric(queries):
            # unittest assertions replace bare ``assert`` statements so the
            # checks are not stripped by ``python -O``.
            self.assertIsNotNone(found)
        res = cset.test_metric(queries)
        self.assertLess(res["mks1"], res["sum_wlen"])
        self.assertEqual(res["n"], 5)
        self.assertEqual(res["hist"]["l"], {1: 1, 2: 2, 3: 1, 4: 1})

        # one suggestion in the completion set
        comp = ['a', 'abc', 'bc']
        cset = CompletionSystem(comp)
        cset.compute_metrics(fLOG=fLOG)
        queries = [(q, 1) for q in comp] + [('abcd', 1)]
        for el, found in cset.enumerate_test_metric(queries):
            if found is None:
                fLOG(el.str_mks(), "*", el.value)
            else:
                fLOG(el.str_mks(), "*", el.value, "*", found, found.str_mks())
            fLOG(el.weight)
            self.assertEqual(el.weight, 1)
            if el.value == "abcd":
                self.assertIsNone(found)
                self.assertEqual(el.mks0, 0)
                self.assertEqual(el.mks1, 3)
                self.assertEqual(el.mks2, 3)
            elif el.value == "abc":
                self.assertIsNotNone(found)
                self.assertEqual(el.value, found.value)
                self.assertEqual(el.mks0, found.mks0)
                self.assertEqual(el.mks1, found.mks1)
                self.assertEqual(el.mks2, found.mks2)
        res = cset.test_metric(queries)
        for k, v in sorted(res.items()):
            fLOG(k, "=", v)
        self.assertLess(res["mks1"], res["sum_wlen"])
        self.assertEqual(res["n"], 4)
        self.assertEqual(res["hist"]["l"], {1: 1, 2: 1, 3: 1, 4: 1})
    def test_build_trie_simple(self):
        """
        Exercise ``test_metric`` / ``enumerate_test_metric`` on three small sets.

        1. Weighted completions queried with their own weights: ``mks1``
           equals ``sum_wlen`` and the ``hist["l"]`` histogram is pinned.
        2. The same completions in reverse order, every query weighted 1:
           every query is found, ``mks1`` is lower than ``sum_wlen`` and
           five queries are counted.
        3. Three completions queried together with ``'abcd'``, which is not
           in the set: nothing is found for it and its metrics are pinned,
           while ``'abc'`` carries the same metrics as the element found.
        """
        fLOG(__file__,
             self._testMethodName,
             OutputPrint=__name__ == "__main__")

        comp = [(1, 'a'), (2, 'ab'), (3, 'abc'), (4, 'abcd'), (5, 'bc')]
        cset = CompletionSystem(comp)
        cset.compute_metrics(fLOG=fLOG)
        queries = [(q, w) for w, q in comp]
        res = cset.test_metric(queries)
        self.assertEqual(res["mks1"], res["sum_wlen"])
        self.assertEqual(res["hist"]["l"], {1: 1, 2: 7, 3: 3, 4: 4})

        # Same completions, reversed, without explicit weights.
        comp = [q for w, q in comp]
        comp.reverse()
        cset = CompletionSystem(comp)
        cset.compute_metrics(fLOG=fLOG)
        queries = [(q, 1) for q in comp]
        for el, found in cset.enumerate_test_metric(queries):
            # unittest assertions replace bare ``assert`` statements so the
            # checks are not stripped by ``python -O``.
            self.assertIsNotNone(found)
        res = cset.test_metric(queries)
        self.assertLess(res["mks1"], res["sum_wlen"])
        self.assertEqual(res["n"], 5)
        self.assertEqual(res["hist"]["l"], {1: 1, 2: 2, 3: 1, 4: 1})

        # one suggestion in the completion set
        comp = ['a', 'abc', 'bc']
        cset = CompletionSystem(comp)
        cset.compute_metrics(fLOG=fLOG)
        queries = [(q, 1) for q in comp] + [('abcd', 1)]
        for el, found in cset.enumerate_test_metric(queries):
            if found is None:
                fLOG(el.str_mks(), "*", el.value)
            else:
                fLOG(el.str_mks(), "*", el.value, "*", found, found.str_mks())
            fLOG(el.weight)
            self.assertEqual(el.weight, 1)
            if el.value == "abcd":
                self.assertIsNone(found)
                self.assertEqual(el.mks0, 0)
                self.assertEqual(el.mks1, 3)
                self.assertEqual(el.mks2, 3)
            elif el.value == "abc":
                self.assertIsNotNone(found)
                self.assertEqual(el.value, found.value)
                self.assertEqual(el.mks0, found.mks0)
                self.assertEqual(el.mks1, found.mks1)
                self.assertEqual(el.mks2, found.mks2)
        res = cset.test_metric(queries)
        for k, v in sorted(res.items()):
            fLOG(k, "=", v)
        self.assertLess(res["mks1"], res["sum_wlen"])
        self.assertEqual(res["n"], 4)
        self.assertEqual(res["hist"]["l"], {1: 1, 2: 1, 3: 1, 4: 1})
예제 #10
0
    def test_mks_consistency_bigger(self):
        """
        Validate the metrics computed on ``data/sample_alpha_2.txt``.

        Every stripped line of the file is one query. After
        ``compute_metrics``, the system is checked element by element by
        ``cmks``, aggregated gains are logged, and the number of elements
        visited is expected to be 494.
        """

        fLOG(
            __file__,
            self._testMethodName,
            OutputPrint=__name__ == "__main__")

        def cmks(trie):
            """
            Check *trie* and return ``(nb, gmks, gmksd, gmksd2, size)``.

            ``nb`` is the number of elements visited, ``size`` the sum of the
            lengths of their values, and the three other figures are the sums
            of ``len(value) - mks`` for ``mks0``, ``mks1`` and ``mks2``.

            :raises Exception: when ``compare_with_trie`` reports differences
                (at most three are shown), when an element has
                ``mks2 < mks1``, or when ``"baaaab"`` has ``mks1 != 4``
            """
            diffs = trie.compare_with_trie(fLOG=fLOG)
            if diffs:
                # Slicing already caps the report at three differences.
                res = [_[-1] for _ in diffs[:3]]
                raise Exception("\n".join(res))

            gmks = 0.0
            gmksd = 0.0
            gmksd2 = 0.0
            nb = 0
            size = 0
            for n in trie:
                if n.mks2 < n.mks1 or (n.value == "baaaab" and n.mks1 != 4):
                    # Detailed dumps are left empty; ``str_all_completions``
                    # could be plugged in here when debugging.
                    info = ""
                    info2 = ""
                    raise Exception("issue with query '{0}'\n{1}\n##########\n{2}\n############\n{3}".format(
                        n.value, n.str_mks(), info, info2))

                gmks += len(n.value) - n.mks0
                gmksd += len(n.value) - n.mks1
                gmksd2 += len(n.value) - n.mks2
                size += len(n.value)
                nb += 1
            return nb, gmks, gmksd, gmksd2, size

        def gain_dynamique_moyen_par_mot(queries, weights):
            """
            Return ``(gain, gain_dyn, gain_dyn2, ave_length)`` for *queries*.

            Each gain is the sum over the queries of
            ``(len(query) - mks) * weight / total`` for ``mks0``, ``mks1``
            and ``mks2``, where ``total`` is the sum of
            ``weight * len(query)``.
            """
            per = list(zip(weights, queries))
            total = sum(w * len(q) for q, w in zip(queries, weights))
            trie = CompletionSystem([(None, q) for _, q in per])
            trie.compute_metrics()
            # Look every query up once and reuse the node for the three
            # metrics instead of calling ``trie.find`` three times per query.
            found = [(p, w, trie.find(w)) for p, w in per]
            gain = sum((len(w) - node.mks0) * p / total
                       for p, w, node in found)
            gain_dyn = sum((len(w) - node.mks1) * p / total
                           for p, w, node in found)
            gain_dyn2 = sum((len(w) - node.mks2) * p / total
                            for p, w, node in found)
            # NOTE(review): ``total`` is itself the sum of ``len(w) * p``, so
            # this value is 1 up to rounding; confirm the intended divisor.
            ave_length = sum(len(w) * p / total for p, w in per)
            return gain, gain_dyn, gain_dyn2, ave_length

        this = os.path.abspath(os.path.join(
            os.path.dirname(__file__), "data", "sample_alpha_2.txt"))
        with open(this, "r", encoding="utf-8") as f:
            titles = [line.strip(" \n\r\t") for line in f]
        fLOG(titles[:5])
        trie = CompletionSystem([(None, q) for q in titles])
        trie.compute_metrics(fLOG=fLOG, details=True)
        nb, gmks, gmksd, gmksd2, size = cmks(trie)
        gain, gain_dyn, gain_dyn2, ave_length = gain_dynamique_moyen_par_mot(
            titles, [1.0] * len(titles))
        fLOG("***", 1, nb, size, "*", gmks / size, gmksd / size, gmksd2 / size)
        fLOG("***", gain, gain_dyn, gain_dyn2, ave_length)
        self.assertEqual(nb, 494)