Exemplo n.º 1
0
def test_dedup_with_np_dump(tmp_path: Path):
    """Round-trip collected hashes through a file and ``FlatHashSet.load_np``.

    A ``dedup.HashesCollector`` on the ``text`` field writes to
    ``hashes.bin``. The file is then loaded into a fresh ``FlatHashSet``
    and its keys must equal the ``str_hash`` of each distinct lower-cased
    line, so "_Hello" / "_Hello" and "_World" / "_world" are each expected
    to contribute a single hash.
    """
    # `text` is a helper defined elsewhere; presumably it assembles the
    # given lines into one document string -- TODO confirm.
    documents = [
        {"text": text("_Hello", "_World", "I'm so original")},
        {"text": text("_world", "I'm originaler", "_Hello")},
    ]
    dump_file = tmp_path / "hashes.bin"

    with dedup.HashesCollector(field="text", output=dump_file) as collector:
        # Exhaust the mapped stream so every document is actually processed.
        for _ in collector.map(documents):
            pass

    loaded = FlatHashSet()
    loaded.load_np(dump_file)

    unique_lines = ["_hello", "_world", "i'm so original", "i'm originaler"]
    expected = {str_hash(line) for line in unique_lines}
    assert expected == set(loaded.keys())
Exemplo n.º 2
0
    def test_dedup_with_np_dump(self):
        """Round-trip collected hashes through a file and ``load_np``.

        A ``dedup.HashesCollector`` on the ``text`` field writes to
        ``hashes.bin`` inside the test's temporary directory. The file is
        then loaded into a fresh ``FlatHashSet`` and its keys must equal the
        ``str_hash`` of each distinct lower-cased line.
        """
        # `get_tmpdir()` returns a callable; it is invoked with a file name
        # below -- presumably yielding a path inside a temp dir (TODO confirm).
        tmp = self.get_tmpdir()

        documents = [
            {"text": text("_Hello", "_World", "I'm so original")},
            {"text": text("_world", "I'm originaler", "_Hello")},
        ]
        collector_cm = dedup.HashesCollector(
            field="text", output=tmp("hashes.bin")
        )
        with collector_cm as collector:
            # Exhaust the mapped stream so every document is processed.
            for _ in collector.map(documents):
                pass

        loaded = FlatHashSet()
        loaded.load_np(tmp("hashes.bin"))

        unique_lines = [
            "_hello", "_world", "i'm so original", "i'm originaler"
        ]
        expected = {str_hash(line) for line in unique_lines}
        self.assertEqual(expected, set(loaded.keys()))