Example No. 1
def _hashes_shard(conf: Config, shard: int, output: Path):
    # Write to a temporary file and rename on success, so an interrupted
    # job never leaves a truncated hashes file behind.
    tmp_output = tmp(output)
    jsonql.run_pipes(
        dedup.HashesCollector(field="raw_content", output=tmp_output),
        inputs=conf.get_cc_shard(shard),
    )
    finalize(tmp_output, output)
    return f"Hashed {output}"
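A driver for this helper is not shown in the snippet; a minimal sketch, assuming Config exposes a num_shards count and picking a hypothetical hashes/ output layout, might look like:

from pathlib import Path

def hash_all_shards(conf: Config):
    # Hypothetical loop: one hashes file per CommonCrawl shard.
    out_dir = Path("hashes")
    out_dir.mkdir(exist_ok=True)
    for shard in range(conf.num_shards):  # num_shards is an assumption
        print(_hashes_shard(conf, shard, out_dir / f"{shard:04d}.bin"))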
Example No. 2
def test_dedup_with_dump(tmp_path: Path):
    hashes = tmp_path / "hashes.bin"
    documents = [
        dict(text=text("_Hello", "_World", "I'm so original")),
        dict(text=text("_world", "I'm originaler", "_Hello")),
    ]
    collector = dedup.HashesCollector(field="text", output=hashes)
    list(collector.map(documents))
    results = load_hashes(hashes)
    # Line hashing is case-insensitive, and a hash is flagged True once the
    # line has been seen more than once: here, exactly the "_"-prefixed lines.
    expected = {
        str_hash(l): l.startswith("_")
        for l in ["_hello", "_world", "i'm so original", "i'm originaler"]
    }
    assert expected == results
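The text helper is not part of the snippet; a sketch consistent with how the test uses it (one document equals its lines joined by newlines) would be:

def text(*lines: str) -> str:
    # Sketch of the assumed test helper: a document's "text" field is
    # just its lines joined by newlines.
    return "\n".join(lines)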
Example No. 3
def test_dedup_with_np_dump(tmp_path: Path):
    hashes = tmp_path / "hashes.bin"
    documents = [
        dict(text=text("_Hello", "_World", "I'm so original")),
        dict(text=text("_world", "I'm originaler", "_Hello")),
    ]
    with dedup.HashesCollector(field="text", output=hashes) as d:
        list(d.map(documents))  # closing the collector flushes hashes to disk

    results = FlatHashSet()
    results.load_np(hashes)
    expected = set(
        str_hash(l) for l in ["_hello", "_world", "i'm so original", "i'm originaler"]
    )
    assert expected == set(results.keys())
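Assuming nothing beyond the FlatHashSet calls the test itself makes (load_np and keys), a dumped file can be inspected like this:

from pathlib import Path

def count_hashes(path: Path) -> int:
    # Reload the numpy dump and count distinct line hashes.
    hs = FlatHashSet()
    hs.load_np(path)
    return len(set(hs.keys()))

# For the documents above this returns 4: four distinct lowercased lines.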
Example No. 4
def test_dedup_fast(tmp_path: Path):
    data = tmp_path / "data"
    part_0 = [["Hello", "_World", "I'm so original"]]
    write_docs(data / "part_0.json", part_0)
    part_1 = [["Good morning", "_World", "I'm originaler"]]
    write_docs(data / "part_1.json", part_1)
    parts = [data / "part_0.json", data / "part_1.json"]

    res = tmp_path / "res"
    res.mkdir()
    h = tmp_path / "hashes.bin"
    field = "text"
    # First pass: collect line hashes from every part. Second pass: rewrite
    # each part with duplicated lines removed.
    jsonql.run_pipes(dedup.HashesCollector(field, output=h), file=parts)
    for part in parts:
        jsonql.run_pipes(
            dedup.DuplicatesRemover(field, [h]), file=part, output=res / part.name
        )

    results_0 = list(jsonql.read_jsons(res / "part_0.json"))
    expected_0 = [
        dict(
            text=text("Hello", "I'm so original"),
            original_nlines=3,
            nlines=2,
            line_ids=[0, 2],
        )
    ]
    assert_documents_equal(expected_0, results_0, ignoring=LENGTHS)

    results_1 = list(jsonql.read_jsons(res / "part_1.json"))
    expected_1 = [
        dict(
            text=text("Good morning", "I'm originaler"),
            original_nlines=3,
            nlines=2,
            line_ids=[0, 2],
        )
    ]

    assert_documents_equal(expected_1, results_1, ignoring=LENGTHS)

    # Every line gets hashed; only "_World", which appears in both parts,
    # is flagged as a duplicate.
    words = [w for part in [part_0, part_1] for doc in part for w in doc]
    expected = {str_hash(s.lower()): s.startswith("_") for s in words}
    assert expected == load_hashes(h)
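write_docs is not defined in the snippet; a hypothetical reconstruction that matches how the test reads the files back (one JSON object per line, each with a "text" field) could be:

import json
from pathlib import Path

def write_docs(path: Path, docs):
    # One JSON document per line; each document's lines joined into "text".
    path.parent.mkdir(exist_ok=True)
    with open(path, "w", encoding="utf-8") as o:
        for lines in docs:
            print(json.dumps({"text": "\n".join(lines)}), file=o)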
Example No. 5
    def test_dedup_with_np_dump(self):
        tmp = self.get_tmpdir()

        documents = [
            dict(text=text("_Hello", "_World", "I'm so original")),
            dict(text=text("_world", "I'm originaler", "_Hello")),
        ]
        with dedup.HashesCollector(field="text", output=tmp("hashes.bin")) as d:
            list(d.map(documents))

        results = FlatHashSet()
        results.load_np(tmp("hashes.bin"))
        expected = set(
            str_hash(l)
            for l in ["_hello", "_world", "i'm so original", "i'm originaler"])
        self.assertEqual(expected, set(results.keys()))
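This is an older unittest-style variant of Example No. 3. self.get_tmpdir() is assumed to return a callable that maps a filename to a path inside a scratch directory; a hypothetical standalone equivalent:

import tempfile
from pathlib import Path

def make_tmpdir():
    # Hypothetical equivalent of the test's get_tmpdir(): returns a callable
    # mapping a filename to a path in a fresh temporary directory.
    d = Path(tempfile.mkdtemp())
    return lambda name: str(d / name)

# usage: tmp = make_tmpdir(); tmp("hashes.bin")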