Example #1
def check_regroup(tmp_path, regroup_fn, check_blocks_boundaries=False):
    n_shards = 4
    n_docs = 20
    shards = [[
        dict(id=i, shard=s, raw_content="hello world") for i in range(n_docs)
    ] for s in range(n_shards)]
    shards_files = [tmp_path / f"{s:04d}.json.gz" for s in range(n_shards)]
    for shard, shard_file in zip(shards, shards_files):
        jsonql.run_pipes(inputs=shard, output=shard_file)
    regroup_file = tmp_path / "regroup.json.gz"
    start = time.time()
    regroup_fn(shards_files, regroup_file)
    duration = time.time() - start
    print(f"{regroup_fn.__module__}.{regroup_fn.__name__} took {duration}s")

    regrouped = list(jsonql.read_jsons(regroup_file))
    assert [doc for shard in shards for doc in shard] == regrouped

    readers = jsonql.get_block_readers(regroup_file, n_shards)
    if not check_blocks_boundaries:
        assert [doc for shard in shards for doc in shard] == [
            doc for reader in readers for doc in jsonql.read_jsons(reader)
        ]
        return

    for shard, reader in zip(shards, readers):
        block = [doc for doc in jsonql.read_jsons(reader)]
        assert shard == block
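For context, a hedged sketch of how this helper might be driven from a test. `naive_regroup` and the test name are hypothetical (cc_net's real resharding code is not shown here); the sketch relies only on the fact that concatenated gzip members form a valid gzip stream, and whether the `get_block_readers` assertion holds for such a naively concatenated file depends on jsonql internals.

import shutil

def naive_regroup(shards_files, regroup_file):
    # Concatenating gzip files byte-for-byte yields a valid multi-member gzip stream.
    with open(regroup_file, "wb") as o:
        for shard_file in shards_files:
            with open(shard_file, "rb") as f:
                shutil.copyfileobj(f, o)

def test_naive_regroup(tmp_path):
    check_regroup(tmp_path, naive_regroup)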
Example #2
def test_remove_duplicates_sharded(tmp_path: Path):
    data = tmp_path / "data"
    part_0 = [["Hello", "_World", "I'm so original"]]
    write_docs(data / "part_0.json", part_0)
    part_1 = [["_Good morning", "_World", "I'm originaler"]]
    write_docs(data / "part_1.json", part_1)

    h = tmp_path / "hashes"
    h.mkdir()
    h0 = FlatHashSet()
    h0.add([str_hash(s.lower()) for doc in part_0 for s in doc])
    h0.add([str_hash("_world")])
    h0.dump(h / "part_0.bin")
    assert {
        str_hash("hello"): False,
        str_hash("_world"): True,
        str_hash("i'm so original"): False,
    } == as_dict(h0)

    h1 = FlatHashSet()
    h1.add([str_hash(s.lower()) for doc in part_1 for s in doc])
    h1.add([str_hash("_good morning")])
    h1.dump(h / "part_1.bin")
    assert {
        str_hash("_good morning"): True,
        str_hash("_world"): False,
        str_hash("i'm originaler"): False,
    } == as_dict(h1)

    res = tmp_path / "res"
    res.mkdir()
    # dedup.DISABLE_MULTI_PROCESSING = True  # Simplifies debugging
    dedup.remove_duplicates_sharded(
        files=[data / "part_0.json", data / "part_1.json"],
        outputs=[res / "part_0.json", res / "part_1.json"],
        field="text",
        hashes_dir=h,
    )

    results_0 = list(jsonql.read_jsons(res / "part_0.json"))
    expected_0 = [
        dict(
            text=text("Hello", "I'm so original"),
            original_nlines=3,
            nlines=2,
            line_ids=[0, 2],
        )
    ]
    assert_documents_equal(expected_0, results_0, ignoring=LENGTHS)

    # First pass removes "_world", second "_good morning".
    results_1 = list(jsonql.read_jsons(res / "part_1.json"))
    expected_1 = [
        dict(text=text("I'm originaler"), original_nlines=3, nlines=1, line_ids=[2])
    ]

    assert_documents_equal(expected_1, results_1, ignoring=LENGTHS)
Example #3
def test_split(tmp_path: Path):
    data = [
        dict(text="Hello world", lang="en"),
        dict(text="Boujour les amis", lang="fr"),
        dict(text="Rock your boat", lang="en"),
    ]
    with jsonql.split(tmp_path / "{lang}.json") as split:
        list(split.map(data))
        summary = split.summary()
    assert "Found 2 splits." in summary
    en_docs = list(jsonql.read_jsons(tmp_path / "en.json"))
    assert [data[0], data[2]] == en_docs

    fr_docs = list(jsonql.read_jsons(tmp_path / "fr.json"))
    assert [data[1]] == fr_docs
Example #4
def make_corpus(file: Path, tags_file: Path = None, output: Path = None) -> None:
    """
    Loads a tags file and creates a training dataset from the given webpages.

    Arguments:
        - file: CC shard file
        - tags_file: dmoz tagging file, (like the one produced by `dl`)
        - output: file where the fastText training corpus is written
    """
    url2tags = load_tags(tags_file)
    with jsonql.smart_open(file) as f, jsonql.smart_open(output, "w") as o:
        for document in jsonql.read_jsons(f):
            if not document:
                continue
            url = document["url"]
            domain = document["source_domain"]

            if url in url2tags:
                tags = url2tags[url]
            elif domain in url2tags:
                tags = url2tags[domain]
            else:
                continue

            if len(tags) == 0:
                continue

            fasttext_tags = ["__label__" + tag for tag in tags]
            content = document["tokenized"].replace("\n", " ").lower()
            if len(content) > 200:
                print(" ".join(fasttext_tags), content, file=o)  # type: ignore
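A hedged invocation sketch (all paths are hypothetical; `jsonql.smart_open` presumably handles the gzipped files transparently, as in the other examples):

from pathlib import Path

# Build a fastText training corpus from one CC shard and a dmoz tag dump.
make_corpus(
    Path("cc_shard_0000.json.gz"),        # hypothetical CC shard file
    tags_file=Path("dmoz_tags.json.gz"),  # hypothetical dmoz tagging file
    output=Path("dmoz_corpus.txt.gz"),    # one "__label__<tag> ... <tokenized text>" line per kept page
)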
Example #5
def test_dedup_fast(tmp_path: Path):
    data = tmp_path / "data"
    part_0 = [["Hello", "_World", "I'm so original"]]
    write_docs(data / "part_0.json", part_0)
    part_1 = [["Good morning", "_World", "I'm originaler"]]
    write_docs(data / "part_1.json", part_1)
    parts = [data / "part_0.json", data / "part_1.json"]

    res = tmp_path / "res"
    res.mkdir()
    h = tmp_path / "hashes.bin"
    field = "text"
    jsonql.run_pipes(dedup.HashesCollector(field, output=h), file=parts)
    for part in parts:
        jsonql.run_pipes(
            dedup.DuplicatesRemover(field, [h]), file=part, output=res / part.name
        )
        jsonql.run_pipes(
            dedup.DuplicatesRemover(field, [h]), file=part, output=res / part.name
        )

    results_0 = list(jsonql.read_jsons(res / "part_0.json"))
    expected_0 = [
        dict(
            text=text("Hello", "I'm so original"),
            original_nlines=3,
            nlines=2,
            line_ids=[0, 2],
        )
    ]
    assert_documents_equal(expected_0, results_0, ignoring=LENGTHS)

    results_1 = list(jsonql.read_jsons(res / "part_1.json"))
    expected_1 = [
        dict(
            text=text("Good morning", "I'm originaler"),
            original_nlines=3,
            nlines=2,
            line_ids=[0, 2],
        )
    ]

    assert_documents_equal(expected_1, results_1, ignoring=LENGTHS)

    words = [w for part in [part_0, part_1] for doc in part for w in doc]
    expected = {str_hash(s.lower()): s.startswith("_") for s in words}
    assert expected == load_hashes(h)
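The same two-pass pattern as a hedged production sketch on hypothetical shard files (the `cc_net.dedup` / `cc_net.jsonql` import path and the "raw_content" field are assumptions):

from pathlib import Path
from cc_net import dedup, jsonql  # assumed package layout

shards = [Path("shard_0000.json.gz"), Path("shard_0001.json.gz")]  # hypothetical inputs
hashes = Path("hashes.bin")

# Pass 1: collect one hash per line of the "raw_content" field across all shards.
jsonql.run_pipes(dedup.HashesCollector("raw_content", output=hashes), file=shards)

# Pass 2: rewrite each shard, dropping lines whose hash was flagged as a duplicate.
for shard in shards:
    jsonql.run_pipes(
        dedup.DuplicatesRemover("raw_content", [hashes]),
        file=shard,
        output=shard.with_name("dedup_" + shard.name),
    )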
Example #6
    def test_split(self):
        tmp = self.get_tmpdir()
        data = [
            dict(text="Hello world", lang="en"),
            dict(text="Boujour les amis", lang="fr"),
            dict(text="Rock your boat", lang="en"),
        ]
        with jsonql.split(tmp("{lang}.json")) as split:
            list(split.map(data))
            summary = split.summary()
        self.assertIn("Found 2 splits.", summary)
        with open(tmp("en.json")) as f_en:
            en_docs = list(jsonql.read_jsons(f_en))
            self.assertEqual([data[0], data[2]], en_docs)

        with open(tmp("fr.json")) as f_fr:
            fr_docs = list(jsonql.read_jsons(f_fr))
            self.assertEqual([data[1]], fr_docs)
Example #7
File: minify.py Project: torshie/cc_net
def unminify_file(file: Union[Path, str], output: Path, cache_dir: Path = None):
    unminifier = Unminifier(cache_dir)
    with jsonql.smart_open(file) as f:
        mini = [m for m in jsonql.read_jsons(f)]
    unminifier.look_for(mini)

    tmp = output.with_name("tmp." + output.name)
    jsonql.run_pipes(unminifier, file=iter(mini), output=tmp)
    shutil.move(tmp, output)
    f_size = Path(file).stat().st_size if Path(file).exists() else 0
    o_size = output.stat().st_size
    mb = 1024 ** 2
    return f"Unminified {output} ({f_size // mb:_}Mb -> {o_size // mb:_}Mb)"
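A hedged call sketch (paths hypothetical; treating `cache_dir` as a local cache for downloaded CommonCrawl data is an assumption):

from pathlib import Path

msg = unminify_file(
    Path("minified_0000.json.gz"),    # hypothetical minified shard
    Path("unminified_0000.json.gz"),  # reconstructed output
    cache_dir=Path("wet_cache"),      # assumed: cache directory for downloaded data
)
print(msg)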
Example #8
def test_blocked_gzip(tmp_path: Path):
    file = tmp_path / "test.gz"
    f = str(file)
    # Each object is 10/11 bytes long. We have 2 of them per block.
    content = ['{"xx": %d}' % i for i in range(80)]
    with jsonql.BlockedGzipWriter(file, "wt", block_size="20B") as o:
        for line in content:
            print(line, file=o)

    jr = jsonql.JsonReader(strict=True)
    expected = list(jr.map(content))
    # read as one file
    assert expected == list(jsonql.read_jsons(file))
    # read first block
    assert expected[:2] == list(jsonql.read_jsons(f + "[0/40]"))
    # read last block
    assert expected[-2:] == list(jsonql.read_jsons(f + "[39/40]"))

    readers = jsonql.get_block_readers(file, 9)
    read_as_several_files = [list(jsonql.read_jsons(r)) for r in readers]
    # 40 splits of 2 docs, 9 readers -> 5 splits, 10 docs per reader
    assert list(jsonql.grouper(expected, 10)) == read_as_several_files
Example #9
def extract_opening_text(source, n_docs: int = 10_000):
    i = 0
    for doc in jsonql.read_jsons(source):
        if not doc:
            continue

        text = doc.get("opening_text")
        if not text:
            continue

        yield text_normalizer.normalize(text)
        i += 1
        if i >= n_docs:
            break
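A small consumption sketch (file names hypothetical), writing one normalized opening text per line, e.g. as language-model training data; `jsonql.smart_open(..., "w")` is used as in Example #4:

with jsonql.smart_open("opening_texts.txt.gz", "w") as o:  # hypothetical output file
    for line in extract_opening_text("cirrus_dump.json.gz", n_docs=1_000):  # hypothetical dump
        print(line, file=o)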
Example #10
File: dedup.py Project: jsedoc/cc_net
def finalize(source, dedup_hashes, min_len):
    n_chars, n_chars_kept = 0, 0
    with open(dedup_hashes, "rb") as hashes:
        for doc in jsonql.read_jsons(source):
            content = doc.get(field)
            if not content or len(content) < min_len:
                continue
            sentences = content.split("\n")
            doc_hashes = np.fromfile(hashes, dtype=HASH_TYPE, count=len(sentences))
            chars, kept_chars = finalize_doc(doc, field, doc_hashes)
            n_chars += chars
            n_chars_kept += kept_chars
            yield doc
    selectivity = n_chars_kept / n_chars if n_chars else 0
    log(f"Kept {n_chars_kept} chars out of {n_chars} ({selectivity:.1%}).")
Example #11
File: dedup.py Project: jsedoc/cc_net
def _dump_sentence_hashes(source: Path, output: Path, field: str):
    treated = 0
    started = time.time()
    with open(output, "wb") as o:
        for doc in jsonql.read_jsons(source):
            content = doc.get(field)
            if not content:
                continue
            h = compute_hashes(content)
            if h is None:
                continue
            h.tofile(o)
            treated += 1
            if treated % 100_000 == 0:
                delay = time.time() - started
                log(f"Computed {treated} documents hashes in {delay / 3600:.2f}h ({treated / delay} doc / s)"
                    )
Example #12
File: dedup.py Project: jsedoc/cc_net
def deduplicate_two_pass(file: jsonql.FileDescriptor,
                         field: str = "raw_content") -> Iterable[dict]:
    """Remove duplicates of the given file (even removing the first occurrence).

    This is what is done in the paper, and in mine.py
    """
    try:
        if isinstance(file, Path):
            hash_file: Path = file.with_suffix(".bin")
        else:
            hash_file = jsonql._tmp(Path("hashes.bin"))
        jsonql.run_pipes(jsonql.JsonReader(),
                         HashesCollector(field, output=hash_file),
                         file=file)
        dup_remover = DuplicatesRemover(field, [hash_file])
        return dup_remover.map(jsonql.read_jsons(file))
    finally:
        if hash_file.exists():
            hash_file.unlink()
Example #13
    def fetch_metadata(self, segment: str) -> None:
        meta_file = self.meta_file(segment)
        k = get_doc_key
        self.metadata = {}
        collision = 0
        for m in jsonql.read_jsons(meta_file):
            key = k(m["digest"])
            if key in self.metadata:
                collision += 1
            self.metadata[key] = m

        self.log(f"Loaded {len(self.metadata)} metadata entries from {meta_file}")
        if collision > 0:
            self._logger.warning(f"Found {collision} collisions!")

        self.segment = segment
        if segment in self._segments:
            self.log("Cache miss")
            self.segments_read_twice += 1
        self._segments.add(segment)
Example #14
def perplexity_to_bin(file: Path, output: Path, models, tok_field: str):
    pp_field = "perplexity"
    lm = DocLM(models, tok_field, output_field=pp_field)
    stats: List[float] = []
    max_stats = 1_000_000
    batch_size = 100_000
    i = 0
    batch = []
    with jsonql.smart_open(file) as f, open(output, "wb") as o:
        for doc in jsonql.read_jsons(f):
            i += 1
            pp = lm(doc)[pp_field]
            if len(stats) < max_stats:
                stats.append(pp)
            batch.append(pp)
            if len(batch) >= batch_size:
                np.array(batch, dtype=np.float32).tofile(o)
                batch = []
        if len(batch) > 0:
            np.array(batch, dtype=np.float32).tofile(o)
Example #15
File: dedup.py Project: jsedoc/cc_net
def deduplicate(file: jsonql.ReadableFileLike,
                field: str = "raw_content") -> Iterable[dict]:
    """Remove duplicates of the given file (but keep the first occurrence)."""
    dup_remover = DuplicatesRemover(field, [], collect=True)
    return dup_remover.map(jsonql.read_jsons(file))
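A minimal consumption sketch (paths hypothetical): the returned iterator yields the cleaned documents, and `jsonql.run_pipes(inputs=..., output=...)` (as in Example #1) serializes them back to a gzipped JSON-lines file:

from pathlib import Path

docs = deduplicate(Path("shard_0000.json.gz"), field="raw_content")  # hypothetical shard
jsonql.run_pipes(inputs=docs, output=Path("shard_0000.dedup.json.gz"))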
Example #16
def _dl_shard(snapshot: str, shard: int) -> Iterator[Paragraph]:
    """
    Download metadata from a shard.

    Sample metadata:

    {
        "cc_segment": "crawl-data/CC-MAIN-2018-51/segments/1544376823009.19/wet/CC-MAIN-20181209185547-20181209211547-00000.warc.wet.gz",
        "digest": "sha1:222LWNHN5FM26XGS7WJSMI6IISTVWBKJ",
        "url": "http://personals.gearplay.com/ads/DRJONES.htm",
        "line_ids": [10],
        "languages": ["en_XX"],
        "lm_scores": [-2.658],
    }
    """
    # Note: keep `snapshot` in its original form so the "2018-51" check below matches.
    norm_snapshot = snapshot.replace("-", "_")
    name = f"snap_{norm_snapshot}_batch_{shard}.json.gz"
    url = "/".join([S3_BUCKET, VERSION, name])
    shard_metadata: Dict[str, Dict[str, dict]] = defaultdict(dict)
    try:
        cache_file: Optional[Path] = None
        if WET_CACHE is not None:
            cache_file = WET_CACHE / name
        metadata_file = jsonql.open_remote_file(url, cache_file)
    except Exception:
        logging.warning(f"Couldn't open {url}")
        return

    for meta in jsonql.read_jsons(metadata_file):
        shard_metadata[meta["cc_segment"]][meta["digest"]] = meta

    found_pars, missed_pars = 0, 0
    for seg, segment_metadata in shard_metadata.items():
        for doc in CCSegmentsReader([seg], cache_dir=WET_CACHE):
            if doc["digest"] not in segment_metadata:
                continue

            meta = segment_metadata[doc["digest"]]
            full_pars = [doc["title"]] + doc["raw_content"].split("\n")

            assert len(meta["line_ids"]) == len(meta["languages"])
            assert len(meta["line_ids"]) == len(meta["lm_scores"])
            for i, lang, score in zip(meta["line_ids"], meta["languages"],
                                      meta["lm_scores"]):
                if snapshot != "2018-51" and lang in BIG_LANGUAGES:
                    # Big languages only come from "2018-51" snapshot
                    continue
                if i >= len(full_pars):
                    # This is because CC100 was created by saving only urls.
                    # Some urls appear in different snapshots with slightly different
                    # versions, and we don't know which one is correct.
                    # Here we read both versions, but some indices may end up
                    # being incorrect.
                    # This impacts ~3% of documents.
                    missed_pars += 1
                    continue

                yield Paragraph(lang, full_pars[i], score)
                found_pars += 1
        if missed_pars > 0:
            logging.warning(
                f"Missed {missed_pars} ({missed_pars / found_pars:%}) paragraphs."
            )
Example #17
def deduplicate(source,
                field,
                hashes=None,
                output_hashes=None,
                add_hashes=True,
                finalize=True):
    """
    DOES TOO MANY THINGS
    Removes duplicate lines found in the field `field` of the source documents.

    Finds duplicate lines based on their hashes. Hashes can either be computed
    while reading the documents or loaded from a binary file.

    If `add_hashes` is set to False, only the given hashes will be considered.
    This gives better control over the memory footprint.
    """
    hash_field = field + "_hash"
    if isinstance(hashes, str) or isinstance(hashes, Path):
        seen = FlatHashSet()
        seen.load(hashes)
    elif hashes is not None:
        seen = hashes
    else:
        seen = FlatHashSet()
    log(f"Loaded {len(seen)} unique hashes.")
    n_doc = 0
    batch_size = 100_000
    n_lines, n_lines_kept = 0, 0
    n_chars, n_chars_kept = 0, 0
    t = time.time()

    def log_stats(start_time):
        end_time = time.time()
        speed = batch_size / (end_time - start_time)

        if add_hashes:
            log(f"Saw {len(seen)} unique hashes over {n_lines} lines in {n_doc} docs. [{speed:.1f} doc/s]"
                )
        else:
            log(f"Processed {n_lines} lines in {n_doc} docs. [{speed:.1f} doc/s]"
                )
        max_mem = mem_footprint_gb()
        log(f"Used up to {max_mem:.1f}GB of RAM.")
        selectivity = n_lines_kept / n_lines if n_lines else 0
        log(f"Kept {n_lines_kept} lines out of {n_lines} ({selectivity:.1%}).")
        if finalize:
            selectivity = n_chars_kept / n_chars if n_chars else 0
            log(f"Kept {n_chars_kept} chars out of {n_chars} ({selectivity:.1%})."
                )

    for doc in jsonql.read_jsons(source):
        n_doc += 1
        if n_doc % batch_size == 0:
            log_stats(t)
            t = time.time()

        hashes = doc.get(hash_field) or compute_hashes(doc.get(field))
        if hashes is None:
            continue
        if isinstance(hashes, list):
            hashes = np.array(hashes, dtype=HASH_TYPE)

        duplicate = seen.__contains__(hashes)
        if add_hashes:
            seen.add(hashes, duplicate)

        keep = duplicate < 1
        kept = keep.sum()
        hashes = hashes * keep
        doc[hash_field] = list(int(x) for x in hashes)
        n_lines += keep.size
        n_lines_kept += kept
        if finalize:
            chars, kept_chars = finalize_doc(doc, field)
            n_chars += chars
            n_chars_kept += kept_chars
        if kept > 0:
            yield doc

    log_stats(t)

    if output_hashes:
        log(f"Dumping {len(seen)} hashes to {output_hashes}.")
        seen.dump(output_hashes)
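A hedged consumption sketch for this older API (paths hypothetical). The function is a generator, so it must be iterated for anything to happen; per the docstring, `output_hashes` dumps the hashes seen here so a later shard can reuse them via the `hashes=` argument:

kept_docs = list(
    deduplicate(
        "shard_0000.json.gz",             # hypothetical input shard
        field="raw_content",
        output_hashes="hashes_0000.bin",  # reusable on the next shard via `hashes=`
    )
)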
Example #18
    def test_remove_duplicates_sharded(self):
        data = self.get_tmpdir()
        part_0 = [["Hello", "_World", "I'm so original"]]
        write_docs(data("part_0.json"), part_0)
        part_1 = [["_Good morning", "_World", "I'm originaler"]]
        write_docs(data("part_1.json"), part_1)

        h = self.get_tmpdir()
        h0 = FlatHashSet()
        h0.add([str_hash(s.lower()) for doc in part_0 for s in doc])
        h0.add([str_hash("_world")])
        h0.dump(h("part_0.bin"))
        self.assertEqual(
            {
                str_hash("hello"): False,
                str_hash("_world"): True,
                str_hash("i'm so original"): False,
            },
            as_dict(h0),
        )

        h1 = FlatHashSet()
        h1.add([str_hash(s.lower()) for doc in part_1 for s in doc])
        h1.add([str_hash("_good morning")])
        h1.dump(h("part_1.bin"))
        self.assertEqual(
            {
                str_hash("_good morning"): True,
                str_hash("_world"): False,
                str_hash("i'm originaler"): False,
            },
            as_dict(h1),
        )

        res = self.get_tmpdir()
        # dedup.DISABLE_MULTI_PROCESSING = True  # Simplifies debugging
        dedup.remove_duplicates_sharded(
            files=[data("part_0.json"),
                   data("part_1.json")],
            outputs=[res("part_0.json"),
                     res("part_1.json")],
            field="text",
            hashes_dir=h(),
        )

        with open(res("part_0.json")) as o:
            lines = o.readlines()
            print(lines)
            results_0 = list(jsonql.read_jsons(lines))
        expected_0 = [
            dict(text=text("Hello", "I'm so original"),
                 original_nlines=3,
                 nlines=2)
        ]
        assert_documents_equal(expected_0, results_0, ignoring=CUMBERSOME)

        with open(res("part_1.json")) as o:
            results_1 = [json.loads(l) for l in o.readlines()]
        # First pass removes "_world", second "_good morning".
        expected_1 = [
            dict(text=text("I'm originaler"), original_nlines=3, nlines=1)
        ]

        assert_documents_equal(expected_1, results_1, ignoring=CUMBERSOME)