Example #1
def test_fetch(http_from_disk, tmp_path: Path):
    mini_docs = [
        {
            "url": "http://sample_chinese.com",
            "digest": "sha1:Y4E6URVYGIAFNVRTPZ5S3J64RTZTP6HJ",
            "cc_segment": "crawl-data/sample.warc.wet",
            "line_ids": encode_line_ids([2]),
            "bucket": "not_that_great",
        },
        {
            "url": "http://sample_english.com",
            "digest": "sha1:XQZHW7QWIG54HVAV3KPRW6MK5ILDNCER",
            "cc_segment": "crawl-data/sample.warc.wet",
            "line_ids": encode_line_ids([3]),
            "bucket": "top_notch",
        },
    ]
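    # Write the minified metadata records to disk: MetadataFetcher(tmp_path)
    # below reads them back and re-attaches them to the documents it restores
    # from the WET segment.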
    with jsonql.open_write(tmp_path / "sample.json") as o:
        for mini in mini_docs:
            print(json.dumps(mini), file=o)

    fetcher = minify.MetadataFetcher(tmp_path)
    cc = process_wet_file.CCSegmentsReader(["crawl-data/sample.warc.wet"])
    docs = [d for d in fetcher.map(cc) if d is not None]
    assert cc.retrieved_segments == 1

    # Note: documents are retrieved as they are ordered in the .warc.wet file
    assert [
        "Facts are stubborn things, but statistics are more pliable.",
        "事實是固執的東西,但統計數字卻比較柔和。",
    ] == [d["raw_content"] for d in docs]
    assert ["top_notch", "not_that_great"] == [d["bucket"] for d in docs]
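The test relies on the http_from_disk fixture and on json, pathlib.Path and the cc_net modules (jsonql, minify, process_wet_file, encode_line_ids) being imported at module level, as in the cc_net test suite. The line_ids field written above is not a plain list: encode_line_ids packs the kept line indices into a compact string that is cheap to store in the metadata JSON. A minimal round-trip sketch, assuming cc_net.minify also exposes a decode_line_ids counterpart (the reverse operation MetadataFetcher needs when it reads the field back):

def test_line_ids_roundtrip():
    # Minimal sketch, assuming decode_line_ids is available next to
    # encode_line_ids in cc_net.minify.
    from cc_net.minify import decode_line_ids, encode_line_ids

    compact = encode_line_ids([2])  # same value as in mini_docs above
    assert isinstance(compact, str)
    assert list(decode_line_ids(compact)) == [2]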
Example #2
def test_minify_and_fetch(http_from_disk, tmp_path: Path):
    full_quotes = """Don't part with your illusions. When they are gone you may still exist, but you have ceased to live.
Education: that which reveals to the wise, and conceals from the stupid, the vast limits of their knowledge.
Facts are stubborn things, but statistics are more pliable.
Fiction is obliged to stick to possibilities. Truth isn't."""
    # We don't need no education.
    chosen_quotes = "\n".join(l for l in full_quotes.splitlines()
                              if "Education" not in l)

    cc_doc = {
        "url": "http://sample_english.com",
        "date_download": "2019-03-18T00:00:00Z",
        "digest": "sha1:XQZHW7QWIG54HVAV3KPRW6MK5ILDNCER",
        "source_domain": "sample_english.com",
        "title": "Famous Mark Twain Quotes",
        "raw_content": full_quotes,
        "cc_segment": "crawl-data/sample.warc.wet",
        "nlines": 4,
        "length": 353,
    }

    ccnet_metadata = {
        "language": "en",
        "language_score": 0.99,
        "perplexity": 151.5,
        "bucket": "head",
        "raw_content": chosen_quotes,
        "nlines": 3,
        "length": len(chosen_quotes),
        "original_nlines": 4,
        "original_length": 353,
        "line_ids": [0, 2, 3],
    }
    ccnet_doc = dict(cc_doc, **ccnet_metadata)
    mini = minify.Minifier()(ccnet_doc.copy())
    assert mini is not ccnet_doc

    important_fields = [
        "url",
        "digest",
        "cc_segment",
        "language",
        "language_score",
        "perplexity",
        "bucket",
        "line_ids",
    ]
    expected = {k: ccnet_doc[k] for k in important_fields}
    expected["line_ids"] = encode_line_ids(
        expected["line_ids"])  # type: ignore
    assert expected == mini

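    # Round trip: persist the minified doc, then let MetadataFetcher rebuild
    # the full ccnet document from the raw CC doc plus the stored metadata.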
    with jsonql.open_write(tmp_path / "sample.json") as o:
        print(json.dumps(mini), file=o)
    fetcher = minify.MetadataFetcher(tmp_path)
    # line_ids is removed when unminifying
    ccnet_doc.pop("line_ids")
    assert ccnet_doc == fetcher(cc_doc)
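The relationship encoded by line_ids can be checked by hand with the data from this test: keeping only the lines whose indices appear in [0, 2, 3] reproduces chosen_quotes. A self-contained illustration in plain Python (no cc_net calls; the test name is chosen only for this sketch):

def test_line_ids_select_kept_lines():
    # Illustrative only: selecting lines [0, 2, 3] of the original text
    # yields exactly the filtered text used as raw_content above.
    full_quotes = """Don't part with your illusions. When they are gone you may still exist, but you have ceased to live.
Education: that which reveals to the wise, and conceals from the stupid, the vast limits of their knowledge.
Facts are stubborn things, but statistics are more pliable.
Fiction is obliged to stick to possibilities. Truth isn't."""
    lines = full_quotes.splitlines()
    assert "\n".join(lines[i] for i in [0, 2, 3]) == "\n".join(
        l for l in lines if "Education" not in l)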
Example #3
def _mine_shard(conf: Config, hashes: List[Path], shard: int,
                output: Path) -> str:
    assert conf.pipeline
    tmp_output = tmp(output)
    if "hashes" in conf.experiments:
        # HACK: used for generating paper figures
        hashes_in_mem = shard
        hashes = hashes[:HASHES_IN_MEM[hashes_in_mem]]
        shard = 0
    cc_shard = conf.get_cc_shard(shard)

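    # Registry of every available step; conf.pipeline decides below which of
    # them actually run, and in which order.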
    steps: Dict[str, Optional[jsonql.Transformer]] = {}
    lang_id = Path("bin") / "lid.bin"
    steps["lid_before_dedup"] = split_by_lang.Classifier(
        model=lang_id,
        field="raw_content",
        out_field="lid_before_dedup",
        top=5)
    steps["dedup"] = dedup.DuplicatesRemover(field="raw_content",
                                             hashes_files=hashes)

    steps["lid"] = split_by_lang.Classifier(
        model=lang_id,
        field="raw_content",
        out_field="language",
        top=1,
        threshold=conf.lang_threshold,
    )
    steps["lid_after_dedup"] = split_by_lang.Classifier(
        model=lang_id, field="raw_content", out_field="lid_after_dedup", top=5)

    if conf.lang_blacklist:
        steps["keep_lang"] = jsonql.where(
            [lambda doc: doc.get("language") not in set(conf.lang_blacklist)])
    elif conf.lang_whitelist:
        steps["keep_lang"] = jsonql.where(
            [lambda doc: doc.get("language") in set(conf.lang_whitelist)])
    else:
        steps["keep_lang"] = None

    tok_field = "tokenized"
    steps["sp"] = perplexity.MultiSentencePiece(
        {l: conf.lm_dir / f"{l}.sp.model"
         for l in conf.get_lm_languages()},
        field="raw_content",
        output_field=tok_field,
        normalize=True,
    )
    steps["lm"] = perplexity.DocLM(
        {l: conf.lm_dir / f"{l}.arpa.bin"
         for l in conf.get_lm_languages()},
        field=tok_field,
        output_field="perplexity",
        normalize=False,  # Normalization is done before SentencePiece
        # load_method=kenlm.LoadMethod.PARALLEL_READ,
    )
    steps["pp_bucket"] = perplexity.PerplexityBucket(CUTOFF_CSV)
    steps["drop"] = perplexity.DropKeys(tok_field)

    steps["keep_bucket"] = None
    if conf.keep_bucket:
        steps["keep_bucket"] = jsonql.where(
            [lambda doc: doc.get("bucket", "all") in conf.keep_bucket])

    if "fetch_metadata" in conf.pipeline:
        # TODO: better default
        assert conf.metadata is not None
        steps["fetch_metadata"] = minify.MetadataFetcher(
            f"{conf.metadata}/{conf.dump}/")

    steps["minify"] = minify.Minifier()

    pattern = str(tmp_output / "{language}_{bucket}.json.gz")
    steps["split_by_lang"] = jsonql.split(pattern=str(pattern), mkdir=True)

    steps["split_by_segment"] = jsonql.split(
        split_fn=lambda doc: _get_segment(tmp_output, doc), mkdir=True)

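    # Keep only the steps named in conf.pipeline, in that order, and drop the
    # ones disabled by being set to None (e.g. "keep_lang" when no language
    # black/whitelist is configured).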
    pipeline = filter(None, (steps[s] for s in conf.pipeline))

    jsonql.run_pipes(
        *pipeline,
        inputs=cc_shard,
        processes=conf.mine_num_processes,
        chunksize=100,
        # The splitter takes care of writing to files.
        output=tmp_output if not conf.will_split else None,
    )
    finalize(tmp_output, output)
    return f"Mined {output}"
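The core pattern in _mine_shard is a named registry of transformers plus an ordered selection taken from the config. A stripped-down sketch of that selection logic with plain callables instead of jsonql transformers (every name below is illustrative, not part of cc_net):

from typing import Callable, Dict, Iterable, Optional

Step = Callable[[dict], Optional[dict]]

def run_toy_pipeline(pipeline: Iterable[str],
                     steps: Dict[str, Optional[Step]],
                     docs: Iterable[dict]) -> Iterable[dict]:
    # Same selection as _mine_shard: configured steps only, None entries skipped.
    selected = [steps[name] for name in pipeline if steps[name] is not None]
    for doc in docs:
        for step in selected:
            doc = step(doc)
            if doc is None:  # a step may drop the document, like jsonql.where
                break
        if doc is not None:
            yield doc

toy_steps: Dict[str, Optional[Step]] = {
    "keep_lang": lambda d: d if d.get("language") == "en" else None,
    "lm": lambda d: dict(d, perplexity=42.0),
    "keep_bucket": None,  # disabled step, filtered out as in _mine_shard
    "minify": lambda d: {k: d[k] for k in ("url", "perplexity")},
}
docs = [{"url": "http://a", "language": "en"},
        {"url": "http://b", "language": "fr"}]
assert list(run_toy_pipeline(["keep_lang", "lm", "keep_bucket", "minify"],
                             toy_steps, docs)) == [{"url": "http://a",
                                                    "perplexity": 42.0}]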