Example #1
def make_corpus(file: Path,
                tags_file: Path = None,
                output: Path = None) -> None:
    """
    Loads a tags file and creates a training dataset from the given webpages.

    Arguments:
        - file: CC shard file
        - tags_file: dmoz tagging file (like the one produced by `dl`)
        - output: file where the resulting fastText training corpus is written
    """
    url2tags = load_tags(tags_file)
    with jsonql.open_write(output) as o:
        for document in jsonql.read_jsons(file):
            if not document:
                continue
            url = document["url"]
            domain = document["source_domain"]

            if url in url2tags:
                tags = url2tags[url]
            elif domain in url2tags:
                tags = url2tags[domain]
            else:
                continue

            if len(tags) == 0:
                continue

            fasttext_tags = ["__label__" + tag for tag in tags]
            content = document["tokenized"].replace("\n", " ").lower()
            if len(content) > 200:
                print(" ".join(fasttext_tags), content, file=o)  # type: ignore
Example #2
def test_fetch(http_from_disk, tmp_path: Path):
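    # Write two minified documents (url, digest, line_ids, bucket) to disk, then
    # check that MetadataFetcher re-attaches them to the text of the original
    # WET segment: one Chinese and one English document.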
    mini_docs = [
        {
            "url": "http://sample_chinese.com",
            "digest": "sha1:Y4E6URVYGIAFNVRTPZ5S3J64RTZTP6HJ",
            "cc_segment": "crawl-data/sample.warc.wet",
            "line_ids": encode_line_ids([2]),
            "bucket": "not_that_great",
        },
        {
            "url": "http://sample_english.com",
            "digest": "sha1:XQZHW7QWIG54HVAV3KPRW6MK5ILDNCER",
            "cc_segment": "crawl-data/sample.warc.wet",
            "line_ids": encode_line_ids([3]),
            "bucket": "top_notch",
        },
    ]
    with jsonql.open_write(tmp_path / "sample.json") as o:
        for mini in mini_docs:
            print(json.dumps(mini), file=o)

    fetcher = minify.MetadataFetcher(tmp_path)
    cc = process_wet_file.CCSegmentsReader(["crawl-data/sample.warc.wet"])
    docs = [d for d in fetcher.map(cc) if d is not None]
    assert cc.retrieved_segments == 1

    # Note: documents are retrieved in the order they appear in the .warc.wet file
    assert [
        "Facts are stubborn things, but statistics are more pliable.",
        "事實是固執的東西,但統計數字卻比較柔和。",
    ] == [d["raw_content"] for d in docs]
    assert ["top_notch", "not_that_great"] == [d["bucket"] for d in docs]
Example #3
def test_minify_and_fetch(http_from_disk, tmp_path: Path):
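    # Round-trip test: Minifier should reduce a processed document to the handful
    # of fields needed to rebuild it, and MetadataFetcher should then restore the
    # full document from the original CC segment plus that minified metadata.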
    full_quotes = """Don't part with your illusions. When they are gone you may still exist, but you have ceased to live.
Education: that which reveals to the wise, and conceals from the stupid, the vast limits of their knowledge.
Facts are stubborn things, but statistics are more pliable.
Fiction is obliged to stick to possibilities. Truth isn't."""
    # We don't need no education.
    chosen_quotes = "\n".join(l for l in full_quotes.splitlines()
                              if "Education" not in l)

    cc_doc = {
        "url": "http://sample_english.com",
        "date_download": "2019-03-18T00:00:00Z",
        "digest": "sha1:XQZHW7QWIG54HVAV3KPRW6MK5ILDNCER",
        "source_domain": "sample_english.com",
        "title": "Famous Mark Twain Quotes",
        "raw_content": full_quotes,
        "cc_segment": "crawl-data/sample.warc.wet",
        "nlines": 4,
        "length": 353,
    }

    ccnet_metadata = {
        "language": "en",
        "language_score": 0.99,
        "perplexity": 151.5,
        "bucket": "head",
        "raw_content": chosen_quotes,
        "nlines": 3,
        "length": len(chosen_quotes),
        "original_nlines": 4,
        "original_length": 353,
        "line_ids": [0, 2, 3],
    }
    ccnet_doc = dict(cc_doc, **ccnet_metadata)
    mini = minify.Minifier()(ccnet_doc.copy())
    assert mini is not ccnet_doc

    important_fields = [
        "url",
        "digest",
        "cc_segment",
        "language",
        "language_score",
        "perplexity",
        "bucket",
        "line_ids",
    ]
    expected = {k: ccnet_doc[k] for k in important_fields}
    expected["line_ids"] = encode_line_ids(
        expected["line_ids"])  # type: ignore
    assert expected == mini

    with jsonql.open_write(tmp_path / "sample.json") as o:
        print(json.dumps(mini), file=o)
    fetcher = minify.MetadataFetcher(tmp_path)
    # line_ids is removed when unminifying
    ccnet_doc.pop("line_ids")
    assert ccnet_doc == fetcher(cc_doc)
Example #4
def test_open_read_write(tmp_path: Path):
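    # open_write should transparently handle plain text files, gzip output, lists
    # of output files, size-based splitting via max_size, and "?" patterns that are
    # expanded into numbered shards; jsonql.lines / open_read reads them all back.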
    def _lines(filename: Path) -> Sequence[str]:
        # jsonql.lines calls open_read
        return list(jsonql.lines(filename))

    tmp = tmp_path
    with jsonql.open_write(tmp / "a.txt") as o:
        print("a", file=o)
    assert _lines(tmp / "a.txt") == ["a"]

    jsonql.write_jsons([{"a": 1}], tmp / "a.txt")
    assert _lines(tmp / "a.txt") == ['{"a": 1}']

    with jsonql.open_write(tmp / "a.gz") as o:
        print("a", file=o)
    assert _lines(tmp / "a.gz") == ["a"]

    with jsonql.open_write([tmp / "a0.txt", tmp / "a1.txt"]) as o:
        print("a", file=o)
    assert _lines(tmp / "a0.txt") == ["a"]
    assert not (tmp / "a1.txt").is_file()

    with jsonql.open_write([tmp / "b0.txt", tmp / "b1.txt"],
                           max_size="1k") as o:
        print("0" * 2000, file=o)
        print("1" * 2000, file=o)
    assert _lines(tmp / "b0.txt") == ["0" * 2000]
    assert _lines(tmp / "b1.txt") == ["1" * 2000]

    with jsonql.open_write(tmp / "a_????.json") as o:
        print("a", file=o)
    assert _lines(tmp / "a_0000.json") == ["a"]
    assert not (tmp / "a_0001.json").is_file()
    assert _lines(tmp / "a_*.json") == ["a"]

    with jsonql.open_write(tmp / "b_??.json", max_size="1k") as o:
        print("0" * 2000, file=o)
        print("1" * 2000, file=o)
    assert _lines(tmp / "b_00.json") == ["0" * 2000]
    assert _lines(tmp / "b_01.json") == ["1" * 2000]
    assert _lines(tmp / "b_*.json") == ["0" * 2000, "1" * 2000]
Example #5
def __init__(self, file: Path):
    self.file = file
    self.tmp_file = _tmp(file)
    # We don't want to make FileWriterWithTmp a ContextManager
    self.handle = open_write(self.tmp_file).__enter__()
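Opening the handle on `self.tmp_file` keeps readers from ever seeing a partially written file. A minimal sketch of how such a writer might be finalized, assuming the temporary file is simply closed and renamed into place (this `close` is illustrative, not necessarily the class's real method):

def close(self) -> None:
    # Illustrative only: close the underlying handle, then move the completed
    # tmp file to its final path so the write appears atomic to readers.
    self.handle.close()
    self.tmp_file.rename(self.file)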