def make_corpus(file: Path, tags_file: Path = None, output: Path = None) -> None:
    """
    Write a fastText-style training file from the documents of one shard.

    A document is kept when its url, or failing that its source domain, has
    at least one tag in the tags file. Each kept document becomes one line:
    its tags prefixed with `__label__`, followed by its "tokenized" text,
    lowercased and with newlines replaced by spaces. Texts of 200 characters
    or fewer are skipped.

    Arguments:
        - file: CC shard file
        - tags_file: dmoz tagging file, (like the one produced by `dl`)
        - output: destination of the training lines, opened with `jsonql.open_write`
    """
    url2tags = load_tags(tags_file)
    with jsonql.open_write(output) as out:
        for doc in jsonql.read_jsons(file):
            if not doc:
                continue
            url, domain = doc["url"], doc["source_domain"]

            # An exact url match wins; the domain is only a fallback.
            for key in (url, domain):
                if key in url2tags:
                    tags = url2tags[key]
                    break
            else:
                # Neither the url nor its domain is tagged: skip the document.
                continue

            if len(tags) == 0:
                continue

            labels = " ".join("__label__" + tag for tag in tags)
            text = doc["tokenized"].replace("\n", " ").lower()
            if len(text) <= 200:
                continue
            print(labels, text, file=out)  # type: ignore
def test_fetch(http_from_disk, tmp_path: Path):
    """Check the `raw_content` and `bucket` of documents yielded by `MetadataFetcher.map`."""
    segment = "crawl-data/sample.warc.wet"
    chinese_doc = {
        "url": "http://sample_chinese.com",
        "digest": "sha1:Y4E6URVYGIAFNVRTPZ5S3J64RTZTP6HJ",
        "cc_segment": segment,
        "line_ids": encode_line_ids([2]),
        "bucket": "not_that_great",
    }
    english_doc = {
        "url": "http://sample_english.com",
        "digest": "sha1:XQZHW7QWIG54HVAV3KPRW6MK5ILDNCER",
        "cc_segment": segment,
        "line_ids": encode_line_ids([3]),
        "bucket": "top_notch",
    }

    # The fetcher is pointed at a folder holding the minified metadata.
    with jsonql.open_write(tmp_path / "sample.json") as out:
        for minified in (chinese_doc, english_doc):
            print(json.dumps(minified), file=out)

    fetcher = minify.MetadataFetcher(tmp_path)
    reader = process_wet_file.CCSegmentsReader([segment])
    fetched = [doc for doc in fetcher.map(reader) if doc is not None]
    assert reader.retrieved_segments == 1

    # Note: documents are retrieved as they are ordered in the .warc.wet file
    contents = [doc["raw_content"] for doc in fetched]
    assert contents == [
        "Facts are stubborn things, but statistics are more pliable.",
        "事實是固執的東西,但統計數字卻比較柔和。",
    ]
    buckets = [doc["bucket"] for doc in fetched]
    assert buckets == ["top_notch", "not_that_great"]
def test_minify_and_fetch(http_from_disk, tmp_path: Path):
    """Round-trip one document through `minify.Minifier` and `minify.MetadataFetcher`.

    The minified document must contain exactly the "important" fields (with
    `line_ids` encoded by `encode_line_ids`), and fetching from the original
    CC document must give back the full ccnet document minus `line_ids`.
    """
    # One quote per line: the fixture below describes a 4-line document
    # ("nlines": 4) from which lines 0, 2 and 3 are kept, so the literal has
    # to contain real line breaks for `splitlines()` to see four lines.
    full_quotes = """Don't part with your illusions. When they are gone you may still exist, but you have ceased to live.
Education: that which reveals to the wise, and conceals from the stupid, the vast limits of their knowledge.
Facts are stubborn things, but statistics are more pliable.
Fiction is obliged to stick to possibilities. Truth isn't."""
    # We don't need no education.
    chosen_quotes = "\n".join(
        line for line in full_quotes.splitlines() if "Education" not in line
    )

    # Document as read from the CC segment.
    cc_doc = {
        "url": "http://sample_english.com",
        "date_download": "2019-03-18T00:00:00Z",
        "digest": "sha1:XQZHW7QWIG54HVAV3KPRW6MK5ILDNCER",
        "source_domain": "sample_english.com",
        "title": "Famous Mark Twain Quotes",
        "raw_content": full_quotes,
        "cc_segment": "crawl-data/sample.warc.wet",
        "nlines": 4,
        "length": 353,
    }

    # Extra fields that override or extend `cc_doc` in the full ccnet document.
    ccnet_metadata = {
        "language": "en",
        "language_score": 0.99,
        "perplexity": 151.5,
        "bucket": "head",
        "raw_content": chosen_quotes,
        "nlines": 3,
        "length": len(chosen_quotes),
        "original_nlines": 4,
        "original_length": 353,
        "line_ids": [0, 2, 3],
    }
    ccnet_doc = dict(cc_doc, **ccnet_metadata)

    # Minify a copy so that `ccnet_doc` stays available for the final comparison.
    mini = minify.Minifier()(ccnet_doc.copy())
    assert mini is not ccnet_doc

    important_fields = [
        "url",
        "digest",
        "cc_segment",
        "language",
        "language_score",
        "perplexity",
        "bucket",
        "line_ids",
    ]
    expected = {k: ccnet_doc[k] for k in important_fields}
    expected["line_ids"] = encode_line_ids(expected["line_ids"])  # type: ignore
    assert expected == mini

    with jsonql.open_write(tmp_path / "sample.json") as o:
        print(json.dumps(mini), file=o)
    fetcher = minify.MetadataFetcher(tmp_path)
    # line_ids is removed when unminifying
    ccnet_doc.pop("line_ids")
    assert ccnet_doc == fetcher(cc_doc)
def test_open_read_write(tmp_path: Path):
    """Write through `jsonql.open_write` / `jsonql.write_jsons` and read back with `jsonql.lines`."""

    def read_back(filename: Path) -> Sequence[str]:
        # jsonql.lines calls open_read
        return list(jsonql.lines(filename))

    zeros = "0" * 2000
    ones = "1" * 2000

    # Single .txt target.
    with jsonql.open_write(tmp_path / "a.txt") as out:
        print("a", file=out)
    assert read_back(tmp_path / "a.txt") == ["a"]

    # write_jsons on the same path: only the json line is read back.
    jsonql.write_jsons([{"a": 1}], tmp_path / "a.txt")
    assert read_back(tmp_path / "a.txt") == ['{"a": 1}']

    # Single .gz target.
    with jsonql.open_write(tmp_path / "a.gz") as out:
        print("a", file=out)
    assert read_back(tmp_path / "a.gz") == ["a"]

    # Explicit list of targets, no size limit: the second file is never created.
    with jsonql.open_write([tmp_path / "a0.txt", tmp_path / "a1.txt"]) as out:
        print("a", file=out)
    assert read_back(tmp_path / "a0.txt") == ["a"]
    assert not (tmp_path / "a1.txt").is_file()

    # Explicit list of targets with max_size="1k": each 2000-char line gets its own file.
    with jsonql.open_write(
        [tmp_path / "b0.txt", tmp_path / "b1.txt"], max_size="1k"
    ) as out:
        print(zeros, file=out)
        print(ones, file=out)
    assert read_back(tmp_path / "b0.txt") == [zeros]
    assert read_back(tmp_path / "b1.txt") == [ones]

    # "?" pattern, no size limit: only the 0000 file exists afterwards.
    with jsonql.open_write(tmp_path / "a_????.json") as out:
        print("a", file=out)
    assert read_back(tmp_path / "a_0000.json") == ["a"]
    assert not (tmp_path / "a_0001.json").is_file()
    assert read_back(tmp_path / "a_*.json") == ["a"]

    # "?" pattern with max_size="1k": lines land in b_00 and b_01, and the glob reads both.
    with jsonql.open_write(tmp_path / "b_??.json", max_size="1k") as out:
        print(zeros, file=out)
        print(ones, file=out)
    assert read_back(tmp_path / "b_00.json") == [zeros]
    assert read_back(tmp_path / "b_01.json") == [ones]
    assert read_back(tmp_path / "b_*.json") == [zeros, ones]
def __init__(self, file: Path) -> None:
    """Store `file`, compute `_tmp(file)` and open that path for writing.

    Arguments:
        - file: path given by the caller; kept as `self.file`.
    """
    self.file = file
    # NOTE(review): `_tmp` is defined elsewhere; presumably it derives a
    # temporary path from `file` - confirm against its definition.
    self.tmp_file = _tmp(file)
    # We don't want to make FileWriterWithTmp a ContextManager
    # so the writer returned by `open_write` is entered by hand here; nothing
    # in this method calls the matching `__exit__`.
    self.handle = open_write(self.tmp_file).__enter__()