def test_to_file_write_string_arg(self):
    """Write through a sink built from a ``str`` path and read the text back."""
    # TemporaryDirectory removes the directory even when an assertion or an
    # I/O call raises, so a failing run does not leave stray files behind.
    with tempfile.TemporaryDirectory() as tmp_dir:
        file_path = Path(tmp_dir) / "test.txt"

        sink = CharSink.to_file(str(file_path))
        sink.write("hello\n\nworld\n")

        source = CharSource.from_file(str(file_path))
        self.assertEqual("hello\n\nworld\n", source.read_all())
def test_to_file_open(self):
    """Write through the handle returned by ``open()`` and read the text back."""
    # TemporaryDirectory removes the directory even when an assertion or an
    # I/O call raises, so a failing run does not leave stray files behind.
    with tempfile.TemporaryDirectory() as tmp_dir:
        file_path = Path(tmp_dir) / "test.txt"

        with CharSink.to_file(file_path).open() as out:
            out.write("hello\n\nworld\n")

        source = CharSource.from_file(file_path)
        self.assertEqual("hello\n\nworld\n", source.read_all())
def binary_from_doc_id_to_file_map(
    map_file: Union[str, Path, CharSource]
) -> "KeyValueSource[str, bytes]":
    """Wrap a doc-id-to-file map in a ``_PathMappingBytesKeyValueSource``.

    Args:
        map_file: Either a ``CharSource`` holding the map, or a path (``str``
            or ``Path``) that is first turned into one with
            ``CharSource.from_file``.

    Returns:
        The key-value source built from the mapping that
        ``read_doc_id_to_file_map`` loads from ``map_file``.
    """
    if isinstance(map_file, CharSource):
        map_source = map_file
    else:
        map_source = CharSource.from_file(map_file)

    doc_id_to_path = read_doc_id_to_file_map(map_source)
    return _PathMappingBytesKeyValueSource(doc_id_to_path)  # type: ignore
def test_from_gzip_file(self):
    """Check ``read_all``, ``readlines`` and ``open`` on a gzipped fixture."""
    fixture = Path(__file__).parent / "gzip_char_source_test.txt.gz"
    source = CharSource.from_gzipped_file(fixture)

    self.assertEqual("Hello\nworld\n", source.read_all())
    self.assertEqual(["Hello", "world"], source.readlines())

    # Reading line by line through an opened handle keeps the newlines.
    with source.open() as handle:
        for expected_line in ("Hello\n", "world\n"):
            self.assertEqual(expected_line, handle.readline())
def test_wrap(self):
    """Check a source built from an in-memory string."""
    text = "Hello\nworld"
    from_string = CharSource.from_string(text)

    self.assertEqual("Hello\nworld", from_string.read_all())
    self.assertEqual(["Hello", "world"], from_string.readlines())
    self.assertFalse(from_string.is_empty())

    # The wrapped text has no trailing newline, so the last line has none.
    with from_string.open() as handle:
        for expected_line in ("Hello\n", "world"):
            self.assertEqual(expected_line, handle.readline())
def test_from_within_tgz_file(self):
    """Read one member out of the ``.tgz`` fixture next to this module."""
    # Archive fixture and the member inside it that the test reads.
    archive = Path(__file__).parent / "test_read_from_tar.tgz"
    member = "./hello/world"

    source = CharSource.from_file_in_tgz(archive, member, "utf-8")
    self.assertEqual("hello\nworld\n", source.read_all())
def test_from_file(self):
    """Check ``read_all``, ``readlines``, ``is_empty`` and ``open`` on a file."""
    fixture = Path(__file__).parent / "char_source_test.txt"
    source = CharSource.from_file(fixture)

    self.assertEqual("Hello\nworld\n", source.read_all())
    self.assertEqual(["Hello", "world"], source.readlines())
    self.assertFalse(source.is_empty())

    # Reading line by line through an opened handle keeps the newlines.
    with source.open() as handle:
        for expected_line in ("Hello\n", "world\n"):
            self.assertEqual(expected_line, handle.readline())
def test_read_write_doc_id_to_file_map(self):
    """Round-trip a doc-id-to-path mapping through its text serialization."""
    original = ImmutableDict.of(
        [("foo", Path("/home/foo")), ("bar", Path("/home/bar"))]
    )

    sink = CharSink.to_string()
    write_doc_id_to_file_map(original, sink)
    serialized = sink.last_string_written

    # The written order differs from insertion order: "bar" precedes "foo".
    self.assertEqual("bar\t/home/bar\nfoo\t/home/foo\n", serialized)

    round_tripped = read_doc_id_to_file_map(CharSource.from_string(serialized))
    self.assertEqual(original, round_tripped)
def test_file_in_zip(self):
    """Write members into a zip and read them back by object and by path."""
    # TemporaryDirectory removes the directory even when an assertion or an
    # I/O call raises, so a failing run does not leave stray files behind.
    with tempfile.TemporaryDirectory() as tmp_dir:
        zip_path = Path(tmp_dir) / "test.zip"

        ByteSink.file_in_zip(zip_path, "fred").write("foo".encode("utf-8"))
        ByteSink.file_in_zip(zip_path, "empty_file").write("".encode("utf-8"))

        with ZipFile(zip_path, "r") as zip_file:
            # assertIn reports the actual name list when the check fails.
            self.assertIn("fred", zip_file.namelist())
            self.assertEqual("foo".encode("utf-8"), zip_file.read("fred"))
            self.assertEqual(
                "foo", CharSource.from_file_in_zip(zip_file, "fred").read_all()
            )
            self.assertTrue(
                CharSource.from_file_in_zip(zip_file, "empty_file").is_empty()
            )

        # also test version which takes zip file path rather than zip file object
        self.assertEqual(
            "foo", CharSource.from_file_in_zip(zip_path, "fred").read_all()
        )
        self.assertTrue(
            CharSource.from_file_in_zip(zip_path, "empty_file").is_empty()
        )
def zipfile_to_documents(
    corpus_zipfile: ZipFile, prefix: Optional[str]
) -> List[Tuple[str, str, str, str]]:
    """Extract the raw text of every LTF entry found in a corpus archive.

    Every entry directly under ``<prefix>data/ltf/`` is read as a nested zip
    archive, and each member of that nested archive is converted with
    ``convert_ltf_to_raw_text``.

    Args:
        corpus_zipfile: The opened corpus archive.
        prefix: Path prefix inside the archive. When ``None`` it falls back to
            ``get_root_dir_name(corpus_zipfile)``, or ``""`` if that is falsy.

    Returns:
        One ``(doc_id, doceid, lang_id, raw_text)`` tuple per nested member,
        where ``doceid`` is the member's file name up to its first ``"."``.

    Raises:
        RuntimeError: If no ``parent_children.tab`` is found in the archive.
        FileNotFoundError: If processing a nested member raises
            ``AttributeError``; the original error is chained as the cause.
    """
    print(f"Reading .ltf documents in {corpus_zipfile.filename}")

    if prefix is None:
        prefix = get_root_dir_name(corpus_zipfile) or ""

    # The prefix is a literal archive path (it is also used verbatim for the
    # data directory below), so escape it rather than letting characters such
    # as "." or "+" be interpreted as regex syntax.
    parent_children_path = _find_name_in_zip(
        corpus_zipfile,
        re.compile(re.escape(f"{prefix}docs/parent_children.tab")),
    )
    if not parent_children_path:
        raise RuntimeError("Archive lacks parent_children.tab")

    parent_children_tab = _read_tab_file(
        CharSource.from_file_in_zip(corpus_zipfile, parent_children_path)
    )
    child_to_parent_map = _create_child_to_parent_map(parent_children_tab)
    child_to_lang_map = _create_child_to_lang_map(parent_children_tab)

    documents = []
    text_dir = ZipPath(corpus_zipfile, at=f"{prefix}data/ltf/")

    for source_doc_path in text_dir.iterdir():
        # Close each nested archive as soon as its members have been read.
        with ZipFile(io.BytesIO(source_doc_path.read_bytes())) as source_doc_zip:
            for source_info in tqdm(
                source_doc_zip.infolist(),
                desc=f"Extracting {source_doc_path.name}",
                bar_format="{l_bar}{bar:20}{r_bar}",
            ):
                doceid_path = ZipPath(source_doc_zip, at=source_info.filename)
                try:
                    doceid = doceid_path.name.split(".")[0]
                    doc_id = child_to_parent_map[doceid]
                    lang_id = child_to_lang_map[doceid]
                    raw_text = convert_ltf_to_raw_text(
                        doceid_path.read_text(encoding="utf-8")
                    )
                    documents.append((doc_id, doceid, lang_id, raw_text))
                except AttributeError as err:
                    # Keep the underlying error attached for debugging.
                    raise FileNotFoundError(
                        f"Could not read from {doceid_path}."
                    ) from err

    return documents
def get_text_docs(corpus_zipfile: ZipFile) -> ImmutableDict[str, str]:
    """Map parent document ids to the raw text of their LTF entries.

    Every entry directly under ``<prefix>data/ltf/`` is read as a nested zip
    archive, and each member of that nested archive is converted with
    ``convert_ltf_to_raw_text``. ``prefix`` is
    ``get_root_dir_name(corpus_zipfile)``, or ``""`` if that is falsy.

    Args:
        corpus_zipfile: The opened corpus archive.

    Returns:
        An immutable mapping from ``doc_id`` to raw text. When several members
        resolve to the same ``doc_id``, the last one processed wins.

    Raises:
        RuntimeError: If no ``parent_children.tab`` is found in the archive.
        FileNotFoundError: If processing a nested member raises
            ``AttributeError``; the original error is chained as the cause.
    """
    print(f"Reading .ltf documents in {corpus_zipfile.filename}")

    prefix = get_root_dir_name(corpus_zipfile) or ""

    # The prefix is a literal archive path, so escape it rather than letting
    # characters such as "." or "+" be interpreted as regex syntax.
    parent_children_path = _find_name_in_zip(
        corpus_zipfile,
        re.compile(re.escape(f"{prefix}docs/parent_children.tab")),
    )
    if not parent_children_path:
        raise RuntimeError("Archive lacks parent_children.tab")

    parent_children_tab = _read_tab_file(
        CharSource.from_file_in_zip(corpus_zipfile, parent_children_path)
    )
    child_to_parent_map = _create_child_to_parent_map(parent_children_tab)

    text_docs = {}
    # Look under the same root prefix as the parent_children.tab lookup above;
    # an unprefixed "data/ltf/" misses archives that have a root directory.
    text_dir = ZipPath(corpus_zipfile, at=f"{prefix}data/ltf/")

    for source_doc_path in text_dir.iterdir():
        # Close each nested archive as soon as its members have been read.
        with ZipFile(io.BytesIO(source_doc_path.read_bytes())) as source_doc_zip:
            for source_info in tqdm(
                source_doc_zip.infolist(),
                desc=f"Extracting {source_doc_path.name}",
                bar_format="{l_bar}{bar:20}{r_bar}",
            ):
                doc = ZipPath(source_doc_zip, at=source_info.filename)
                try:
                    doceid = doc.name.split(".")[0]
                    doc_id = child_to_parent_map[doceid]
                    text_docs[doc_id] = convert_ltf_to_raw_text(
                        doc.read_text(encoding="utf-8")
                    )
                except AttributeError as err:
                    # Keep the underlying error attached for debugging.
                    raise FileNotFoundError(f"Could not read from {doc}.") from err

    return immutabledict(text_docs)
def test_empty_gzip(self):
    """Check that an empty gzipped fixture reports empty and reads as ``""``."""
    fixture = Path(__file__).parent / "empty_gzip.txt.gz"
    empty_gzip = CharSource.from_gzipped_file(fixture)

    self.assertTrue(empty_gzip.is_empty())
    self.assertEqual("", empty_gzip.read_all())
def test_empty(self):
    """Check that ``CharSource.from_nowhere()`` behaves as an empty source."""
    nowhere = CharSource.from_nowhere()

    self.assertEqual("", nowhere.read_all())
    self.assertEqual([], nowhere.readlines())
    self.assertTrue(nowhere.is_empty())
def _read_tab_file(tab_file_source: CharSource) -> pd.DataFrame: """Read a tab-delimited file in to a Pandas DataFrame.""" with tab_file_source.open() as tab_file: return pd.read_csv(tab_file, sep="\t", encoding="utf-8")
def __getitem__(self, key: str) -> str:
    """Return the full text of the file that ``id_to_path`` maps ``key`` to."""
    path = self.id_to_path[key]
    source = CharSource.from_file(path)
    return source.read_all()