예제 #1
0
 def test_to_file_write_string_arg(self):
     """Write through a sink built from a ``str`` path and read the text back.

     The scratch directory is managed by ``tempfile.TemporaryDirectory`` so
     it is removed even when an assertion below fails.
     """
     with tempfile.TemporaryDirectory() as tmp_dir:
         file_path = Path(tmp_dir) / "test.txt"
         # Both factories are deliberately given a plain string path here.
         sink = CharSink.to_file(str(file_path))
         sink.write("hello\n\nworld\n")
         source = CharSource.from_file(str(file_path))
         self.assertEqual("hello\n\nworld\n", source.read_all())
예제 #2
0
 def test_to_file_open(self):
     """Write through the handle returned by ``open()`` and read the text back.

     The scratch directory is managed by ``tempfile.TemporaryDirectory`` so
     it is removed even when an assertion below fails.
     """
     with tempfile.TemporaryDirectory() as tmp_dir:
         file_path = Path(tmp_dir) / "test.txt"
         # Both factories are deliberately given a ``Path`` object here.
         with CharSink.to_file(file_path).open() as out:
             out.write("hello\n\nworld\n")
         source = CharSource.from_file(file_path)
         self.assertEqual("hello\n\nworld\n", source.read_all())
예제 #3
0
 def binary_from_doc_id_to_file_map(
     map_file: Union[str, Path, CharSource]
 ) -> "KeyValueSource[str, bytes]":
     """Build a bytes key-value source from a doc-id-to-file map.

     ``map_file`` may already be a ``CharSource``; any other value is first
     handed to ``CharSource.from_file``.  The map is then read with
     ``read_doc_id_to_file_map`` and wrapped in
     ``_PathMappingBytesKeyValueSource``.
     """
     if isinstance(map_file, CharSource):
         map_source = map_file
     else:
         map_source = CharSource.from_file(map_file)
     doc_id_to_path = read_doc_id_to_file_map(map_source)
     return _PathMappingBytesKeyValueSource(doc_id_to_path)  # type: ignore
예제 #4
0
 def test_from_gzip_file(self):
     """Read the gzipped fixture whole, as a list of lines, and via a handle."""
     gz_path = Path(__file__).parent / "gzip_char_source_test.txt.gz"
     source = CharSource.from_gzipped_file(gz_path)

     self.assertEqual("Hello\nworld\n", source.read_all())
     self.assertEqual(["Hello", "world"], source.readlines())

     # Reading through the handle keeps the line terminators.
     with source.open() as handle:
         for expected_line in ("Hello\n", "world\n"):
             self.assertEqual(expected_line, handle.readline())
예제 #5
0
 def test_wrap(self):
     """A string-backed source exposes its text whole, by line, and via a handle."""
     string_source = CharSource.from_string("Hello\nworld")

     self.assertEqual("Hello\nworld", string_source.read_all())
     self.assertEqual(["Hello", "world"], string_source.readlines())
     self.assertFalse(string_source.is_empty())

     # The wrapped text has no trailing newline, so neither does the last line.
     with string_source.open() as handle:
         for expected_line in ("Hello\n", "world"):
             self.assertEqual(expected_line, handle.readline())
예제 #6
0
 def test_from_within_tgz_file(self):
     """Read one member of the fixture ``.tgz`` archive and check its text."""
     # The archive is a fixture stored next to this test module.
     archive_path = Path(__file__).parent / "test_read_from_tar.tgz"
     member_name = "./hello/world"
     source = CharSource.from_file_in_tgz(archive_path, member_name, "utf-8")
     self.assertEqual("hello\nworld\n", source.read_all())
예제 #7
0
 def test_from_file(self):
     """Read the plain-text fixture whole, as a list of lines, and via a handle."""
     fixture_path = Path(__file__).parent / "char_source_test.txt"
     source = CharSource.from_file(fixture_path)

     self.assertEqual("Hello\nworld\n", source.read_all())
     self.assertEqual(["Hello", "world"], source.readlines())
     self.assertFalse(source.is_empty())

     # Reading through the handle keeps the line terminators.
     with source.open() as handle:
         for expected_line in ("Hello\n", "world\n"):
             self.assertEqual(expected_line, handle.readline())
예제 #8
0
    def test_read_write_doc_id_to_file_map(self):
        """Round-trip a doc-id-to-path mapping through an in-memory string sink."""
        entries = [
            ("foo", Path("/home/foo")),
            ("bar", Path("/home/bar")),
        ]
        mapping = ImmutableDict.of(entries)

        string_sink = CharSink.to_string()
        write_doc_id_to_file_map(mapping, string_sink)
        serialized = string_sink.last_string_written

        # note the reordering because it alphabetizes the docids
        self.assertEqual("bar\t/home/bar\nfoo\t/home/foo\n", serialized)

        reloaded_map = read_doc_id_to_file_map(
            CharSource.from_string(serialized))
        self.assertEqual(mapping, reloaded_map)
예제 #9
0
    def test_file_in_zip(self):
        """Write zip members with ``ByteSink`` and read them back with ``CharSource``.

        Members are read both through an open ``ZipFile`` object and through
        the path of the zip file.  The scratch directory is managed by
        ``tempfile.TemporaryDirectory`` so it is removed even when an
        assertion below fails.
        """
        with tempfile.TemporaryDirectory() as tmp_dir:
            zip_path = Path(tmp_dir) / "test.zip"

            ByteSink.file_in_zip(zip_path, "fred").write("foo".encode("utf-8"))
            ByteSink.file_in_zip(zip_path, "empty_file").write(
                "".encode("utf-8"))

            with ZipFile(zip_path, "r") as zip_file:
                self.assertIn("fred", zip_file.namelist())
                self.assertEqual("foo".encode("utf-8"), zip_file.read("fred"))
                self.assertEqual(
                    "foo",
                    CharSource.from_file_in_zip(zip_file, "fred").read_all())
                self.assertTrue(
                    CharSource.from_file_in_zip(
                        zip_file, "empty_file").is_empty())

            # also test version which takes zip file path rather than zip
            # file object
            self.assertEqual(
                "foo",
                CharSource.from_file_in_zip(zip_path, "fred").read_all())
            self.assertTrue(
                CharSource.from_file_in_zip(zip_path, "empty_file").is_empty())
예제 #10
0
파일: core.py 프로젝트: isi-vista/aida-viz
def zipfile_to_documents(
        corpus_zipfile: ZipFile,
        prefix: Optional[str]) -> List[Tuple[str, str, str, str]]:
    """Extract the LTF documents stored in a corpus archive.

    Args:
        corpus_zipfile: Open corpus archive.  It is expected to contain
            ``{prefix}docs/parent_children.tab`` and a ``{prefix}data/ltf/``
            directory whose entries are themselves zip archives.
        prefix: Path prefix of the corpus inside the archive.  When ``None``
            it is taken from ``get_root_dir_name`` (falling back to ``""``).

    Returns:
        One ``(doc_id, doceid, lang_id, raw_text)`` tuple per entry of the
        nested archives, in iteration order.

    Raises:
        RuntimeError: if ``parent_children.tab`` cannot be found.
        FileNotFoundError: if processing an entry raises ``AttributeError``;
            the original error is attached as the cause.
    """
    print(f"Reading .ltf documents in {corpus_zipfile.filename}")

    if prefix is None:
        prefix = get_root_dir_name(corpus_zipfile) or ""

    # The path is a literal, so escape it: an unescaped "." (or any regex
    # metacharacter inside the prefix) would otherwise act as a pattern.
    parent_children_path = _find_name_in_zip(
        corpus_zipfile,
        re.compile(re.escape(f"{prefix}docs/parent_children.tab")))

    if not parent_children_path:
        raise RuntimeError("Archive lacks parent_children.tab")

    parent_children_tab = _read_tab_file(
        CharSource.from_file_in_zip(corpus_zipfile, parent_children_path))

    child_to_parent_map = _create_child_to_parent_map(parent_children_tab)
    child_to_lang_map = _create_child_to_lang_map(parent_children_tab)

    documents = []
    text_dir = ZipPath(corpus_zipfile, at=f"{prefix}data/ltf/")

    for source_doc_path in text_dir.iterdir():
        # Each entry is a nested archive held in memory; close it as soon as
        # all of its members have been read.
        nested_bytes = io.BytesIO(source_doc_path.read_bytes())
        with ZipFile(nested_bytes) as source_doc_zip:
            for source_info in tqdm(
                    source_doc_zip.infolist(),
                    desc=f"Extracting {source_doc_path.name}",
                    bar_format="{l_bar}{bar:20}{r_bar}",
            ):
                doceid_path = ZipPath(source_doc_zip, at=source_info.filename)
                try:
                    # The document element id is the file name up to its
                    # first dot.
                    doceid = doceid_path.name.split(".")[0]
                    doc_id = child_to_parent_map[doceid]
                    lang_id = child_to_lang_map[doceid]
                    raw_text = convert_ltf_to_raw_text(
                        doceid_path.read_text(encoding="utf-8"))

                    documents.append((doc_id, doceid, lang_id, raw_text))

                except AttributeError as err:
                    raise FileNotFoundError(
                        f"Could not read from {doceid_path}.") from err

    return documents
예제 #11
0
파일: corpus.py 프로젝트: TonyBY/aida-viz
def get_text_docs(corpus_zipfile: ZipFile) -> ImmutableDict[str, str]:
    """Map each parent document id to the raw text of its LTF document.

    The corpus prefix is taken from ``get_root_dir_name`` (falling back to
    ``""``) and is applied to both the ``docs/parent_children.tab`` lookup
    and the ``data/ltf/`` directory, so archives with a root directory are
    read consistently.

    Args:
        corpus_zipfile: Open corpus archive whose ``data/ltf/`` entries are
            themselves zip archives.

    Returns:
        An immutable mapping from ``doc_id`` to raw text.  When several
        entries map to the same ``doc_id`` the last one read wins.

    Raises:
        RuntimeError: if ``parent_children.tab`` cannot be found.
        FileNotFoundError: if processing an entry raises ``AttributeError``;
            the original error is attached as the cause.
    """
    print(f"Reading .ltf documents in {corpus_zipfile.filename}")

    prefix = get_root_dir_name(corpus_zipfile) or ""

    # The path is a literal, so escape it: an unescaped "." (or any regex
    # metacharacter inside the prefix) would otherwise act as a pattern.
    parent_children_path = _find_name_in_zip(
        corpus_zipfile,
        re.compile(re.escape(f"{prefix}docs/parent_children.tab")))

    if not parent_children_path:
        raise RuntimeError("Archive lacks parent_children.tab")

    parent_children_tab = _read_tab_file(
        CharSource.from_file_in_zip(corpus_zipfile, parent_children_path))

    child_to_parent_map = _create_child_to_parent_map(parent_children_tab)

    text_docs = {}
    # Use the same prefix as the parent_children.tab lookup above.
    text_dir = ZipPath(corpus_zipfile, at=f"{prefix}data/ltf/")

    for source_doc_path in text_dir.iterdir():
        # Each entry is a nested archive held in memory; close it as soon as
        # all of its members have been read.
        nested_bytes = io.BytesIO(source_doc_path.read_bytes())
        with ZipFile(nested_bytes) as source_doc_zip:
            for source_info in tqdm(
                    source_doc_zip.infolist(),
                    desc=f"Extracting {source_doc_path.name}",
                    bar_format="{l_bar}{bar:20}{r_bar}",
            ):
                doc = ZipPath(source_doc_zip, at=source_info.filename)
                try:
                    # The document element id is the file name up to its
                    # first dot.
                    doceid = doc.name.split(".")[0]
                    doc_id = child_to_parent_map[doceid]
                    text_docs[doc_id] = convert_ltf_to_raw_text(
                        doc.read_text(encoding="utf-8"))
                except AttributeError as err:
                    raise FileNotFoundError(
                        f"Could not read from {doc}.") from err

    return immutabledict(text_docs)
예제 #12
0
 def test_empty_gzip(self):
     """The empty gzipped fixture reports as empty and reads as an empty string."""
     empty_gz_path = Path(__file__).parent / "empty_gzip.txt.gz"
     gz_source = CharSource.from_gzipped_file(empty_gz_path)

     self.assertTrue(gz_source.is_empty())
     self.assertEqual("", gz_source.read_all())
예제 #13
0
 def test_empty(self):
     """A ``from_nowhere`` source has no text, no lines, and reports as empty."""
     nowhere_source = CharSource.from_nowhere()

     self.assertEqual("", nowhere_source.read_all())
     self.assertEqual([], nowhere_source.readlines())
     self.assertTrue(nowhere_source.is_empty())
예제 #14
0
파일: corpus.py 프로젝트: TonyBY/aida-viz
def _read_tab_file(tab_file_source: CharSource) -> pd.DataFrame:
    """Load tab-separated text from a character source into a DataFrame.

    The source is opened only for the duration of the ``pd.read_csv`` call.
    """
    with tab_file_source.open() as handle:
        frame = pd.read_csv(handle, sep="\t", encoding="utf-8")
    return frame
예제 #15
0
 def __getitem__(self, key: str) -> str:
     """Return the full text of the file that ``id_to_path`` maps ``key`` to."""
     path = self.id_to_path[key]
     source = CharSource.from_file(path)
     return source.read_all()