Пример #1
0
    def load(self, langs: Set[str], min_len: int = 25) -> List[Tuple[str, str]]:
        """
        Load (text, lang) examples from the ``ud-{train,test,dev}.txt`` files
        found under ``self.data_dir`` (searched recursively).

        Args:
            langs: Languages to include. A file's language is the part of its
                file name before the first underscore; files whose language
                is not in this set are skipped.
            min_len: Minimum text length in *chars* for a given example to be included.

        Returns:
            Sequence of (text, lang) examples.
        """
        data = []
        match_regex = r"ud-(train|test|dev)\.txt"
        for fpath in tio.get_filepaths(
            self.data_dir, match_regex=match_regex, recursive=True
        ):
            fname = pathlib.Path(fpath).name
            lang, _ = fname.split("_", maxsplit=1)
            if lang not in langs:
                continue

            # Decode explicitly as UTF-8 rather than with the platform's
            # default codec, which varies by OS/locale and can raise or
            # garble non-ASCII text.
            # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
            with open(fpath, mode="rt", encoding="utf-8") as f:
                text = f.read()

            if "\n" in text:
                # Multi-line file: each run of newlines separates two examples.
                segments = re.split(r"\n+", text)
            else:
                # Single-line file: fall back to random segmentation.
                # NOTE(review): (50, 1000) is presumably the (min, max)
                # segment length -- verify against _randomly_segment_text.
                segments = _randomly_segment_text(text, (50, 1000))
            data.extend(
                (text_segment, lang)
                for text_segment in segments
                if len(text_segment) >= min_len
            )
        LOGGER.info("loaded TatoebaDataset data:\n%s ...", data[:3])
        return data
Пример #2
0
 def test_get_filepaths(self):
     """A non-recursive listing that keeps invisible files should equal the
     regular files sitting directly inside TESTS_DIR."""
     candidates = (
         os.path.join(TESTS_DIR, entry) for entry in os.listdir(TESTS_DIR)
     )
     expected = sorted(path for path in candidates if os.path.isfile(path))
     found = io.get_filepaths(
         TESTS_DIR, ignore_invisible=False, recursive=False
     )
     observed = sorted(found)
     assert observed == expected
Пример #3
0
 def test_get_filepaths_ignore_regex(self):
     """With ignore_regex="test_" and invisible files ignored, no file paths
     under TESTS_DIR should be yielded."""
     remaining = list(
         io.get_filepaths(TESTS_DIR, ignore_regex="test_", ignore_invisible=True)
     )
     assert len(remaining) == 0
Пример #4
0
 def test_get_filepaths_match_regex(self):
     """Exactly one ".py" file under TESTS_DIR should match the regex "io"."""
     matches = list(
         io.get_filepaths(TESTS_DIR, match_regex="io", extension=".py")
     )
     assert len(matches) == 1
Пример #5
0
 def test_get_filepaths_ignore_invisible(self):
     """Ignoring invisible files must never yield more paths than keeping them."""
     here = os.path.dirname(os.path.abspath(__file__))
     visible_only = list(io.get_filepaths(here, ignore_invisible=True))
     everything = list(io.get_filepaths(here, ignore_invisible=False))
     assert len(visible_only) <= len(everything)
Пример #6
0
 def test_get_filepaths_match_regex(self):
     """Exactly one ".py" file under TESTS_DIR should match the regex "_io"."""
     filepaths = io.get_filepaths(
         TESTS_DIR, match_regex="_io", extension=".py"
     )
     n_matches = sum(1 for _ in filepaths)
     assert n_matches == 1