def load(
    self,
    iso_lang_map: Dict[str, str],
    min_len: int = 25,
) -> List[Tuple[str, str]]:
    """
    Load and parse the WiLI-2018 data files into (text, lang) example pairs.

    Args:
        iso_lang_map: Mapping of dataset language codes to ISO language codes;
            examples whose language is not a key of this mapping are dropped.
        min_len: Minimum text length in *alphanumeric chars* for a given
            example to be included.

    Returns:
        Sequence of (text, lang) examples.
    """
    data = []
    # we'll combine train/test from individual datasets
    # and instead split on the full, aggregated dataset
    for subset in ("train", "test"):
        text_lines = tio.read_text(
            self.data_dir.joinpath(f"x_{subset}.txt"), lines=True
        )
        lang_lines = tio.read_text(
            self.data_dir.joinpath(f"y_{subset}.txt"), lines=True
        )
        texts = (line.strip() for line in text_lines)
        langs = (line.strip() for line in lang_lines)
        data.extend(
            (text, iso_lang_map[lang])
            for text, lang in zip(texts, langs)
            if lang in iso_lang_map
            # count only alphanumeric chars, so whitespace/punctuation
            # don't inflate the min-length check; stdlib sum() replaces
            # the equivalent itertoolz.count(...) call
            and sum(1 for char in text if char.isalnum()) >= min_len
        )
    LOGGER.info("loaded Wili2018Dataset data:\n%s ...", data[:3])
    return data
def test_read_write_unicode(self, tmpdir):
    """Round-trip unicode text through plain and compressed file types."""
    expected = TEXT
    for suffix in (".txt", ".gz", ".bz2", ".xz"):
        target = str(tmpdir.join(f"test_read_write_file_unicode{suffix}"))
        io.write_text(expected, target, mode="wt", make_dirs=True)
        assert next(io.read_text(target, mode="rt")) == expected
def test_read_write_bytes(self, tmpdir):
    """Round-trip byte content through plain and compressed file types."""
    payload = utils.to_bytes(TEXT)
    for suffix in (".txt", ".gz", ".bz2", ".xz"):
        target = str(tmpdir.join(f"test_read_write_file_bytes{suffix}"))
        io.write_text(payload, target, mode="wb", make_dirs=True)
        assert next(io.read_text(target, mode="rb")) == payload
def load(self, langs: Set[str], min_len: int = 25) -> List[Tuple[str, str]]:
    """
    Load and parse the DSLCC data files into (text, lang) example pairs.

    Args:
        langs: Set of 2-letter language codes; only examples whose language
            label starts with one of these codes are included.
        min_len: Minimum text length in *alphanumeric chars* for a given
            example to be included.

    Returns:
        Sequence of unique (text, lang) examples, sorted by lang.
    """
    data = []
    fstubs = [
        "dslcc3/train/task1-train.txt",
        "dslcc3/train/task1-dev.txt",
        "dslcc4/DSL-TRAIN.txt",
        "dslcc4/DSL-DEV.txt",
    ]
    for fstub in fstubs:
        filepath = self.data_dir.joinpath(fstub)
        lines = tio.read_text(filepath, mode="rt", encoding="utf-8", lines=True)
        for line in lines:
            if not line.strip():
                continue
            # only the unpacking split can raise here (wrong number of tabs),
            # so narrow the try body to that line and catch ValueError only,
            # instead of guarding the whole filter with a broad Exception
            try:
                text, lang = line.split("\t")
            except ValueError:
                LOGGER.debug("bad line in data")
                continue
            if (
                # dataset labels may carry variants (e.g. region suffixes);
                # compare on the 2-letter prefix
                lang[:2] in langs
                and sum(1 for c in text if c.isalnum()) >= min_len
            ):
                data.append((text, lang[:2]))
    # de-duplicate, then sort by language for a stable ordering
    data = sorted(set(data), key=operator.itemgetter(1))
    LOGGER.info("loaded DSLCCDataset data:\n%s ...", data[:3])
    return data
def test_read_write_unicode(self, tmpdir):
    """Round-trip unicode text; on py2, compressed formats are expected to raise."""
    expected = TEXT
    for suffix in (".txt", ".gz", ".bz2", ".xz"):
        target = str(tmpdir.join(f"test_read_write_file_unicode{suffix}"))
        if compat.PY2 is True and suffix != ".txt":
            with pytest.raises(ValueError):
                io.open_sesame(target, mode="wt", encoding="utf-8", make_dirs=True)
        else:
            io.write_text(expected, target, mode="wt", make_dirs=True)
            assert next(io.read_text(target, mode="rt")) == expected
def test_read_write_text_unicode(tmpdir):
    """Round-trip unicode text; on py2, compressed formats are expected to raise."""
    expected = TEXT
    for suffix in (".txt", ".gz", ".bz2", ".xz"):
        target = str(tmpdir.join("test_read_write_file_unicode" + suffix))
        if compat.is_python2 is True and suffix != ".txt":
            with pytest.raises(ValueError):
                io.open_sesame(target, mode="wt", encoding="utf-8", make_dirs=True)
        else:
            io.write_text(expected, target, mode="wt", make_dirs=True)
            assert next(io.read_text(target, mode="rt")) == expected
def test_read_write_unicode_lines(self, tmpdir, spacy_doc):
    """Round-trip a list of unicode lines; py2 + compression is expected to raise."""
    sents = [sent.text for sent in spacy_doc.sents]
    for suffix in (".txt", ".gz", ".bz2", ".xz"):
        target = str(tmpdir.join(f"test_read_write_file_lines_unicode{suffix}"))
        if compat.PY2 is True and suffix != ".txt":
            with pytest.raises(ValueError):
                io.open_sesame(target, mode="wt", encoding=None, make_dirs=True)
        else:
            io.write_text(sents, target, mode="wt", make_dirs=True, lines=True)
            got = [ln.strip() for ln in io.read_text(target, mode="rt", lines=True)]
            assert got == sents
def test_read_write_text_lines_bytes(tmpdir, spacy_doc):
    """Round-trip sentence bytes line-by-line; py2 + .xz is expected to raise."""
    sents = [compat.unicode_to_bytes(sent.text) for sent in spacy_doc.sents]
    for suffix in (".txt", ".gz", ".bz2", ".xz"):
        target = str(tmpdir.join("test_read_write_file_lines_bytes" + suffix))
        if compat.is_python2 is True and suffix == ".xz":
            with pytest.raises(ValueError):
                io.open_sesame(target, mode="wb", encoding="utf-8", make_dirs=True)
        else:
            io.write_text(sents, target, mode="wb", make_dirs=True, lines=True)
            got = [ln.strip() for ln in io.read_text(target, mode="rb", lines=True)]
            assert got == sents
def test_read_write_text_bytes(tmpdir):
    """Round-trip byte content; py2 + .xz is expected to raise."""
    payload = compat.unicode_to_bytes(TEXT)
    for suffix in (".txt", ".gz", ".bz2", ".xz"):
        target = str(tmpdir.join(f"test_read_write_file_bytes{suffix}"))
        if compat.is_python2 is True and suffix == ".xz":
            with pytest.raises(ValueError):
                io.open_sesame(target, mode="wb", encoding="utf-8", make_dirs=True)
        else:
            io.write_text(payload, target, mode="wb", make_dirs=True)
            assert next(io.read_text(target, mode="rb")) == payload
def test_read_write_unicode_lines(self, tmpdir, spacy_doc):
    """Write sentence lines as unicode text and read back identical content."""
    sents = [sent.text for sent in spacy_doc.sents]
    for suffix in (".txt", ".gz", ".bz2", ".xz"):
        target = str(tmpdir.join(f"test_read_write_file_lines_unicode{suffix}"))
        io.write_text(sents, target, mode="wt", make_dirs=True, lines=True)
        got = [ln.strip() for ln in io.read_text(target, mode="rt", lines=True)]
        assert got == sents
def test_read_write_text_lines_bytes(tmpdir, spacy_doc):
    """Round-trip sentence bytes line-by-line; py2 + .xz is expected to raise."""
    sents = [compat.unicode_to_bytes(sent.text) for sent in spacy_doc.sents]
    for suffix in (".txt", ".gz", ".bz2", ".xz"):
        target = str(tmpdir.join(f"test_read_write_file_lines_bytes{suffix}"))
        if compat.is_python2 is True and suffix == ".xz":
            with pytest.raises(ValueError):
                io.open_sesame(target, mode="wb", encoding="utf-8", make_dirs=True)
        else:
            io.write_text(sents, target, mode="wb", make_dirs=True, lines=True)
            got = [ln.strip() for ln in io.read_text(target, mode="rb", lines=True)]
            assert got == sents