Пример #1
0
 def test_hash3(self):
     """A source and the pipelines derived from it must all have distinct hashes."""
     base = from_items(1, 2, 3, 4, 5)
     via_split = base | split()
     via_select = base | select("key")

     # Every distinct pair is compared, in the order:
     # base/split, base/select, split/select.
     for left, right in (
         (base, via_split),
         (base, via_select),
         (via_split, via_select),
     ):
         self.assertNotEqual(left.hash, right.hash)
Пример #2
0
    def test_lines_splits(self):
        """Exercise `lines` as a call and as a pipe, then `split` and a converter tuple.

        The fixture ``sample_text.txt`` (next to this test file) is expected
        to contain 4 lines, the first being ``"hello 10"``.
        """
        sample_path = Path(__file__).parent / "sample_text.txt"

        # Three ways of reaching the same lines: direct call, piped from a
        # one-item source (nested under index 0), and piped then flattened.
        direct = lines(sample_path)
        piped = from_items(sample_path) | lines
        piped_flat = from_items(sample_path) | lines | flatten

        self.assertEqual(4, len(direct))
        self.assertEqual("hello 10", direct[0])
        self.assertEqual(list(direct), list(piped[0]))
        self.assertEqual(list(direct), list(piped_flat))

        # Splitting keeps the item count and turns each line into a token list.
        tokens = direct | split()
        self.assertEqual(4, len(tokens))
        self.assertEqual(["hello", "10"], tokens[0])

        # Piping a (None, int) tuple yields tuples whose second field is an
        # int while the first is left as the original string.
        tokens |= (None, int)
        self.assertEqual(4, len(tokens))
        self.assertEqual(("hello", 10), tokens[0])
Пример #3
0
# Combine the two sources with `*`; the checks below expect the result to be
# as long as `ja` and to yield 2-tuples of strings.
# NOTE(review): `ja` and `en` are defined outside this excerpt — presumably
# line-based sources for the two languages; confirm against the full script.
zipped = ja * en

assert len(zipped) == len(ja)

# Only the first pair is inspected: the loop breaks after one iteration.
for data in zipped:
    assert isinstance(data, tuple)
    assert len(data) == 2
    j, e = data
    assert isinstance(j, str)
    assert isinstance(e, str)
    break

# NOTE(review): `ja >> v` presumably feeds the `ja` source into the vocabulary
# builder so that `v.numericalizer` can be used afterwards — confirm against
# the VocabBuilder documentation.
v = VocabBuilder("ja")
ja >> v
# The right-hand side `split() | v.numericalizer` is composed first and then
# applied to `ja`; `en` is only split.
ja |= split() | v.numericalizer
en |= split()
# `*` binds tighter than `|`, so this is `(ja * en) | to_dict("ja", "en")`.
dataset = ja * en | to_dict("ja", "en")

# Each example is expected to be a dict whose "ja" entry is a list of ints and
# whose "en" entry is a list of strings.
for example in dataset:
    assert isinstance(example, dict)
    assert "ja" in example
    value = example["ja"]
    assert isinstance(value, list)
    assert isinstance(value[0], int), "converted to word index(numericalize)"
    assert isinstance(example["en"], list)
    assert isinstance(example["en"][0], str), "disable numericalize"

# Rebind `ja` and `en` to two selected columns of a "|||"-delimited file.
special_delimiter_text = lines("data/special_delimiter.txt") | split("|||")
ja = special_delimiter_text | select(3)
en = special_delimiter_text | select(4)
Пример #4
0
# Combine the two sources with `*`; the checks below expect the result to be
# as long as `ja` and to yield 2-tuples of strings.
# NOTE(review): `ja` and `en` are defined outside this excerpt — confirm their
# definitions against the full script.
zipped = ja * en
assert len(zipped) == len(ja)

# Only the first pair is inspected: the loop breaks after one iteration.
for data in zipped:
    assert isinstance(data, tuple)
    assert len(data) == 2
    j, e = data
    assert isinstance(j, str)
    assert isinstance(e, str)
    break

# `*` binds tighter than `|`, so the pairs are built first and each tuple is
# then mapped to a dict with "ja" and "en" keys. Unlike the first loop, this
# one walks the whole dataset (no `break`).
dataset = ja * en | mapped(lambda t: {"ja": t[0], "en": t[1]})
for example in dataset:
    assert isinstance(example, dict)
    assert "ja" in example
    assert isinstance(example["ja"], str)
    assert isinstance(example["en"], str)

# Split each line of the fixture on "|||" and check that the column chosen by
# `select(3)` is a string (first item only).
special_delimiter_text = lines("data/special_delimiter.txt") | split("|||")
for third_column in special_delimiter_text | select(3):
    assert isinstance(third_column, str)
    break
dataset = special_delimiter_text | select(3)

# The first loop consumes the dataset without checking anything.
# NOTE(review): presumably this verifies that the dataset can be iterated a
# second time afterwards — confirm the intent before removing it.
for japanese_column in dataset:
    pass
for japanese_column in dataset:
    assert isinstance(japanese_column, str)
    assert "現在" in japanese_column
    break
Пример #5
0
import pathlib

from flowder.pipes import split, select
from flowder.source.base import mapped
from flowder.utils import lines

# Load the sample corpus; the fixture is expected to hold exactly 10 lines.
ls = lines("data/kftt.ja")
assert len(ls) == 10, "there should be 10 lines"

# Iterating the source directly yields each line as a plain string
# (only the first item is checked).
for s in ls:
    assert isinstance(s, str), "Source iterate the raw values"
    break

# Piping through `mapped` yields the function's results — here the length of
# each line — so the failure message describes the mapped values rather than
# repeating the raw-value message used above.
for s in ls | mapped(lambda x: len(x)):
    assert isinstance(s, int), "mapped Source iterates the transformed values"
    break

# `split()` with no argument turns each line into a list of string tokens.
for spl in ls | split():
    assert isinstance(spl, list)
    assert isinstance(spl[0], str)
    break

# A custom delimiter can be passed to `split`; `select(3)` then picks a single
# column out of each split line, which is expected to be a string.
delimiter = "|||"
special_delimiter_text = lines("data/special_delimiter.txt") | split(delimiter)
for third_column in special_delimiter_text | select(3):
    assert isinstance(third_column, str)
    break