def test_convert_from_bool():
    """A bool tube converts to int, float, bytes and str with the expected values."""
    conversions = (
        (int, [1, 0]),
        (float, [1., 0.]),
        (bytes, [b'True', b'False']),
        (str, ['True', 'False']),
    )
    for target_type, expected in conversions:
        # A fresh input list and tube are built for every target type.
        converted = tubes.Each([True, False]).to(bool).to(target_type)
        assert list(converted) == expected
def test_reading_nota_fileobj():
    """read_fileobj() raises ValueError when an input is a plain string."""
    file_like = BytesIO(b"Mary had")
    not_a_file = "string"
    tube = tubes.Each([file_like, not_a_file]).read_fileobj()
    with pytest.raises(ValueError) as exc_info:
        list(tube)
    assert exc_info.match(r'only accepts objects with a \.read\(\)')
def test_fuzz_tsv(seed):
    """Fuzz ``tubes.TsvRow`` parsing against randomly generated rows.

    Builds a random number of tab-separated rows out of ``rand_chars()``
    values, reads the columns returned by ``get_cols(32)`` through
    ``tubes.TsvRow`` and compares every selected field, expecting ``'xx'``
    for columns a row does not have.

    Args:
        seed: Value used to seed the random generator.
    """
    # NOTE(review): ``randint`` is called with a single argument, which the
    # stdlib ``random.randint(a, b)`` rejects; presumably ``random`` here is
    # ``numpy.random`` -- confirm against the file's imports.
    random.seed(seed)
    n_rows = random.randint(30)
    cols_to_read = get_cols(32)
    tsv_rows = []
    expected_rows = []
    for _ in range(n_rows):
        tsv_row = []
        expected_row = ['xx'] * len(cols_to_read)
        for col_no in range(random.randint(30)):
            # Re-draw until the value contains no tab character.
            data = '\t'
            while '\t' in data:
                data = rand_chars()
            if col_no in cols_to_read:
                expected_row[cols_to_read.index(col_no)] = data
            tsv_row.append(data.encode("utf8"))
        # A row without any columns is expected to read back column 0 as ''.
        if not tsv_row and 0 in cols_to_read:
            expected_row[cols_to_read.index(0)] = ''
        expected_rows.append(tuple(expected_row))
        tsv_rows.append(b'\t'.join(tsv_row))

    slot_tube = tubes.Each(tsv_rows).to(tubes.TsvRow).multi(
        lambda x: [x.get(c, 'xx').to(str) for c in cols_to_read])
    actual_rows = list(slot_tube)

    # Compare the row counts first so that a short result fails with a clear
    # assertion instead of an IndexError while indexing ``actual_rows``.
    assert len(expected_rows) == len(actual_rows)

    # With exactly one selected column each result is compared as a bare
    # value rather than being indexed.
    single_column = len(cols_to_read) == 1
    for want_row, got_row in zip(expected_rows, actual_rows):
        for col_num, expected in enumerate(want_row):
            actual = got_row if single_column else got_row[col_num]
            assert expected == actual
def test_to_py_handles_refcount_iter():
    """Track reference counts on a value passed through to_py() from an iterator input.

    Every expected count is one higher than the number of references listed
    in the trailing comment, because the sys.getrefcount() call itself holds
    a reference to its argument.  The statements must stay in this order and
    no extra bindings to ``a`` or ``tube`` may be introduced, or the counts
    change.
    """
    flag = Flag()
    a = Canary(flag)
    assert sys.getrefcount(a) == 2  # a
    tube = tubes.Each(iter([True, a])).to_py()
    assert sys.getrefcount(tube) == 2  # tube
    assert sys.getrefcount(a) == 3  # a + each_val
    it = iter(tube)
    gc.collect()
    assert sys.getrefcount(tube) == 2  # iter() doesn't keep reference to tube
    assert sys.getrefcount(a) == 3  # a + each_val
    assert next(it) is True
    val = next(it)
    assert val is a
    gc.collect()
    assert sys.getrefcount(a) == 6  # a + each_val + val + iter_cur + topy_cur
    # Dropping the iterator is expected to release the two "current value" references.
    del it
    gc.collect()
    assert sys.getrefcount(a) == 4  # a + each_val + val
    del tube
    gc.collect()
    assert sys.getrefcount(a) == 3  # a + val
    del val
    assert sys.getrefcount(a) == 2  # a
    del a
    gc.collect()
    # NOTE(review): assumes Flag.is_set is a plain attribute set when the
    # Canary is destroyed; if it were a method this assert could never fail
    # -- confirm against the Flag/Canary definitions.
    assert flag.is_set
def test_to_py_handles_refcount_list():
    """Track reference counts on a value passed through to_py() from a list input.

    The sys.getrefcount() value is always one higher than the number of
    references listed in the trailing comment, because the call to
    getrefcount() itself needs a reference.  The statements must stay in
    this order and no extra bindings to ``a`` or ``tube`` may be introduced,
    or the counts change.
    """
    flag = Flag()
    a = Canary(flag)
    assert sys.getrefcount(a) == 2  # a
    tube = tubes.Each([True, a]).to_py()
    assert sys.getrefcount(tube) == 2  # tube
    assert sys.getrefcount(a) == 3  # a + each_val
    it = iter(tube)
    gc.collect()
    assert sys.getrefcount(tube) == 2  # iter() doesn't keep reference to tube
    assert sys.getrefcount(a) == 3  # a + each_val
    assert next(it) is True
    val = next(it)
    assert val is a
    gc.collect()
    assert sys.getrefcount(a) == 6  # a + each_val + val + iter_cur + topy_cur
    # Dropping the iterator is expected to release the two "current value" references.
    del it
    gc.collect()
    assert sys.getrefcount(a) == 4  # a + each_val + val
    del tube
    gc.collect()
    assert sys.getrefcount(a) == 3  # a + val
    del val
    assert sys.getrefcount(a) == 2  # a
    del a
    gc.collect()
    # NOTE(review): assumes Flag.is_set is a plain attribute set when the
    # Canary is destroyed; if it were a method this assert could never fail
    # -- confirm against the Flag/Canary definitions.
    assert flag.is_set
def test_multi_index_get_on_json_value():
    """Index lookups on parsed JSON arrays come back in the order of the getters."""
    documents = ["[1,2,3]", "[8,9,10]", '["a", "b", "c"]']

    def first_last_middle(value):
        return (value.get(0), value.get(2), value.get(1))

    rows = list(tubes.Each(documents).json().multi(first_last_middle))
    assert rows == [(1, 3, 2), (8, 10, 9), ('a', 'c', 'b')]
def test_mixed_types():
    """to_pyarrow() yields int64, double and string columns from one mixed tube."""
    fruit = ['apple', 'banana', 'apple']
    tube = tubes.Each(fruit).to(str).enumerate().multi(
        lambda x: (x.slot(0), x.slot(0).to(float), x.slot(1)))
    table = tube.to_pyarrow(('index', 'index_double', 'val'))

    assert isinstance(table, pa.Table)
    column_types = [str(table.columns[i].type) for i in range(3)]
    assert column_types == ['int64', 'double', 'string']

    expected = {
        'index': {0: 0, 1: 1, 2: 2},
        'index_double': {0: 0., 1: 1., 2: 2.},
        'val': {0: 'apple', 1: 'banana', 2: 'apple'},
    }
    assert table.to_pandas().to_dict() == expected
def test_reading_two_files_small_buffer():
    """read_fileobj(size=2) yields two-byte chunks that never span two files."""
    first = BytesIO(b"Mary had")
    second = BytesIO(b'a little lamb')
    chunks = list(tubes.Each([first, second]).read_fileobj(size=2).to(str))
    expected = [
        'Ma', 'ry', ' h', 'ad',
        'a ', 'li', 'tt', 'le', ' l', 'am', 'b',
    ]
    assert chunks == expected
def test_reading_unicode():
    """read_fileobj() raises ValueError when given a text-mode (StringIO) object."""
    binary_buf = BytesIO(b"Mary had")
    text_buf = StringIO("string")
    tube = tubes.Each([binary_buf, text_buf]).read_fileobj()
    with pytest.raises(ValueError) as exc_info:
        list(tube)
    assert exc_info.match('expects binary')
def test_fuzz_random_double_to_str(seed, maker):
    """Float-to-str conversion matches str() once a zero right after the exponent sign is dropped.

    Args:
        seed: Value used to seed numpy's random generator.
        maker: Callable that returns the array of test values for a given size.
    """
    numpy.random.seed(seed)
    values = maker(10240)
    actual = list(tubes.Each(values).to(float).to(str))

    expected = []
    for value in values:
        text = str(value)
        # Rewrite e.g. 'e-05' as 'e-5' and 'e+05' as 'e+5'.
        text = text.replace('e-0', 'e-')
        text = text.replace('e+0', 'e+')
        expected.append(text)

    assert actual == expected
def test_csv_escaping():
    """Quoted CSV fields are unquoted; a quote inside an unquoted field is kept."""
    lines = ['a"x","b",""', '"d","e,f",g']

    def first_three(row):
        return (row.get(0, 'xx'), row.get(1, 'xx'), row.get(2, 'xx'))

    rows = list(tubes.Each(lines).to(tubes.CsvRow).multi(first_three))
    assert rows == [(b'a"x"', b'b', b''), (b'd', b'e,f', b'g')]
def test_csv_quote_escaping():
    """Doubled quotes collapse inside quoted fields and stay as-is in unquoted ones."""
    lines = ['"a""b","""",""""""', '"c""""d",e""f']

    def pick_fields(row):
        # Only the third lookup supplies a default for a missing column.
        return (row.get(0), row.get(1), row.get(2, 'x'))

    rows = list(tubes.Each(lines).to(tubes.CsvRow).multi(pick_fields))
    assert rows == [(b'a"b', b'"', b'""'), (b'c""d', b'e""f', b'x')]
def test_multi_index_get_on_pyobj():
    """Indexing Python lists of varying length falls back to the default past the end."""
    values = [[1], [1, 1], [1, 2, 1], [1, 3, 3, 1], None]

    def first_five(item):
        return tuple([item.get(index, 'X') for index in range(5)])

    rows = list(tubes.Each(values).multi(first_five))
    expected = [
        (1, 'X', 'X', 'X', 'X'),
        (1, 1, 'X', 'X', 'X'),
        (1, 2, 1, 'X', 'X'),
        (1, 3, 3, 1, 'X'),
        (None, 'X', 'X', 'X', 'X'),
    ]
    assert rows == expected
def test_fuzz_csv(seed, do_split):
    """Fuzz CSV parsing against randomly generated rows.

    Builds a random number of comma-separated rows out of ``rand_chars()``
    values, reads the columns returned by ``get_cols(32)`` and compares every
    selected field, expecting ``'xx'`` for columns a row does not have.

    Args:
        seed: Value used to seed the random generator.
        do_split: When true, all rows are joined with newlines into a single
            input and parsed with ``.csv(headers=False, skip_empty_rows=False)``;
            otherwise each row is converted individually with ``tubes.CsvRow``.
    """
    # NOTE(review): ``randint`` is called with a single argument, which the
    # stdlib ``random.randint(a, b)`` rejects; presumably ``random`` here is
    # ``numpy.random`` -- confirm against the file's imports.
    random.seed(seed)
    n_rows = random.randint(30)
    cols_to_read = get_cols(32)
    csv_rows = []
    expected_rows = []
    for _ in range(n_rows):
        csv_row = []
        expected_row = ['xx'] * len(cols_to_read)
        for col_no in range(random.randint(30)):
            # Re-draw until the value does not end with a carriage return.
            data = '\r'
            while data.endswith('\r'):
                data = rand_chars()
            if col_no in cols_to_read:
                expected_row[cols_to_read.index(col_no)] = data
            data = data.encode("utf8")
            # Values containing a quote, newline or comma are always escaped;
            # any other value is escaped one time in three.  ``random.choice``
            # stays last in the ``or`` chain so it is only drawn when none of
            # the other conditions hold, keeping the random sequence unchanged.
            if (b'"' in data or b'\n' in data or b',' in data
                    or random.choice([False, False, True])):
                data = csv_escape(data)
            csv_row.append(data)
        # A row without any columns is expected to read back column 0 as ''.
        if not csv_row and 0 in cols_to_read:
            expected_row[cols_to_read.index(0)] = ''
        expected_rows.append(tuple(expected_row))
        csv_rows.append(b",".join(csv_row))

    if do_split:
        # With no rows at all the tube gets no input rather than one empty blob.
        tube_input = [b'\n'.join(csv_rows)] if n_rows else []
        slot_tube = tubes.Each(tube_input).csv(headers=False,
                                               skip_empty_rows=False)
    else:
        slot_tube = tubes.Each(csv_rows).to(tubes.CsvRow)
    slot_tube = slot_tube.multi(
        lambda x: [x.get(c, 'xx').to(str) for c in cols_to_read])
    actual_rows = list(slot_tube)

    # Compare the row counts first so that a short result fails with a clear
    # assertion instead of an IndexError while indexing ``actual_rows``.
    assert len(expected_rows) == len(actual_rows)

    # With exactly one selected column each result is compared as a bare
    # value rather than being indexed.
    single_column = len(cols_to_read) == 1
    for want_row, got_row in zip(expected_rows, actual_rows):
        for col_num, expected in enumerate(want_row):
            actual = got_row if single_column else got_row[col_num]
            assert expected == actual
def test_passing_json_test_suite_cases(filename):
    """For a suite file the stdlib json module can parse, tubes must give the same value.

    Args:
        filename: Name of a test case file inside ``TEST_CASE_DIR``.
    """
    case_path = path.join(TEST_CASE_DIR, filename)
    raw = read_file(case_path)
    try:
        reference = json.loads(raw)
    except (ValueError, RecursionError):
        # Cases the stdlib parser cannot load are not checked at all.
        return
    else:
        parsed = list(tubes.Each([case_path]).map_files().json())
        assert parsed[0] == reference
def test_reading_json_with_multiple_blank_lines():
    """Blank lines are dropped with skip_if(tubes.is_blank) before JSON parsing."""
    # NOTE(review): the exact number of blank lines in this fixture could not
    # be recovered from the whitespace-collapsed source -- confirm against
    # the original; any count exercises the same skip_if path.
    sample = """

[1, 2, 3]


9

"""
    lines = tubes.Each([sample]).to(bytes).split()
    parsed = lines.skip_if(tubes.is_blank).json()
    assert list(parsed) == [[1, 2, 3], 9]
def test_reading_json_with_blank_lines():
    """Blank lines are dropped with a per-value is_blank() predicate before JSON parsing."""
    # NOTE(review): the exact position and number of blank lines in this
    # fixture could not be recovered from the whitespace-collapsed source --
    # confirm against the original; any layout exercises the same skip_if path.
    sample = """{}

[1, 2, 3]

{"a": 2, "b": "c"}

9

"""
    lines = tubes.Each([sample]).to(bytes).split()
    parsed = lines.skip_if(lambda x: x.is_blank()).json()
    assert list(parsed) == [{}, [1, 2, 3], {'a': 2, 'b': 'c'}, 9]
def test_escaped_multi_index_get_on_json():
    """JSON string escapes are decoded when indexed fields are converted to str."""
    documents = [
        r'["\t","\b","\u1234"]',
        r'["\"","","a"]',
        r'["x", "y\ta\bb\n", "z"]',
    ]

    def first_three(value):
        return (value.get(0), value.get(1), value.get(2))

    tube = tubes.Each(documents).json().multi(first_three).to(str, str, str)
    expected = [
        ('\t', '\b', '\u1234'),
        ('"', '', 'a'),
        ('x', 'y\ta\bb\n', 'z'),
    ]
    assert list(tube) == expected
def test_fill_ndarray_mixed_type():
    """ndarray(None, 5) on an (index, bytes) tube gives an int64 + S6 structured array.

    Each 10-character input is expected to come back as its first 5 bytes.
    """
    letters = string.ascii_lowercase
    repeated = [letter * 10 for letter in letters]
    tube = tubes.Each(repeated).to(bytes).enumerate()
    nd = tube.ndarray(None, 5)

    assert nd.shape == (26, )
    expected_fields = {
        '0': (np.dtype('int64'), 0),
        '1': (np.dtype('S6'), 8),
    }
    assert dict(nd.dtype.fields) == expected_fields

    expected = [
        (index, (letter * 5).encode('ascii'))
        for index, letter in enumerate(letters)
    ]
    assert [tuple(record) for record in nd] == expected
def test_str():
    """to_pyarrow() on an enumerated str tube gives int64 and string columns."""
    letters = ['a', 'b', 'c', 'd', 'e']
    table = tubes.Each(letters).to(str).enumerate().to_pyarrow(('index', 'val'))

    assert isinstance(table, pa.Table)
    index_column, value_column = table.columns[0], table.columns[1]
    assert str(index_column.type) == 'int64'
    assert str(value_column.type) == 'string'

    expected = {
        'index': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4},
        'val': {0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e'},
    }
    assert table.to_pandas().to_dict() == expected
def test_multi_index_pyobj_out_of_order():
    """Index lookups on Python lists work in any order, including repeated indexes."""
    order = [4, 2, 0, 3, 0, 1]
    values = [[1], [1, 1], [1, 2, 1], [1, 3, 3, 1], None]

    def pick_in_order(item):
        return tuple([item.get(index, 'X') for index in order])

    rows = list(tubes.Each(values).multi(pick_in_order))
    expected = [
        ('X', 'X', 1, 'X', 1, 'X'),
        ('X', 'X', 1, 'X', 1, 1),
        ('X', 1, 1, 'X', 1, 2),
        ('X', 3, 1, 1, 1, 3),
        ('X', 'X', None, 'X', None, 'X'),
    ]
    assert rows == expected
def tubes_version():
    """Read FILES as newline-delimited JSON and return the selected fields of "GB" rows.

    Skips the first SKIP lines, keeps rows whose ``country_code`` equals
    "GB", stops after TAKE matching rows and applies ``make_getters`` to each.

    Returns:
        list: The materialised output of the tube.
    """
    def is_gb(row):
        return row.get('country_code', '""').to(tubes.Utf8).equals("GB")

    lines = tubes.Each(FILES).read_files().split(b'\n').skip(SKIP)
    matching = lines.json().skip_unless(is_gb).first(TAKE)
    return list(matching.multi(make_getters))
def test_recover_bad_json():
    """Iteration can continue past a malformed JSON document."""
    documents = ['[1,2]', '[', '{"a": 1}']
    iterator = iter(tubes.Each(documents).to(str).json())
    collected = []
    while True:
        try:
            item = next(iterator)
        except StopIteration:
            break
        except ValueError:
            # Record the failure and keep pulling from the same iterator.
            item = 'ERR'
        collected.append(item)
    assert collected == [[1, 2], 'ERR', {'a': 1}]
def test_csv_uneven_rows_get_many():
    """Missing CSV columns take the default; a trailing comma gives an empty field."""
    lines = ['a', 'b,c', 'd,e,', 'f,g,h']

    def first_three(row):
        return (row.get(0), row.get(1, 'xx'), row.get(2, 'xx'))

    tube = tubes.Each(lines).to(tubes.CsvRow).multi(first_three).to(str, str, str)
    expected = [
        ('a', 'xx', 'xx'),
        ('b', 'c', 'xx'),
        ('d', 'e', ''),
        ('f', 'g', 'h'),
    ]
    assert list(tube) == expected
def test_reading_csv_headers_different_orders():
    """Named column lookups follow each CSV input's own header order."""
    # NOTE(review): line breaks in these fixtures were reconstructed from a
    # whitespace-collapsed source (one row per line, trailing newline).
    first_doc = "a,b,c\n1,2,3\n4,5,6\n"
    second_doc = "c,a,b\n9,7,8\n12,10,11\n"

    def by_name(row):
        return (row.get('a'), row.get('b'), row.get('c'))

    rows = tubes.Each([first_doc, second_doc]).to(bytes).csv(headers=True)
    tube = rows.chunk(1).multi(by_name).to(int, int, int)
    assert list(tube) == [(1, 2, 3), (4, 5, 6), (7, 8, 9), (10, 11, 12)]
def test_recover_bad_csv():
    """An unterminated quoted field raises ValueError after the good rows are returned."""
    document = 'a,b\n1,2\n3,4\n"x'

    def first_two(row):
        return (row.get(0), row.get(1))

    iterator = iter(tubes.Each([document]).csv().multi(first_two))
    collected = []
    while True:
        try:
            item = next(iterator)
        except StopIteration:
            break
        except ValueError:
            # Record the failure and keep pulling from the same iterator.
            item = 'ERR'
        collected.append(item)
    assert collected == [(b'1', b'2'), (b'3', b'4'), 'ERR']
def test_tsv_uneven_rows_get_many():
    """Missing TSV columns take the default; a trailing tab gives an empty field."""
    lines = ['a', 'b\tc', 'd\te\t', 'f\tg\th']

    def first_three(row):
        return (row.get(0), row.get(1, 'xx'), row.get(2, 'xx'))

    tube = tubes.Each(lines).to(tubes.TsvRow).multi(first_three).to(str, str, str)
    expected = [
        ('a', 'xx', 'xx'),
        ('b', 'c', 'xx'),
        ('d', 'e', ''),
        ('f', 'g', 'h'),
    ]
    assert list(tube) == expected
def test_reading_tsv_headers_different_orders():
    """Named column lookups follow each TSV input's own header order."""
    # NOTE(review): line breaks in these fixtures were reconstructed from a
    # whitespace-collapsed source (one row per line, trailing newline).
    first_doc = "a\tb\tc\n1\t2\t3\n4\t5\t6\n"
    second_doc = "c\ta\tb\n9\t7\t8\n12\t10\t11\n"

    def by_name(row):
        return (row.get('a'), row.get('b'), row.get('c'))

    lines = tubes.Each([first_doc, second_doc]).to(bytes).split()
    rows = lines.tsv(headers=True, split=False)
    tube = rows.chunk(1).multi(by_name).to(int, int, int)
    assert list(tube) == [(1, 2, 3), (4, 5, 6), (7, 8, 9), (10, 11, 12)]
def test_fuzz_tsv(seed):
    """Fuzz ``tubes.TsvRow`` parsing against rows produced by ``get_tsv(seed)``.

    Reads the selected columns of every generated row, using ``'xx'`` as the
    default for missing columns, and compares them with the expected rows.

    Args:
        seed: Value passed to ``get_tsv`` to generate the rows, the expected
            results and the columns to read.
    """
    tsv_rows, expected_rows, cols_to_read = get_tsv(seed)
    slot_tube = tubes.Each(tsv_rows).to(tubes.TsvRow).multi(
        lambda x: [x.get(c, 'xx').to(str) for c in cols_to_read])
    actual_rows = list(slot_tube)

    # Compare the row counts first so that a short result fails with a clear
    # assertion instead of an IndexError while indexing ``actual_rows``.
    assert len(expected_rows) == len(actual_rows)

    # With exactly one selected column each result is compared as a bare
    # value rather than being indexed.
    single_column = len(cols_to_read) == 1
    for want_row, got_row in zip(expected_rows, actual_rows):
        for col_num in range(len(cols_to_read)):
            expected = want_row[col_num]
            actual = got_row if single_column else got_row[col_num]
            assert expected == actual
def test_recover_bad_json_with_skip():
    """Pin the current behaviour of skip(2) when a skipped JSON document is malformed."""
    documents = ['[1,2]', '[', '{"a": 1}', '12']
    iterator = iter(tubes.Each(documents).to(str).json().skip(2))
    collected = []
    while True:
        try:
            item = next(iterator)
        except StopIteration:
            break
        except ValueError:
            # Record the failure and keep pulling from the same iterator.
            item = 'ERR'
        collected.append(item)
    # TODO: This /should/ return {"a": 1}, 12
    # but rewinding the stack to the right place is hard
    assert collected == ['ERR', 12]