def test_write_wrong_complex_type(orc_type, value):
    """Writing ``value`` with a writer built for ``orc_type`` must be rejected."""
    buffer = io.BytesIO()
    orc_writer = Writer(buffer, orc_type)
    # Dict construction might raise ValueError as well.
    accepted_errors = (TypeError, ValueError)
    with pytest.raises(accepted_errors):
        orc_writer.write(value)
def test_attributes(schema, attrs):
    """Attributes set on the schema are readable from the written file."""
    buffer = io.BytesIO()
    schema.set_attributes(attrs)
    orc_writer = Writer(buffer, schema)
    orc_writer.close()

    orc_reader = Reader(buffer)
    # No rows were written, only the schema (with its attributes).
    assert len(orc_reader) == 0
    assert orc_reader.schema.attributes == attrs
def test_struct_repr():
    """Rows that do not match the writer's struct representation raise TypeError."""
    buffer = io.BytesIO()

    # Writer built without struct_repr: a dict row is rejected.
    orc_writer = Writer(buffer, "struct<a:int>")
    with pytest.raises(TypeError):
        orc_writer.write({"a": 1})

    # Writer built with StructRepr.DICT: a tuple row and a wrongly typed
    # dict value are both rejected.
    orc_writer = Writer(buffer, "struct<a:int>", struct_repr=StructRepr.DICT)
    for bad_row in ((1,), {"a": "b"}):
        with pytest.raises(TypeError):
            orc_writer.write(bad_row)
def test_writerows():
    """``writerows`` returns the number of rows written and stores them all."""
    buffer = io.BytesIO()
    orc_writer = Writer(buffer, "int")
    rows = tuple(range(10))

    written = orc_writer.writerows(rows)
    orc_writer.close()
    assert written == len(rows)

    buffer.seek(0)
    orc_reader = Reader(buffer)
    assert list(rows) == orc_reader.read()
def test_open_file():
    """Writer rejects text-mode, read-only and non-file targets; accepts "wb"."""
    with tempfile.NamedTemporaryFile(mode="wt") as text_file:
        # Text-mode file object.
        with pytest.raises(ParseError):
            _ = Writer(text_file, "int")
        # Same path reopened as a read-only binary file.
        with open(text_file.name, "rb") as read_only, pytest.raises(
            io.UnsupportedOperation
        ):
            _ = Writer(read_only, "int")

    with tempfile.NamedTemporaryFile(mode="wb") as binary_file:
        orc_writer = Writer(binary_file, "int")
        assert isinstance(orc_writer, Writer)

    # An integer is not a file object at all.
    with pytest.raises(TypeError):
        _ = Writer(0, "int")
def test_open_file(output_file):
    """Writer rejects text-mode, read-only and non-file targets; accepts "wb"."""
    output_file.close()
    path = output_file.name

    # Text-mode file object.
    with open(path, mode="wt") as text_file, pytest.raises(ParseError):
        _ = Writer(text_file, "int")

    # Read-only binary file object.
    with open(path, "rb") as read_only, pytest.raises(io.UnsupportedOperation):
        _ = Writer(read_only, "int")

    with open(path, mode="wb") as binary_file:
        orc_writer = Writer(binary_file, "int")
        assert isinstance(orc_writer, Writer)

    # An integer is not a file object at all.
    with pytest.raises(TypeError):
        _ = Writer(0, "int")
def test_next():
    """``next(reader)`` yields each written row, then raises StopIteration."""
    schema = "struct<col0:int,col1:string>"

    # A file with no rows: the very first next() call is exhausted.
    buffer = io.BytesIO()
    Writer(buffer, schema).close()
    with pytest.raises(StopIteration):
        orc_reader = Reader(buffer)
        next(orc_reader)

    # A file with a single row: one value, then exhaustion.
    expected = (0, "Test A")
    buffer = io.BytesIO()
    with Writer(buffer, schema) as orc_writer:
        orc_writer.write(expected)
    orc_reader = Reader(buffer)
    assert next(orc_reader) == expected
    with pytest.raises(StopIteration):
        next(orc_reader)
def test_schema():
    """``Writer.schema`` is read-only and stays usable after the writer is deleted."""
    schema_str = "struct<col0:int,col1:string>"
    buffer = io.BytesIO()
    orc_writer = Writer(buffer, schema_str)
    assert str(orc_writer.schema) == schema_str

    # The attribute can be neither reassigned nor deleted.
    with pytest.raises(AttributeError):
        orc_writer.schema = "fail"
    with pytest.raises(AttributeError):
        del orc_writer.schema

    # Keep a reference to the schema, then drop the writer that produced it.
    kept_schema = orc_writer.schema
    del orc_writer
    assert isinstance(kept_schema, TypeDescription)
    assert kept_schema.kind == TypeKind.STRUCT
def test_context_manager():
    """Rows written inside a ``with Writer(...)`` block can be read back."""
    buffer = io.BytesIO()
    columns = ("col0", "col1", "col2")
    raw_rows = (
        (1, "Test A", 2.13),
        (2, "Test B", 0.123213),
        (3, "Test C", 123.011234),
    )
    records = [dict(zip(columns, row)) for row in raw_rows]

    with Writer(
        buffer,
        "struct<col0:int,col1:string,col2:double>",
        struct_repr=StructRepr.DICT,
    ) as orc_writer:
        for record in records:
            orc_writer.write(record)

    buffer.seek(0)
    orc_reader = Reader(buffer, struct_repr=StructRepr.DICT)
    assert orc_reader.read() == records
def test_open_file(output_file):
    """Reader rejects unusable or malformed files and opens a valid empty one."""
    output_file.close()
    path = output_file.name

    with open(path, "wb") as sink:
        # Write-only file object.
        with pytest.raises(ParseError):
            _ = Reader(sink)
        # Write invalid bytes:
        sink.write(b"TESTTORC\x08\x03\x10\x03")
    with open(path, "rb") as source, pytest.raises(ParseError):
        _ = Reader(source)

    # Replace the content with a second byte string the Reader must reject.
    with open(path, "wb") as sink:
        sink.write(b'ORC\x08\x03\x10\x03"k\x08\x0c\x12\x0c\x01\x02\x03')
    with open(path, "rt") as source, pytest.raises(ParseError):
        _ = Reader(source)
    with open(path, "rb") as source, pytest.raises(ParseError):
        _ = Reader(source)

    # Finally write a file through Writer itself (no rows).
    with open(path, "wb") as sink:
        Writer(sink, "struct<col0:int,col1:string>").close()
    with open(path, "ab") as appender, pytest.raises(io.UnsupportedOperation):
        _ = Reader(appender)
    with open(path, "rb") as source:
        orc_reader = Reader(source)
        assert orc_reader is not None
        assert len(orc_reader) == 0
def test_include():
    """Column selection by index or name returns only the selected columns."""
    buffer = io.BytesIO()
    record = {"col0": 1, "col1": "Test A", "col2": 3.14}
    with Writer(
        buffer,
        "struct<col0:int,col1:string,col2:double>",
        struct_repr=StructRepr.DICT,
    ) as orc_writer:
        orc_writer.write(record)
    buffer.seek(0)

    def open_reader(**selection):
        # Every reader in this test shares the buffer and the dict representation.
        return Reader(buffer, struct_repr=StructRepr.DICT, **selection)

    # Selection by column index.
    assert next(open_reader(column_indices=[0])) == {"col0": 1}
    assert next(open_reader(column_indices=[0, 2])) == {"col0": 1, "col2": 3.14}
    with pytest.raises(TypeError):
        _ = open_reader(column_indices=[0, "2"])

    # Selection by column name.
    assert next(open_reader(column_names=["col0"])) == {"col0": 1}
    assert next(open_reader(column_names=["col1", "col2"])) == {
        "col1": "Test A",
        "col2": 3.14,
    }
    with pytest.raises(TypeError):
        _ = open_reader(column_names=["col1", 2])

    # Unknown columns, and names combined with indices, raise ValueError.
    with pytest.raises(ValueError):
        _ = open_reader(column_indices=[10])
    with pytest.raises(ValueError):
        _ = open_reader(column_names=["col5"])
    with pytest.raises(ValueError):
        _ = open_reader(column_names=["col1"], column_indices=[2])
def test_complex_predicate_results():
    """Combined (``&`` / ``|``) predicates yield the expected number of rows."""
    buffer = io.BytesIO()
    with Writer(
        buffer, "struct<c0:int,c1:string>", row_index_stride=100
    ) as orc_writer:
        # Rows 301..450 carry "A"; every other row carries "B".
        orc_writer.writerows(
            (i, "A") if 300 < i <= 450 else (i, "B") for i in range(1000)
        )
    buffer.seek(0)

    predicate = (PredicateColumn(TypeKind.INT, "c0") < 100) & (
        PredicateColumn(TypeKind.STRING, "c1") == "A"
    )
    orc_reader = Reader(buffer, predicate=predicate)
    assert list(orc_reader) == []

    predicate = (PredicateColumn(TypeKind.INT, "c0") > 300) & (
        PredicateColumn(TypeKind.STRING, "c1") == "A"
    )
    orc_reader = Reader(buffer, predicate=predicate)
    rows = list(orc_reader)
    assert len(rows) == 200
    assert sum(1 for row in rows if row[1] == "A") == 150

    predicate = (PredicateColumn(TypeKind.INT, "c0") >= 400) & (
        PredicateColumn(TypeKind.STRING, "c1") != "A"
    )
    orc_reader = Reader(buffer, predicate=predicate)
    rows = list(orc_reader)
    assert len(rows) == 600

    # The string column is addressed by index here instead of by name.
    predicate = (PredicateColumn(TypeKind.INT, "c0") < 100) | (
        PredicateColumn(TypeKind.STRING, index=2) != "B"
    )
    orc_reader = Reader(buffer, predicate=predicate)
    rows = list(orc_reader)
    assert len(rows) == 300
def test_open_file():
    """Reader rejects unusable or malformed files and opens a valid empty one."""
    with tempfile.NamedTemporaryFile(mode="wb") as sink:
        path = sink.name

        # Write-only file object.
        with pytest.raises(ParseError):
            _ = Reader(sink)

        # First byte string the Reader must reject.
        sink.write(b"TESTTORC\x08\x03\x10\x03")
        sink.flush()
        sink.seek(0)
        with open(path, "rb") as source, pytest.raises(ParseError):
            _ = Reader(source)

        # Second byte string, tried in both text and binary mode.
        sink.write(b'ORC\x08\x03\x10\x03"k\x08\x0c\x12\x0c\x01\x02\x03')
        sink.flush()
        sink.seek(0)
        with open(path, "rt") as source, pytest.raises(ParseError):
            _ = Reader(source)
        with open(path, "rb") as source, pytest.raises(ParseError):
            _ = Reader(source)

        # Write a file through Writer itself (no rows).
        sink.seek(0)
        Writer(sink, "struct<col0:int,col1:string>").close()
        with open(path, "ab") as appender, pytest.raises(io.UnsupportedOperation):
            _ = Reader(appender)
        with open(path, "rb") as source:
            orc_reader = Reader(source)
            assert orc_reader is not None
            assert len(orc_reader) == 0
def test_timestamp_with_timezones(schema, writer_tz, reader_tz, input, expected):
    """A value written with ``writer_tz`` reads back as ``expected`` under ``reader_tz``."""
    buffer = io.BytesIO()
    single_column_row = (input,)
    with Writer(buffer, schema, timezone=writer_tz) as orc_writer:
        orc_writer.write(single_column_row)

    orc_reader = Reader(buffer, timezone=reader_tz)
    first_row = next(orc_reader)
    assert first_row[0] == expected
def test_wrong_predicate():
    """Passing a plain string as ``predicate`` raises TypeError."""
    buffer = io.BytesIO()
    with Writer(
        buffer, "struct<c0:int,c1:string>", row_index_stride=100
    ) as orc_writer:
        orc_writer.writerows(
            (i, "Even" if i % 2 == 0 else "Odd") for i in range(1000)
        )
    buffer.seek(0)

    with pytest.raises(TypeError):
        _ = Reader(buffer, predicate="wrong")
def test_bytes_lengths():
    """``Reader.bytes_lengths`` reports the expected sizes for two small files."""
    # File without rows, written with compression=0.
    buffer = io.BytesIO()
    Writer(buffer, "string", compression=0).close()
    orc_reader = Reader(buffer)
    expected_lengths = {
        "content_length": 0,
        "file_footer_length": 38,
        "file_postscript_length": 23,
        "file_length": 65,
        "stripe_statistics_length": 0,
    }
    for key, expected in expected_lengths.items():
        assert orc_reader.bytes_lengths[key] == expected

    # File with 100 integer rows, written with the writer's defaults.
    buffer = io.BytesIO()
    with Writer(buffer, "int") as orc_writer:
        orc_writer.writerows(range(100))
    orc_reader = Reader(buffer)
    expected_lengths = {
        "content_length": 76,
        "file_footer_length": 59,
        "file_postscript_length": 23,
        "file_length": len(buffer.getvalue()),
        "stripe_statistics_length": 21,
    }
    for key, expected in expected_lengths.items():
        assert orc_reader.bytes_lengths[key] == expected
def test_len():
    """``len(reader)`` equals the number of rows written (0, 1 and 10)."""
    schema = "struct<col0:int,col1:string>"

    # No rows.
    buffer = io.BytesIO()
    Writer(buffer, schema).close()
    orc_reader = Reader(buffer)
    assert len(orc_reader) == 0

    # One row.
    buffer = io.BytesIO()
    with Writer(buffer, schema) as orc_writer:
        orc_writer.write((0, "Test A"))
    orc_reader = Reader(buffer)
    assert len(orc_reader) == 1

    # Ten rows.
    buffer = io.BytesIO()
    with Writer(buffer, schema) as orc_writer:
        for number in range(10):
            orc_writer.write((number, "Test"))
    orc_reader = Reader(buffer)
    assert len(orc_reader) == 10
def test_empty_predicate_result():
    """A predicate that matches nothing yields no rows; ``len`` stays non-zero."""
    buffer = io.BytesIO()
    with Writer(
        buffer, "struct<c0:int,c1:string>", row_index_stride=100
    ) as orc_writer:
        orc_writer.writerows(
            (i, "Even" if i % 2 == 0 else "Odd") for i in range(1000)
        )
    buffer.seek(0)

    # c0 ranges over 0..999, so "c0 < 0" matches no written row.
    no_match = PredicateColumn(TypeKind.INT, "c0") < 0
    orc_reader = Reader(buffer, predicate=no_match)
    assert len(orc_reader) != 0
    assert list(orc_reader) == []
def test_writer_id():
    """``Reader.writer_id`` is read-only and equals "ORC_CPP_WRITER"."""
    buffer = io.BytesIO()
    with Writer(buffer, "int") as orc_writer:
        orc_writer.writerows(range(10))

    orc_reader = Reader(buffer)
    # The attribute can be neither reassigned nor deleted.
    with pytest.raises(AttributeError):
        orc_reader.writer_id = "fail"
    with pytest.raises(AttributeError):
        del orc_reader.writer_id
    assert orc_reader.writer_id == "ORC_CPP_WRITER"
def test_compression(kind):
    """``Reader.compression`` is read-only and equals the kind used for writing."""
    buffer = io.BytesIO()
    with Writer(buffer, "int", compression=kind) as orc_writer:
        orc_writer.writerows(range(10))

    orc_reader = Reader(buffer)
    # The attribute can be neither reassigned nor deleted.
    with pytest.raises(AttributeError):
        orc_reader.compression = "fail"
    with pytest.raises(AttributeError):
        del orc_reader.compression
    assert orc_reader.compression == kind
def test_compression(kind):
    """50000 rows written with compression ``kind`` read back unchanged."""
    buffer = io.BytesIO()
    schema = "struct<a:int,b:string,c:double>"
    with Writer(buffer, schema, compression=kind) as orc_writer:
        orc_writer.writerows((num, "ABCDEFG", 0.12) for num in range(50000))
    buffer.seek(0)

    orc_reader = Reader(buffer)
    assert orc_reader.compression == kind
    # Rows come back in insertion order, so the row index equals column "a".
    for position, row in enumerate(orc_reader):
        assert row == (position, "ABCDEFG", 0.12)
def test_read_custom_null_value(orc_type, value):
    """A written ``None`` reads back as the ``null_value`` given to the Reader."""
    buffer = io.BytesIO()
    with Writer(buffer, orc_type) as orc_writer:
        orc_writer.write(value)
        orc_writer.write(None)

    orc_reader = Reader(buffer, null_value=NullValue())
    first = next(orc_reader)
    if orc_type in ("float", "double"):
        # Floating-point columns are compared with a relative tolerance.
        assert math.isclose(first, value, rel_tol=1e-07, abs_tol=0.0)
    else:
        assert first == value
    assert next(orc_reader) is NullValue()
def test_current_row():
    """``Writer.current_row`` counts written rows and matches ``len(reader)``."""
    buffer = io.BytesIO()
    orc_writer = Writer(buffer, "struct<col0:int,col1:string,col2:double>")
    assert orc_writer.current_row == 0

    orc_writer.write((0, "Test A", 0.0001))
    assert orc_writer.current_row == 1

    for number in range(10):
        orc_writer.write((number, "Test A", 0.0001))
    assert orc_writer.current_row == 11

    orc_writer.close()
    buffer.seek(0)
    orc_reader = Reader(buffer)
    # The counter is still readable after close().
    assert orc_writer.current_row == len(orc_reader)
def _init(row):
    """Return a rewound BytesIO holding ``row`` single-column rows ``(0,)``..``(row - 1,)``."""
    buffer = io.BytesIO()
    writer_options = {
        "batch_size": 65535,
        "stripe_size": 128,
        "compression_block_size": 128,
    }
    with Writer(buffer, "struct<col0:int>", **writer_options) as orc_writer:
        for number in range(row):
            orc_writer.write((number,))
    buffer.seek(0)
    return buffer
def _init(schema, rows, bfc=tuple()):
    """Return a rewound BytesIO holding ``rows`` written with ``schema``.

    ``bfc`` is passed to the Writer as ``bloom_filter_columns``.
    """
    buffer = io.BytesIO()
    writer_options = {
        "batch_size": 65535,
        "stripe_size": 128,
        "compression_block_size": 128,
        "bloom_filter_columns": bfc,
    }
    with Writer(buffer, schema, **writer_options) as orc_writer:
        orc_writer.writerows(rows)
    buffer.seek(0)
    return buffer
def test_metadata():
    """``set_metadata`` stores bytes values, overwrites keys and rejects ``str``."""
    buffer = io.BytesIO()
    with Writer(buffer, "int") as orc_writer:
        orc_writer.set_metadata(
            test="test1".encode("UTF-8"), meta=b"\x30\x40\x50\x60"
        )
        # A second call with the same key replaces the earlier value.
        orc_writer.set_metadata(test="test2".encode("UTF-8"))
        with pytest.raises(TypeError):
            orc_writer.set_metadata(meta="string")

    orc_reader = Reader(buffer)
    assert len(orc_reader) == 0
    expected_metadata = {
        "test": "test2".encode("UTF-8"),
        "meta": b"\x30\x40\x50\x60",
    }
    assert orc_reader.metadata == expected_metadata
def _init(row):
    """Return a rewound BytesIO holding ``row`` dict rows.

    Row ``i`` is ``{"col0": i, "col1": "Test <letter>"}``, where the letter
    cycles through the uppercase ASCII alphabet.
    """
    buffer = io.BytesIO()
    letters = string.ascii_uppercase
    with Writer(
        buffer, "struct<col0:int,col1:string>", struct_repr=StructRepr.DICT
    ) as orc_writer:
        for number in range(row):
            label = "Test {0}".format(letters[number % 26])
            orc_writer.write({"col0": number, "col1": label})
    buffer.seek(0)
    return buffer
def test_write():
    """Tuple rows written one at a time are read back unchanged."""
    buffer = io.BytesIO()
    orc_writer = Writer(buffer, "struct<col0:int,col1:string,col2:double>")
    records = [
        (1, "Test A", 2.13),
        (2, "Test B", 0.123213),
        (3, "Test C", 123.011234),
    ]
    for record in records:
        orc_writer.write(record)
    orc_writer.close()

    buffer.seek(0)
    orc_reader = Reader(buffer)
    assert orc_reader.read() == records
def test_write_complex_type(orc_type, values):
    """``values`` written as ``orc_type`` with dict structs are read back unchanged."""
    buffer = io.BytesIO()
    orc_writer = Writer(buffer, orc_type, struct_repr=StructRepr.DICT)
    for record in values:
        orc_writer.write(record)
    orc_writer.close()

    buffer.seek(0)
    orc_reader = Reader(buffer, struct_repr=StructRepr.DICT)
    assert orc_reader.read() == values
def test_schema():
    """``Reader.schema`` is read-only and stays usable after the reader is deleted."""
    schema_str = "struct<col0:int,col1:string>"
    data = io.BytesIO()
    Writer(data, schema_str).close()
    reader = Reader(data)
    assert str(reader.schema) == schema_str

    # The attribute can be neither reassigned nor deleted.
    with pytest.raises(AttributeError):
        reader.schema = "fail"
    with pytest.raises(AttributeError):
        del reader.schema

    # Keep a reference to the schema, then drop the reader that produced it.
    schema = reader.schema
    del reader
    # Fixed: the original checked against the undefined lower-case name
    # `typedescription`, which raises NameError; the class is TypeDescription.
    assert isinstance(schema, TypeDescription)
    assert schema.kind == TypeKind.STRUCT