Пример #1
0
def test_bytes_offset(striped_orc_data):
    """Check ``Stripe.bytes_offset`` of stripe 1 and that it cannot be set."""
    orc_file = striped_orc_data("int", (n for n in range(100000)))
    orc_reader = Reader(orc_file)
    second_stripe = Stripe(orc_reader, 1)

    # Hardcoded expectation; presumably tied to how the fixture writes the
    # file (fixture not visible here) -- verify if the fixture changes.
    expected_offset = 658
    assert second_stripe.bytes_offset == expected_offset

    # Assigning to the attribute must be rejected.
    with pytest.raises(AttributeError):
        second_stripe.bytes_offset = 5
Пример #2
0
def test_bytes_length(striped_orc_data):
    """Check ``Stripe.bytes_length`` of stripe 1 and that it cannot be set."""
    orc_file = striped_orc_data("int", (n for n in range(100000)))
    orc_reader = Reader(orc_file)
    second_stripe = Stripe(orc_reader, 1)

    # Hardcoded expectation; presumably tied to how the fixture writes the
    # file (fixture not visible here) -- verify if the fixture changes.
    expected_length = 392
    assert second_stripe.bytes_length == expected_length

    # Assigning to the attribute must be rejected.
    with pytest.raises(AttributeError):
        second_stripe.bytes_length = "false"
Пример #3
0
def test_row_offset(striped_orc_data):
    """Check ``Stripe.row_offset`` for stripes 0 and 1 and that it is read-only."""
    orc_file = striped_orc_data("int", (n for n in range(100000)))
    orc_reader = Reader(orc_file)
    first_stripe = Stripe(orc_reader, 0)

    # The first stripe starts at row 0.
    assert first_stripe.row_offset == 0

    # The second stripe starts right after the rows of the first one.
    second_stripe = Stripe(orc_reader, 1)
    assert second_stripe.row_offset == len(first_stripe)

    # Assigning to the attribute must be rejected.
    with pytest.raises(AttributeError):
        first_stripe.row_offset = 5
Пример #4
0
def test_init(striped_orc_data):
    """Check which ``Stripe(...)`` argument combinations raise and which work."""
    orc_file = striped_orc_data("int", (n for n in range(100000)))
    orc_reader = Reader(orc_file)

    # (expected exception, reader argument, stripe index argument)
    invalid_calls = (
        (TypeError, None, 0),
        (TypeError, "reader", 0),
        (IndexError, orc_reader, 3),
        (TypeError, orc_reader, "col"),
    )
    for expected_error, source, index in invalid_calls:
        with pytest.raises(expected_error):
            Stripe(source, index)

    # A valid reader with a valid index yields an object.
    assert Stripe(orc_reader, 0) is not None
Пример #5
0
def test_writer_timezone(striped_orc_data):
    """Check ``Stripe.writer_timezone`` is ``"UTC"`` and cannot be set."""

    def utc_timestamps():
        # Yield UTC datetimes every 10 seconds, both endpoints included.
        current = datetime(2010, 9, 1, 7, 0, 0, 0, timezone.utc)
        last = datetime(2010, 9, 10, 12, 0, 0, 0, timezone.utc)
        step = timedelta(seconds=10)
        while current <= last:
            yield current
            current += step

    orc_file = striped_orc_data("timestamp", utc_timestamps())
    orc_reader = Reader(orc_file)
    second_stripe = Stripe(orc_reader, 1)

    assert second_stripe.writer_timezone == "UTC"

    # Assigning to the attribute must be rejected.
    with pytest.raises(AttributeError):
        second_stripe.writer_timezone = "UTC-9:00"
Пример #6
0
def test_statistics_string(striped_orc_data):
    """Check string column statistics at stripe level and at file level."""
    rows = ("Test String {0}".format(n + 1) for n in range(100000))
    orc_file = striped_orc_data("string", rows)
    orc_reader = Reader(orc_file)
    first_stripe = Stripe(orc_reader, 0)

    stripe_stat = first_stripe[0].statistics
    assert stripe_stat["has_null"] is False
    assert stripe_stat["kind"] == TypeKind.STRING
    assert stripe_stat["number_of_values"] == 65535
    # Iterates over the stripe's rows to compute the expected total length.
    assert stripe_stat["total_length"] == sum(map(len, first_stripe))
    assert stripe_stat["minimum"] == "Test String 1"
    # A fresh Stripe object is created for the second pass over the rows.
    assert stripe_stat["maximum"] == max(Stripe(orc_reader, 0))

    file_stat = orc_reader[0].statistics
    assert file_stat["maximum"] == max(orc_reader)

    second_stripe_stat = orc_reader.read_stripe(1)[0].statistics
    assert second_stripe_stat["minimum"] == "Test String 100000"
Пример #7
0
def test_len(striped_orc_data):
    """Check ``len()`` of a stripe: differs from the reader's and is 65535."""
    orc_file = striped_orc_data("int", (n for n in range(100000)))
    orc_reader = Reader(orc_file)
    first_stripe = Stripe(orc_reader, 0)

    reader_rows = len(orc_reader)
    stripe_rows = len(first_stripe)
    assert reader_rows != stripe_rows
    assert stripe_rows == 65535
Пример #8
0
def test_getitem(striped_orc_data):
    """Check that indexing a reader and a stripe with 0 returns an object."""
    orc_file = striped_orc_data("int", (n for n in range(100000)))
    orc_reader = Reader(orc_file)
    first_stripe = Stripe(orc_reader, 0)

    reader_column = orc_reader[0]
    assert reader_column is not None

    stripe_column = first_stripe[0]
    assert stripe_column is not None
Пример #9
0
def test_bloom_filter_columns(striped_orc_data):
    """Check ``Stripe.bloom_filter_columns`` with and without ``bfc`` given."""
    requested_columns = (0, 1)
    rows = ((n, "Test {}".format(n + 1)) for n in range(100000))
    orc_file = striped_orc_data(
        "struct<col0:int,col1:string>", rows, bfc=requested_columns
    )
    orc_reader = Reader(orc_file)
    # Both stripes report the columns passed to the fixture as ``bfc``.
    assert Stripe(orc_reader, 0).bloom_filter_columns == requested_columns
    assert Stripe(orc_reader, 1).bloom_filter_columns == requested_columns

    # Without ``bfc`` the property is an empty tuple.
    plain_file = striped_orc_data("int", (n for n in range(100000)))
    plain_reader = Reader(plain_file)
    plain_stripe = Stripe(plain_reader, 0)
    assert plain_stripe.bloom_filter_columns == ()

    # Assigning to the attribute must be rejected.
    with pytest.raises(AttributeError):
        plain_stripe.bloom_filter_columns = (0,)
Пример #10
0
def test_statistics_binary(striped_orc_data):
    """Check binary column statistics at stripe level and at file level."""
    rows = (b"\x4D\x45\x34\x01" for _ in range(100000))
    orc_file = striped_orc_data("binary", rows)
    orc_reader = Reader(orc_file)
    first_stripe = Stripe(orc_reader, 0)

    stripe_stat = first_stripe[0].statistics
    assert stripe_stat["has_null"] is False
    assert stripe_stat["kind"] == TypeKind.BINARY
    assert stripe_stat["number_of_values"] == 65535
    # Iterates over the stripe's rows to compute the expected total length.
    assert stripe_stat["total_length"] == sum(map(len, first_stripe))

    file_stat = orc_reader[0].statistics
    # Iterates over the reader's rows to compute the expected total length.
    assert file_stat["total_length"] == sum(map(len, orc_reader))
Пример #11
0
def test_statistics_date(striped_orc_data):
    """Check date column statistics at stripe level and at file level."""
    first_day = date(1900, 1, 1)
    rows = (first_day + timedelta(days=n) for n in range(100000))
    orc_file = striped_orc_data("date", rows)
    orc_reader = Reader(orc_file)
    first_stripe = Stripe(orc_reader, 0)

    stripe_stat = first_stripe[0].statistics
    assert stripe_stat["kind"] == TypeKind.DATE
    assert stripe_stat["has_null"] is False
    assert stripe_stat["number_of_values"] == 65535
    assert stripe_stat["minimum"] == date(1900, 1, 1)
    assert stripe_stat["maximum"] == date(2079, 6, 5)

    file_stat = orc_reader[0].statistics
    # Iterates over the reader's rows to find the expected maximum.
    assert file_stat["maximum"] == max(orc_reader)
Пример #12
0
def test_statistics_int(striped_orc_data):
    """Check int column statistics for stripe 0, the whole file and stripe 1."""
    orc_file = striped_orc_data("int", (n for n in range(100000)))
    orc_reader = Reader(orc_file)
    first_stripe = Stripe(orc_reader, 0)

    stripe_stat = first_stripe[0].statistics
    assert stripe_stat["has_null"] is False
    assert stripe_stat["number_of_values"] == 65535
    assert stripe_stat["kind"] == TypeKind.INT
    assert stripe_stat["minimum"] == 0
    assert stripe_stat["maximum"] == 65534
    # Values are 0..N-1, so the stripe sum is the sum of the first
    # len(stripe) integers.
    assert stripe_stat["sum"] == sum(range(len(first_stripe)))

    file_stat = orc_reader[0].statistics
    assert file_stat["minimum"] == 0
    assert file_stat["maximum"] == 99999
    assert file_stat["sum"] == sum(range(100000))

    second_stripe_stat = orc_reader.read_stripe(1)[0].statistics
    assert second_stripe_stat["minimum"] == 65535
Пример #13
0
def test_statistics_double(striped_orc_data):
    """Check double column statistics for stripe 0, the whole file and stripe 1.

    Floating-point expectations that result from arithmetic (sums and the
    ``i * 0.1`` values) are compared with ``math.isclose`` rather than ``==``,
    so the test does not depend on the exact accumulation order or rounding
    used when the statistics were produced.
    """
    data = striped_orc_data("double", (i * 0.1 for i in range(100000)))
    reader = Reader(data)
    stripe = Stripe(reader, 0)

    stat = stripe[0].statistics
    assert stat["has_null"] is False
    assert stat["number_of_values"] == 65535
    assert stat["kind"] == TypeKind.DOUBLE
    # 0 * 0.1 is exactly 0.0, so an exact comparison is safe here.
    assert stat["minimum"] == 0
    assert math.isclose(stat["maximum"], 6553.4)
    assert math.isclose(
        stat["sum"], sum(i * 0.1 for i in range(len(stripe)))
    )

    stat = reader[0].statistics
    assert stat["minimum"] == 0
    assert math.isclose(stat["maximum"], 9999.9)
    assert math.isclose(stat["sum"], sum(i * 0.1 for i in range(100000)))

    # First value of the second stripe is 65535 * 0.1.
    assert math.isclose(
        reader.read_stripe(1)[0].statistics["minimum"], 6553.5
    )
Пример #14
0
def test_init(striped_orc_data):
    """Check which ``Column(...)`` argument combinations raise and which work."""
    rows = ((n, n * 5) for n in range(100000))
    orc_file = striped_orc_data("struct<a:int,b:int>", rows)
    orc_reader = Reader(orc_file, column_indices=(1, ))
    first_stripe = Stripe(orc_reader, 0)

    # (expected exception, stream argument, column index argument)
    invalid_calls = (
        (TypeError, first_stripe, "0"),
        (IndexError, first_stripe, 100),
        (IndexError, orc_reader, 100),
        (IndexError, orc_reader, 1),
    )
    for expected_error, stream, index in invalid_calls:
        with pytest.raises(expected_error):
            Column(stream, index)

    # Index 0 is accepted for both a stripe and a reader.
    stripe_column = Column(first_stripe, 0)
    assert stripe_column is not None
    reader_column = Column(orc_reader, 0)
    assert reader_column is not None
Пример #15
0
def test_statistics_bool(striped_orc_data):
    """Check struct and boolean column statistics at stripe and file level."""
    cycle = (True, False, None)
    rows = ((cycle[n % 3], ) for n in range(100000))
    orc_file = striped_orc_data("struct<a:boolean>", rows)
    orc_reader = Reader(orc_file)
    first_stripe = Stripe(orc_reader, 0)

    # Column 0 is the enclosing struct.
    struct_stat = first_stripe[0].statistics
    assert struct_stat["has_null"] is False
    assert struct_stat["number_of_values"] == 65535
    assert struct_stat["kind"] == TypeKind.STRUCT

    # Column 1 is the boolean field; every third value is None.
    bool_stat = first_stripe[1].statistics
    assert bool_stat["has_null"] is True
    assert bool_stat["kind"] == TypeKind.BOOLEAN
    assert bool_stat["number_of_values"] == 43690
    assert bool_stat["false_count"] == 21845
    # Iterates over the stripe's rows to count the True values.
    assert bool_stat["true_count"] == sum(
        1 for (flag, ) in first_stripe if flag is True
    )

    file_bool_stat = orc_reader[1].statistics
    assert file_bool_stat["has_null"] is True
    assert file_bool_stat["number_of_values"] == 66667
    # Iterates over the reader's rows to count the False values.
    assert file_bool_stat["false_count"] == sum(
        1 for (flag, ) in orc_reader if flag is False
    )
    assert file_bool_stat["true_count"] == 33334
    assert orc_reader[0].statistics["number_of_values"] == 100000
Пример #16
0
def test_statistics_decimal(striped_orc_data):
    """Check decimal column statistics for stripe 0, the file and stripe 1."""

    def value_at(n):
        # Value written for row ``n``; also used to rebuild expected sums.
        return Decimal("1000.1") + Decimal((n + 100) * 0.1)

    three_places = Decimal("1.000")
    orc_file = striped_orc_data(
        "decimal(10,3)", (value_at(n) for n in range(100000))
    )
    orc_reader = Reader(orc_file)
    first_stripe = Stripe(orc_reader, 0)

    stripe_stat = first_stripe[0].statistics
    assert stripe_stat["kind"] == TypeKind.DECIMAL
    assert stripe_stat["has_null"] is False
    assert stripe_stat["number_of_values"] == len(first_stripe)
    assert stripe_stat["minimum"] == Decimal("1010.100")
    assert stripe_stat["maximum"] == Decimal("7563.500")
    expected_stripe_sum = sum(
        value_at(n) for n in range(len(first_stripe))
    ).quantize(three_places)
    assert stripe_stat["sum"] == expected_stripe_sum

    file_stat = orc_reader[0].statistics
    expected_file_sum = sum(
        value_at(n) for n in range(100000)
    ).quantize(three_places)
    assert file_stat["sum"] == expected_file_sum

    second_stripe_stat = orc_reader.read_stripe(1)[0].statistics
    assert second_stripe_stat["minimum"] == Decimal("7563.600")
Пример #17
0
def test_statistics_timestamp(striped_orc_data):
    """Check timestamp column statistics at stripe level and at file level."""
    first_moment = datetime(2000, 1, 1, 12, 0, tzinfo=timezone.utc)
    rows = (first_moment + timedelta(minutes=n) for n in range(100000))
    orc_file = striped_orc_data("timestamp", rows)
    orc_reader = Reader(orc_file)
    first_stripe = Stripe(orc_reader, 0)

    stripe_stat = first_stripe[0].statistics
    assert stripe_stat["kind"] == TypeKind.TIMESTAMP
    assert stripe_stat["has_null"] is False
    assert stripe_stat["number_of_values"] == len(first_stripe)
    assert stripe_stat["minimum"] == first_moment
    # Iterates over the stripe's rows to find the expected maximum.
    assert stripe_stat["maximum"] == max(first_stripe)
    assert stripe_stat["lower_bound"] == first_moment
    # Hardcoded expected upper bound (note the 1000 microsecond component).
    stripe_upper = datetime(2000, 2, 16, 0, 14, 0, 1000, tzinfo=timezone.utc)
    assert stripe_stat["upper_bound"] == stripe_upper

    file_stat = orc_reader[0].statistics
    # Iterates over the reader's rows to find the expected maximum.
    assert file_stat["maximum"] == max(orc_reader)
    file_upper = datetime(2000, 3, 10, 22, 39, 0, 1000, tzinfo=timezone.utc)
    assert file_stat["upper_bound"] == file_upper