Exemplo n.º 1
0
 def iterator(self, part_filter=None, row_filter=None, columns=None):
     if part_filter is None and row_filter is None and columns is None:
         return self.iterator(Expressions.always_true(),
                              Expressions.always_true(),
                              Filterable.ALL_COLUMNS)
     return iter([
         self.get_filtered_manifest(path, part_filter, row_filter, columns)
         for path in self._manifest_files
     ])
def test_double_to_float_conversion(assert_and_unwrap):
    struct = StructType.of([NestedField.required(18, "f", FloatType.get())])

    lt = UnboundPredicate(Operation.LT, Expressions.ref("f"),
                          Literal.JAVA_MAX_FLOAT * 2)
    assert lt.bind(struct) == Expressions.always_true()

    lt_eq = UnboundPredicate(Operation.LT_EQ, Expressions.ref("f"),
                             Literal.JAVA_MAX_FLOAT * 2)
    assert lt_eq.bind(struct) == Expressions.always_true()

    gt = UnboundPredicate(Operation.GT, Expressions.ref("f"),
                          Literal.JAVA_MAX_FLOAT * -2)
    assert gt.bind(struct) == Expressions.always_true()

    gt_eq = UnboundPredicate(Operation.GT_EQ, Expressions.ref("f"),
                             Literal.JAVA_MAX_FLOAT * -2)
    assert gt_eq.bind(struct) == Expressions.always_true()

    gt_max = UnboundPredicate(Operation.GT, Expressions.ref("f"),
                              Literal.JAVA_MAX_FLOAT * 2)
    assert gt_max.bind(struct) == Expressions.always_false()

    gt_eq_max = UnboundPredicate(Operation.GT_EQ, Expressions.ref("f"),
                                 Literal.JAVA_MAX_FLOAT * 2)
    assert gt_eq_max.bind(struct) == Expressions.always_false()

    lt_min = UnboundPredicate(Operation.LT, Expressions.ref("f"),
                              Literal.JAVA_MAX_FLOAT * -2)
    assert lt_min.bind(struct) == Expressions.always_false()

    lt_eq_min = UnboundPredicate(Operation.LT_EQ, Expressions.ref("f"),
                                 Literal.JAVA_MAX_FLOAT * -2)
    assert lt_eq_min.bind(struct) == Expressions.always_false()

    lt_expr = UnboundPredicate(Operation.LT, Expressions.ref("f"),
                               Literal.JAVA_MAX_FLOAT).bind(struct)
    lt_max = assert_and_unwrap(lt_expr)
    assert lt_max.lit.value == Literal.JAVA_MAX_FLOAT

    lt_eq_expr = UnboundPredicate(Operation.LT_EQ, Expressions.ref("f"),
                                  Literal.JAVA_MAX_FLOAT).bind(struct)
    lt_eq_max = assert_and_unwrap(lt_eq_expr)
    assert lt_eq_max.lit.value == Literal.JAVA_MAX_FLOAT

    gt_expr = UnboundPredicate(Operation.GT, Expressions.ref("f"),
                               Literal.JAVA_MIN_INT).bind(struct)
    gt_min = assert_and_unwrap(gt_expr)
    assert gt_min.lit.value == Literal.JAVA_MIN_INT

    gt_eq_expr = UnboundPredicate(Operation.GT_EQ, Expressions.ref("f"),
                                  Literal.JAVA_MIN_INT).bind(struct)
    gt_eq_min = assert_and_unwrap(gt_eq_expr)
    assert gt_eq_min.lit.value == Literal.JAVA_MIN_INT
def test_long_to_integer_conversion(assert_and_unwrap):
    struct = StructType.of([NestedField.required(17, "i", IntegerType.get())])

    lt = UnboundPredicate(Operation.LT, Expressions.ref("i"),
                          Literal.JAVA_MAX_INT + 1)
    assert lt.bind(struct) == Expressions.always_true()

    lt_eq = UnboundPredicate(Operation.LT_EQ, Expressions.ref("i"),
                             Literal.JAVA_MAX_INT + 1)
    assert lt_eq.bind(struct) == Expressions.always_true()

    gt = UnboundPredicate(Operation.GT, Expressions.ref("i"),
                          Literal.JAVA_MIN_INT - 1)
    assert gt.bind(struct) == Expressions.always_true()

    gt_eq = UnboundPredicate(Operation.GT_EQ, Expressions.ref("i"),
                             Literal.JAVA_MIN_INT - 1)
    assert gt_eq.bind(struct) == Expressions.always_true()

    gt_max = UnboundPredicate(Operation.GT, Expressions.ref("i"),
                              Literal.JAVA_MAX_INT + 1)
    assert gt_max.bind(struct) == Expressions.always_false()

    gt_eq_max = UnboundPredicate(Operation.GT_EQ, Expressions.ref("i"),
                                 Literal.JAVA_MAX_INT + 1)
    assert gt_eq_max.bind(struct) == Expressions.always_false()

    lt_min = UnboundPredicate(Operation.LT, Expressions.ref("i"),
                              Literal.JAVA_MIN_INT - 1)
    assert lt_min.bind(struct) == Expressions.always_false()

    lt_eq_min = UnboundPredicate(Operation.LT_EQ, Expressions.ref("i"),
                                 Literal.JAVA_MIN_INT - 1)
    assert lt_eq_min.bind(struct) == Expressions.always_false()

    lt_expr = UnboundPredicate(Operation.LT, Expressions.ref("i"),
                               Literal.JAVA_MAX_INT).bind(struct)
    lt_max = assert_and_unwrap(lt_expr)
    assert lt_max.lit.value == Literal.JAVA_MAX_INT

    lt_eq_expr = UnboundPredicate(Operation.LT_EQ, Expressions.ref("i"),
                                  Literal.JAVA_MAX_INT).bind(struct)
    lt_eq_max = assert_and_unwrap(lt_eq_expr)
    assert lt_eq_max.lit.value == Literal.JAVA_MAX_INT

    gt_expr = UnboundPredicate(Operation.GT, Expressions.ref("i"),
                               Literal.JAVA_MIN_INT).bind(struct)
    gt_min = assert_and_unwrap(gt_expr)
    assert gt_min.lit.value == Literal.JAVA_MIN_INT

    gt_eq_expr = UnboundPredicate(Operation.GT_EQ, Expressions.ref("i"),
                                  Literal.JAVA_MIN_INT).bind(struct)
    gt_eq_min = assert_and_unwrap(gt_eq_expr)
    assert gt_eq_min.lit.value == Literal.JAVA_MIN_INT
Exemplo n.º 4
0
 def all_entries(self):
     if self.row_filter is not None and self.row_filter != Expressions.always_true() \
             or self.part_filter is not None and self. part_filter != Expressions.always_true():
         evaluator = self.evaluator()
         metrics_evaluator = self.metrics_evaluator()
         return [
             entry for entry in self.reader.entries(self.columns)
             if entry is not None and evaluator.eval(entry.file.partition())
             and metrics_evaluator.eval(entry.file)
         ]
     else:
         return self.reader.entries(self.columns)
Exemplo n.º 5
0
    def iterator(self):
        if self.row_filter is not None and self.row_filter != Expressions.always_true() \
                or self.part_filter is not None and self.part_filter != Expressions.always_true():
            evaluator = self.evaluator()
            metrics_evaluator = self.metrics_evaluator()

            return (input.copy() for input in self.reader.iterator(
                self.part_filter, self.columns)
                    if input is not None and evaluator.eval(input.partition())
                    and metrics_evaluator.eval(input))
        else:
            return (entry.copy() for entry in self.reader.iterator(
                self.part_filter, self.columns))
Exemplo n.º 6
0
    def iterator(self, part_filter=None, columns=None):
        if part_filter is None and columns is None:
            return self.iterator(Expressions.always_true(),
                                 Filterable.ALL_COLUMNS)

        return (entry.file for entry in self.entries
                if entry.status != Status.DELETED)
Exemplo n.º 7
0
def test_column_rename(primitive_type_test_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)
    pyarrow_array = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64())
    ]
    schema = pa.schema([
        pa.field("int_col", pa.int32(), False),
        pa.field("bigint_col", pa.int64(), True),
        pa.field("string_col", pa.string(), True),
        pa.field("float_col", pa.float32(), True),
        pa.field("dbl_col", pa.float64(), True)
    ])

    source_table = pa.table(pyarrow_array, schema=schema)

    target_table = reader.read()
    assert source_table == target_table
Exemplo n.º 8
0
    def __init__(self, ops, table, snapshot_id=None, columns=None, row_filter=None):
        self.ops = ops
        self.table = table
        self.snapshot_id = snapshot_id
        self.columns = columns
        self.row_filter = row_filter

        if self.columns is None and self.row_filter is None:
            self.columns = Filterable.ALL_COLUMNS
            self.row_filter = Expressions.always_true()
Exemplo n.º 9
0
def test_not_null(assert_and_unwrap):
    optional = StructType.of([NestedField.optional(21, "s", StringType.get())])
    unbound = UnboundPredicate(Operation.NOT_NULL, Expressions.ref("s"))
    expr = unbound.bind(optional)
    bound = assert_and_unwrap(expr)
    assert Operation.NOT_NULL == bound.op
    assert 21 == bound.ref.field.field_id
    assert bound.lit is None

    required = StructType.of([NestedField.required(22, "s", StringType.get())])
    assert Expressions.always_true() == unbound.bind(required)
Exemplo n.º 10
0
    def metrics_evaluator(self):
        if self.lazy_metrics_evaluator is None:
            if self.row_filter is not None:
                self.lazy_metrics_evaluator = InclusiveMetricsEvaluator(
                    self.reader.spec.schema, self.row_filter,
                    self.case_sensitive)
            else:
                self.lazy_metrics_evaluator = InclusiveMetricsEvaluator(
                    self.reader.spec.schema, Expressions.always_true(),
                    self.case_sensitive)

        return self.lazy_metrics_evaluator
Exemplo n.º 11
0
    def evaluator(self):
        if self.lazy_evaluator is None:
            if self.part_filter is not None:
                self.lazy_evaluator = Evaluator(
                    self.reader.spec.partition_type(), self.part_filter,
                    self.case_sensitive)
            else:
                self.lazy_evaluator = Evaluator(
                    self.reader.spec.partition_type(),
                    Expressions.always_true(), self.case_sensitive)

        return self.lazy_evaluator
Exemplo n.º 12
0
def test_column_upcast(primitive_type_test_file):
    expected_schema = Schema(
        [NestedField.required(1, "int_col", LongType.get())])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)
    pyarrow_array = [pa.array([1, 2, 3, 4, 5], type=pa.int32())]
    source_table = pa.table(
        pyarrow_array,
        schema=pa.schema([pa.field("int_col", pa.int64(), nullable=False)]))

    target_table = reader.read()
    assert source_table == target_table
Exemplo n.º 13
0
def test_basic_simplification(assert_and_unwrap):
    # Should simplify or expression to alwaysTrue
    assert Expressions.always_true() == Binder.bind(
        STRUCT,
        Expressions.or_(Expressions.less_than("y", 100),
                        Expressions.greater_than("z", -9999999999)))
    # Should simplify or expression to alwaysfalse
    assert Expressions.always_false() == Binder.bind(
        STRUCT,
        Expressions.and_(Expressions.less_than("y", 100),
                         Expressions.less_than("z", -9999999999)))

    bound = Binder.bind(
        STRUCT,
        Expressions.not_(Expressions.not_(Expressions.less_than("y", 100))))
    pred = assert_and_unwrap(bound, None)
    assert 1 == pred.ref.field_id
Exemplo n.º 14
0
def test_projection(primitive_type_test_file, pyarrow_primitive_array,
                    pyarrow_schema):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)

    source_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)
    num_cols = source_table.num_columns
    for i in range(1, num_cols - 1):
        source_table = source_table.remove_column(num_cols - i)

    assert source_table == reader.read()
Exemplo n.º 15
0
    def __init__(self, ops, table, schema, snapshot_id=None, columns=None,
                 row_filter=None, case_sensitive=True, selected_columns=None, options=None,
                 minused_cols=None):
        self.ops = ops
        self.table = table
        self._schema = schema
        self.snapshot_id = snapshot_id
        self.columns = columns
        self._row_filter = row_filter
        self._case_sensitive = case_sensitive
        self.selected_columns = selected_columns
        self.minused_cols = minused_cols or list()
        self.options = options if options is not None else dict()

        if self.columns is None and self._row_filter is None:
            self.columns = Filterable.ALL_COLUMNS
            self._row_filter = Expressions.always_true()

        self._stats = dict()
Exemplo n.º 16
0
def test_decimal_column_add(primitive_type_test_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(13, "new_dec_col", DecimalType.of(38, 9))
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)
    pyarrow_array = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array([None, None, None, None, None], type=pa.decimal128(38, 9))
    ]

    source_table = pa.table(pyarrow_array,
                            schema=pa.schema([
                                pa.field("int_col", pa.int32(),
                                         nullable=False),
                                pa.field("bigint_col",
                                         pa.int64(),
                                         nullable=True),
                                pa.field("float_col",
                                         pa.float32(),
                                         nullable=True),
                                pa.field("dbl_col",
                                         pa.float64(),
                                         nullable=True),
                                pa.field("new_dec_col",
                                         pa.decimal128(38, 9),
                                         nullable=True)
                            ]))

    target_table = reader.read()
    assert source_table == target_table
Exemplo n.º 17
0
def test_basic_read(primitive_type_test_file, pyarrow_primitive_array,
                    pyarrow_schema):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "str_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
        NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
        NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
        NestedField.optional(9, "date_col", DateType.get()),
        NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
        NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(12, "bool_col", BooleanType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)

    source_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)
    assert reader.read() == source_table
Exemplo n.º 18
0
def test_always_true():
    assert Expressions.always_true() == Binder.bind(STRUCT,
                                                    Expressions.always_true())
Exemplo n.º 19
0
    return lambda x: TestHelpers.Row.of(x)


@pytest.fixture(scope="session",
                params=[
                    Operation.LT, Operation.LT_EQ, Operation.GT,
                    Operation.GT_EQ, Operation.EQ, Operation.NOT_EQ
                ])
def op(request):
    yield request.param


@pytest.fixture(scope="session",
                params=[
                    Expressions.always_false(),
                    Expressions.always_true(),
                    Expressions.less_than("x", 5),
                    Expressions.less_than_or_equal("y", -3),
                    Expressions.greater_than("z", 0),
                    Expressions.greater_than_or_equal("t", 129),
                    Expressions.equal("col", "data"),
                    Expressions.not_equal("col", "abc"),
                    Expressions.not_null("maybeNull"),
                    Expressions.is_null("maybeNull2"),
                    Expressions.not_(Expressions.greater_than("a", 10)),
                    Expressions.and_(Expressions.greater_than_or_equal("a", 0),
                                     Expressions.less_than("a", 3)),
                    Expressions.or_(Expressions.less_than("a", 0),
                                    Expressions.greater_than("a", 10)),
                    Expressions.equal("a", 5).bind(exp_schema.as_struct())
                ])
Exemplo n.º 20
0
 def filter_partitions(self, expr):
     return FilteredManifest(self, expr, Expressions.always_true(),
                             ManifestReader.ALL_COLUMNS,
                             self.case_sensitive)
Exemplo n.º 21
0
 def select(self, columns):
     return FilteredManifest(self, Expressions.always_true(),
                             Expressions.always_true(), list(columns),
                             self.case_sensitive)
Exemplo n.º 22
0
 def select(self, columns):
     return FilteredSnapshot(self, Expressions.always_true(),
                             Expressions.always_true(), columns)
Exemplo n.º 23
0
    def join_filters(expressions):
        result = Expressions.always_true()
        for expression in expressions:
            result = Expressions.and_(result, expression)

        return result
Exemplo n.º 24
0
 def filter_rows(self, expr):
     return FilteredSnapshot(self, Expressions.always_true(), expr,
                             Filterable.ALL_COLUMNS)