def test_ternary_condition():
    """A chain of three AND-ed equalities parses into a right-nested AND tree."""
    col_a_is_1 = Expressions.equal("col_a", 1)
    tail = Expressions.and_(Expressions.equal("col_b", 2),
                            Expressions.equal("col_c", 3))
    expected = Expressions.and_(col_a_is_1, tail)

    parsed = Expressions.convert_string_to_expr(
        "col_a=1 and col_b=2 and col_c=3")

    assert expected == parsed
def test_and(schema, file):
    """Check InclusiveMetricsEvaluator on two AND-ed range predicates over ``id``.

    The evaluator must reject ``file`` for ``id < 5 AND id >= 0`` and accept
    it for ``id > 5 AND id <= 30``.
    """
    rejected = Expressions.and_(Expressions.less_than("id", 5),
                                Expressions.greater_than_or_equal("id", 0))
    evaluator = InclusiveMetricsEvaluator(schema, rejected)
    assert not evaluator.eval(file)

    accepted = Expressions.and_(Expressions.greater_than("id", 5),
                                Expressions.less_than_or_equal("id", 30))
    evaluator = InclusiveMetricsEvaluator(schema, accepted)
    assert evaluator.eval(file)
def test_complex_expansion():
    """A parenthesised three-way AND OR-ed with an ``is null`` check parses as expected."""
    a_is_1 = Expressions.equal("a", 1)
    b_and_c = Expressions.and_(Expressions.equal("b", 2),
                               Expressions.not_equal("c", 3))
    conjunction = Expressions.and_(a_is_1, b_and_c)
    expected = Expressions.or_(conjunction, Expressions.is_null("d"))

    parsed = Expressions.convert_string_to_expr(
        "(a=1 and b=2 and c<>3) or d is null")

    assert expected == parsed
def test_and(strict_schema, strict_file):
    """Check StrictMetricsEvaluator on three AND-ed range predicates over ``id``.

    The first two conjunctions must not match ``strict_file``; the third
    (``id < 85 AND id >= 0``) must.
    """
    above_5_up_to_30 = Expressions.and_(
        Expressions.greater_than("id", 5),
        Expressions.less_than_or_equal("id", 30))
    evaluator = StrictMetricsEvaluator(strict_schema, above_5_up_to_30)
    assert not evaluator.eval(strict_file)

    from_0_below_5 = Expressions.and_(
        Expressions.less_than("id", 5),
        Expressions.greater_than_or_equal("id", 0))
    evaluator = StrictMetricsEvaluator(strict_schema, from_0_below_5)
    assert not evaluator.eval(strict_file)

    from_0_below_85 = Expressions.and_(
        Expressions.less_than("id", 85),
        Expressions.greater_than_or_equal("id", 0))
    evaluator = StrictMetricsEvaluator(strict_schema, from_0_below_85)
    assert evaluator.eval(strict_file)
def test_missing_reference():
    """Binding an expression that references unknown field ``t`` must fail.

    The bind call is expected to raise ``ValidationException`` with a message
    naming the missing field. If no exception is raised the test fails
    explicitly instead of passing vacuously.
    """
    expr = Expressions.and_(Expressions.equal("t", 5),
                            Expressions.equal("x", 7))
    try:
        Binder.bind(STRUCT, expr)
    except ice_ex.ValidationException as e:
        assert "Cannot find field 't' in struct" in str(e)
    else:
        # Without this branch a successful bind would silently pass the test.
        raise AssertionError(
            "Expected ValidationException when binding unknown field 't'")
def test_multiple_references(assert_all_bound):
    """Bind an OR-of-AND expression touching x, y and z and check every reference is bound."""
    x_and_y = Expressions.and_(Expressions.equal("x", 7),
                               Expressions.less_than("y", 100))
    z_above = Expressions.greater_than("z", -100)
    unbound = Expressions.or_(x_and_y, z_above)

    bound = Binder.bind(STRUCT, unbound)
    assert_all_bound("Multiple references", bound)
def test_precedence_explicit():
    """Parentheses around an OR keep it grouped beneath the enclosing AND."""
    grouped_or = Expressions.or_(Expressions.equal("col_b", 2),
                                 Expressions.equal("col_c", 3))
    expected = Expressions.and_(Expressions.equal("col_a", 1), grouped_or)

    parsed = Expressions.convert_string_to_expr(
        "col_a=1 and (col_b=2 or col_c=3)")

    assert expected == parsed
def test_precedence_with_between():
    """``between`` expands to a >=/<= AND pair that is then OR-ed with the trailing equality."""
    in_range = Expressions.and_(
        Expressions.greater_than_or_equal("col_a", 1),
        Expressions.less_than_or_equal("col_a", 2))
    expected = Expressions.or_(in_range, Expressions.equal("col_c", 3))

    parsed = Expressions.convert_string_to_expr(
        "col_a between 1 and 2 or col_c=3")

    assert expected == parsed
def test_precedence_opposite_order():
    """Without parentheses, ``a and b or c`` parses as ``(a and b) or c``."""
    a_and_b = Expressions.and_(Expressions.equal("col_a", 1),
                               Expressions.equal("col_b", 2))
    expected = Expressions.or_(a_and_b, Expressions.equal("col_c", 3))

    parsed = Expressions.convert_string_to_expr(
        "col_a=1 and col_b=2 or col_c=3")

    assert expected == parsed
def test_complex_expr():
    """Translate ``(a > 1 AND b == "US") OR c == True`` and compare with a hand-built filter."""
    a_and_b_expr = Expressions.and_(Expressions.greater_than('a', 1),
                                    Expressions.equal("b", "US"))
    expr = Expressions.or_(a_and_b_expr, Expressions.equal("c", True))
    # Identity mapping of column names handed to the translator.
    column_map = {'a': 'a', 'b': 'b', 'c': 'c'}
    translated = get_dataset_filter(expr, column_map)

    a_and_b_filter = (ds.field("a") > 1) & (ds.field("b") == "US")
    # "== True" is deliberate: the comparison result is combined with "|"
    # as a filter object, not used as a Python truth test.
    expected = (a_and_b_filter | (ds.field("c") == True))  # noqa: E712

    assert expected.equals(translated)
def test_and(assert_all_bound, assert_and_unwrap):
    """Bind ``x == 7 AND y < 100`` and check both children resolve to the right field ids."""
    unbound = Expressions.and_(Expressions.equal("x", 7),
                               Expressions.less_than("y", 100))
    bound = Binder.bind(STRUCT, unbound)
    assert_all_bound("And", bound)

    conjunction = assert_and_unwrap(bound, And)

    # should bind x correctly
    x_pred = assert_and_unwrap(conjunction.left, None)
    assert 0 == x_pred.ref.field_id

    # should bind y correctly
    y_pred = assert_and_unwrap(conjunction.right, None)
    assert 1 == y_pred.ref.field_id
def test_basic_simplification(assert_and_unwrap):
    """Binding folds trivially decidable sub-expressions and removes double negation."""
    # Should simplify or expression to alwaysTrue
    always_true_or = Expressions.or_(
        Expressions.less_than("y", 100),
        Expressions.greater_than("z", -9999999999))
    assert Expressions.always_true() == Binder.bind(STRUCT, always_true_or)

    # Should simplify and expression to alwaysFalse
    always_false_and = Expressions.and_(
        Expressions.less_than("y", 100),
        Expressions.less_than("z", -9999999999))
    assert Expressions.always_false() == Binder.bind(STRUCT, always_false_and)

    # not(not(pred)) should unwrap to the bound predicate on y (field id 1)
    double_negation = Expressions.not_(
        Expressions.not_(Expressions.less_than("y", 100)))
    bound = Binder.bind(STRUCT, double_negation)
    pred = assert_and_unwrap(bound, None)
    assert 1 == pred.ref.field_id
def test_compound_filter(primitive_type_test_file):
    """Read the test file with ``string_col == "us" AND int_col == 1`` and compare tables.

    The table returned by the reader must equal a single-row table built
    by hand with the same five columns.
    """
    projection = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get())
    ])

    fs = get_fs(primitive_type_test_file, conf={})
    input_file = FileSystemInputFile(fs, primitive_type_test_file, {})

    row_filter = Expressions.and_(Expressions.equal("string_col", "us"),
                                  Expressions.equal("int_col", 1))
    reader = ParquetReader(input_file, projection, {}, row_filter, True)

    # One row, one array per column, in the same order as the arrow schema.
    columns = [
        pa.array([1], type=pa.int32()),
        pa.array([1], type=pa.int64()),
        pa.array([1.0], type=pa.float32()),
        pa.array([1.0], type=pa.float64()),
        pa.array(['us'], type=pa.string())
    ]
    arrow_schema = pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True),
        pa.field("string_col", pa.string(), nullable=True)
    ])
    expected_table = pa.table(columns, schema=arrow_schema)

    actual_table = reader.read()
    assert expected_table == actual_table
def join_filters(expressions):
    """Combine ``expressions`` into one conjunction.

    Starts from ``Expressions.always_true()`` and ANDs each expression onto
    the running result in iteration order; an empty iterable therefore
    yields the always-true expression.
    """
    combined = Expressions.always_true()
    for predicate in expressions:
        combined = Expressions.and_(combined, predicate)

    return combined
def filter_rows(self, expr):
    """Return a FilteredManifest further restricted by the row expression ``expr``.

    ``expr`` is projected through the inclusive projection of the reader's
    spec and AND-ed onto the current partition filter; ``expr`` itself is
    AND-ed onto the current row filter. Columns and case sensitivity are
    carried over unchanged.
    """
    projection = inclusive(self.reader.spec)
    projected = projection.project(expr)

    partition_filter = Expressions.and_(self.part_filter, projected)
    row_filter = Expressions.and_(self.row_filter, expr)

    return FilteredManifest(self.reader,
                            partition_filter,
                            row_filter,
                            self.columns,
                            self.case_sensitive)
def filter_partitions(self, expr):
    """Return a FilteredManifest whose partition filter is additionally AND-ed with ``expr``.

    The row filter, columns and case sensitivity are carried over unchanged.
    """
    partition_filter = Expressions.and_(self.part_filter, expr)
    return FilteredManifest(self.reader,
                            partition_filter,
                            self.row_filter,
                            self.columns,
                            self.case_sensitive)
def test_and(inc_man_spec, inc_man_file, expr1, expr2, expected):
    """Evaluating ``expr1 AND expr2`` against the manifest file must yield ``expected``."""
    conjunction = Expressions.and_(expr1, expr2)
    evaluator = InclusiveManifestEvaluator(inc_man_spec, conjunction)
    assert evaluator.eval(inc_man_file) == expected
@pytest.fixture(scope="session", params=[ Expressions.always_false(), Expressions.always_true(), Expressions.less_than("x", 5), Expressions.less_than_or_equal("y", -3), Expressions.greater_than("z", 0), Expressions.greater_than_or_equal("t", 129), Expressions.equal("col", "data"), Expressions.not_equal("col", "abc"), Expressions.not_null("maybeNull"), Expressions.is_null("maybeNull2"), Expressions.not_(Expressions.greater_than("a", 10)), Expressions.and_(Expressions.greater_than_or_equal("a", 0), Expressions.less_than("a", 3)), Expressions.or_(Expressions.less_than("a", 0), Expressions.greater_than("a", 10)), Expressions.equal("a", 5).bind(exp_schema.as_struct()) ]) def expression(request): yield request.param @pytest.fixture(scope="session", params=[ Expressions.less_than("no_stats", 5), Expressions.less_than_or_equal("no_stats", 30), Expressions.equal("no_stats", 70), Expressions.greater_than("no_stats", 78), Expressions.greater_than_or_equal("no_stats", 90),
def test_between():
    """``between 1 and 2`` parses into ``>= 1 AND <= 2`` on the same column."""
    lower_bound = Expressions.greater_than_or_equal("col_a", 1)
    upper_bound = Expressions.less_than_or_equal("col_a", 2)
    expected = Expressions.and_(lower_bound, upper_bound)

    parsed = Expressions.convert_string_to_expr("col_a between 1 and 2")

    assert expected == parsed
def test_and():
    """Two equalities joined by ``and`` parse into a single AND expression."""
    col_a_is_1 = Expressions.equal("col_a", 1)
    col_b_is_2 = Expressions.equal("col_b", 2)
    expected = Expressions.and_(col_a_is_1, col_b_is_2)

    parsed = Expressions.convert_string_to_expr("col_a=1 and col_b=2")

    assert expected == parsed
def filter(self, expr):
    """Return a refined scan whose row filter is the current filter AND-ed with ``expr``.

    Every other setting (ops, table, schema, snapshot id, case sensitivity,
    selected columns, options, minused_cols) is passed through unchanged.
    """
    combined_filter = Expressions.and_(self._row_filter, expr)
    return self.new_refined_scan(
        self.ops,
        self.table,
        self._schema,
        snapshot_id=self.snapshot_id,
        row_filter=combined_filter,
        case_sensitive=self._case_sensitive,
        selected_columns=self.selected_columns,
        options=self.options,
        minused_cols=self.minused_cols)