def test_compare_scalar(typ): if typ == "array": def con(values): return pa.array(values) else: def con(values): return pa.chunked_array([values]) arr = con([1, 2, 3, None]) scalar = pa.scalar(2) result = pc.equal(arr, scalar) assert result.equals(con([False, True, False, None])) if typ == "array": nascalar = pa.scalar(None, type="int64") result = pc.equal(arr, nascalar) assert result.to_pylist() == [None, None, None, None] result = pc.not_equal(arr, scalar) assert result.equals(con([True, False, True, None])) result = pc.less(arr, scalar) assert result.equals(con([True, False, False, None])) result = pc.less_equal(arr, scalar) assert result.equals(con([True, True, False, None])) result = pc.greater(arr, scalar) assert result.equals(con([False, False, True, None])) result = pc.greater_equal(arr, scalar) assert result.equals(con([False, True, True, None]))
def test_compare_array(typ):
    if typ == "array":
        def con(values):
            return pa.array(values)
    else:
        def con(values):
            return pa.chunked_array([values])

    arr1 = con([1, 2, 3, 4, None])
    arr2 = con([1, 1, 4, None, 4])

    result = pc.equal(arr1, arr2)
    assert result.equals(con([True, False, False, None, None]))

    result = pc.not_equal(arr1, arr2)
    assert result.equals(con([False, True, True, None, None]))

    result = pc.less(arr1, arr2)
    assert result.equals(con([False, False, True, None, None]))

    result = pc.less_equal(arr1, arr2)
    assert result.equals(con([True, False, True, None, None]))

    result = pc.greater(arr1, arr2)
    assert result.equals(con([False, True, False, None, None]))

    result = pc.greater_equal(arr1, arr2)
    assert result.equals(con([True, True, False, None, None]))
def test_compare_scalar(typ):
    if typ == "array":
        def con(values):
            return pa.array(values)
    else:
        def con(values):
            return pa.chunked_array([values])

    arr = con([1, 2, 3, None])
    # TODO this is a hacky way to construct a scalar ..
    scalar = pa.array([2]).sum()

    result = pc.equal(arr, scalar)
    assert result.equals(con([False, True, False, None]))

    result = pc.not_equal(arr, scalar)
    assert result.equals(con([True, False, True, None]))

    result = pc.less(arr, scalar)
    assert result.equals(con([True, False, False, None]))

    result = pc.less_equal(arr, scalar)
    assert result.equals(con([True, True, False, None]))

    result = pc.greater(arr, scalar)
    assert result.equals(con([False, False, True, None]))

    result = pc.greater_equal(arr, scalar)
    assert result.equals(con([False, True, True, None]))
def test_compare_string_scalar(typ):
    if typ == "array":
        def con(values):
            return pa.array(values)
    else:
        def con(values):
            return pa.chunked_array([values])

    arr = con(['a', 'b', 'c', None])
    scalar = pa.scalar('b')

    result = pc.equal(arr, scalar)
    assert result.equals(con([False, True, False, None]))

    if typ == "array":
        nascalar = pa.scalar(None, type="string")
        result = pc.equal(arr, nascalar)
        isnull = pc.is_null(result)
        assert isnull.equals(con([True, True, True, True]))

    result = pc.not_equal(arr, scalar)
    assert result.equals(con([True, False, True, None]))

    result = pc.less(arr, scalar)
    assert result.equals(con([True, False, False, None]))

    result = pc.less_equal(arr, scalar)
    assert result.equals(con([True, True, False, None]))

    result = pc.greater(arr, scalar)
    assert result.equals(con([False, False, True, None]))

    result = pc.greater_equal(arr, scalar)
    assert result.equals(con([False, True, True, None]))
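# The tests above take `typ` as a parameter so the same assertions run against
# both pa.Array and pa.ChunkedArray inputs, and they assume the usual aliases
# `import pyarrow as pa` and `import pyarrow.compute as pc`. A minimal sketch of
# how `typ` could be supplied with pytest; the fixture below is an illustrative
# assumption, not part of the original test module:
import pytest


@pytest.fixture(params=["array", "chunked_array"])
def typ(request):
    # Runs each test once per param: "array" exercises plain pa.array inputs,
    # "chunked_array" exercises single-chunk pa.chunked_array inputs.
    return request.param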
def binary_col(op, l, r):
    """Interpreter for executing binary operator expressions."""
    if op == "+":
        return compute.add_checked(l, r)
    if op == "-":
        return compute.subtract_checked(l, r)
    if op == "*":
        return compute.multiply_checked(l, r)
    if op == "/":
        return compute.divide_checked(l, r)
    if op in ("=", "=="):
        return compute.equal(l, r)
    if op in ("<>", "!="):
        return compute.not_equal(l, r)
    if op == "<":
        return compute.less(l, r)
    if op == "<=":
        return compute.less_equal(l, r)
    if op == ">":
        return compute.greater(l, r)
    if op == ">=":
        return compute.greater_equal(l, r)
    if op == "and":
        return compute.and_(l, r)
    if op == "or":
        return compute.or_(l, r)
    if op == "in":
        return compute.is_in(l, r)
    raise Exception(f"binary op not implemented: {op}")
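# A small usage sketch of binary_col: it dispatches infix operator tokens to the
# corresponding pyarrow compute kernels, so arrays and scalars can appear on
# either side. The values below are illustrative only and assume `compute` is
# the `pyarrow.compute` module imported by this file.
import pyarrow as pa
import pyarrow.compute as compute

prices = pa.array([10.0, 12.5, None, 9.0])
limit = pa.scalar(10.0)

mask = binary_col(">", prices, limit)   # compute.greater(prices, limit)
print(mask.to_pylist())                 # [False, True, None, False]

total = binary_col("+", prices, pa.array([1.0, 1.0, 1.0, 1.0]))
print(total.to_pylist())                # [11.0, 13.5, None, 10.0]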
def _replay_single(
    dname: str,
    factors: List[Factor],
    *,
    predicate: Optional[Factor] = None,
    batch_size: int = 40960,
    trim: bool = False,
    index_col: Optional[str] = None,
    n_factor_jobs: int = 1,
    verbose: bool = False,
) -> Tuple[pa.Table, Set[str]]:
    if predicate is not None:
        # put the predicate as the last factor
        replay_result = _native_replay(
            dname, [*factors, predicate], batch_size=batch_size, njobs=n_factor_jobs
        )
    else:
        replay_result = _native_replay(
            dname, factors, batch_size=batch_size, njobs=n_factor_jobs
        )

    table_datas, table_names = [], []
    if index_col is not None:
        index = pq.read_table(dname, columns=[index_col]).column(index_col)
        table_datas.append(index)
        table_names.append(index_col)

    predicate_values = None
    for i, (data_ptr, schema_ptr) in replay_result["succeeded"].items():
        arr = pa.Array._import_from_c(data_ptr, schema_ptr)
        if predicate is not None and i == len(factors):
            # this is the predicate column
            predicate_values = arr
        else:
            table_datas.append(arr)
            table_names.append(str(factors[i]))

    # Fill in the failed columns with all-null float64 arrays
    N = replay_result["nrows"]
    nanarr = pa.array(np.empty(N, "f8"), mask=np.ones(N, "b1"))
    for i, reason in replay_result["failed"].items():
        if predicate is not None and i == len(factors):
            raise ValueError(f"predicate failed to compute: {reason}")
        else:
            table_datas.append(nanarr)
            table_names.append(str(factors[i]))
            if verbose:
                print(f"{factors[i]} failed: {reason}", file=stderr)

    tb = pa.Table.from_arrays(
        table_datas,
        names=table_names,
    )

    if trim:
        if index_col is not None:
            # the first column is the index
            data_starts = 1
        else:
            data_starts = 0
        ready_offset = np.max(
            [Factor(col).ready_offset() for col in tb.column_names[data_starts:]]
        )
        tb = tb.slice(ready_offset)
        # trim the predicate as well
        if predicate_values is not None:
            predicate_values = predicate_values.slice(ready_offset)

    if predicate is not None:
        assert (
            predicate_values is not None
        ), "predicate_values is None, this is not possible"
        # filter the table using the predicate
        tb = pc.filter(tb, pc.greater(predicate_values, 0.0))

    if index_col is not None:
        # sort the columns based on the order passed in
        tb = tb.select([index_col] + [str(f) for f in factors])
        # set the metadata for the index col, so that when `.to_pandas` is called,
        # the index col automatically becomes the index.
        header = tb.slice(0).to_pandas()
        header = header.set_index(index_col)
        _, _, metadata = pa.pandas_compat.dataframe_to_types(header, True)
        tb = tb.replace_schema_metadata(metadata)
    else:
        # sort the columns based on the order passed in
        tb = tb.select([str(f) for f in factors])

    return (
        tb,
        {str(factors[k]) for k in replay_result["failed"].keys()},
    )
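# The index handling at the end of _replay_single relies on pyarrow's pandas
# metadata: if the schema metadata declares a column as the index,
# Table.to_pandas restores it as the DataFrame index with no extra set_index
# call by the caller. A minimal standalone sketch of that trick, assuming made-up
# column names ("ts", "signal") purely for illustration:
import pyarrow as pa

tb = pa.table({"ts": [1, 2, 3], "signal": [0.1, 0.2, 0.3]})

# Build pandas metadata from a "header" DataFrame with the index already set,
# mirroring what _replay_single does with tb.slice(0).
header = tb.slice(0).to_pandas().set_index("ts")
_, _, metadata = pa.pandas_compat.dataframe_to_types(header, True)

tb = tb.replace_schema_metadata(metadata)
df = tb.to_pandas()
print(df.index.name)  # "ts": the index column is recovered from the metadata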