示例#1
0
def test_compare_scalar(typ):
    """Exercise the pyarrow.compute comparison kernels: integers vs. a scalar.

    ``typ`` picks the container under test: ``"array"`` builds a plain
    ``pa.array``; any other value wraps the values in a single-chunk
    ``pa.chunked_array``.
    """
    is_plain_array = typ == "array"

    def make(values):
        if is_plain_array:
            return pa.array(values)
        return pa.chunked_array([values])

    data = make([1, 2, 3, None])
    two = pa.scalar(2)

    assert pc.equal(data, two).equals(make([False, True, False, None]))

    if is_plain_array:
        # Comparing against a null scalar yields null in every slot.
        null_scalar = pa.scalar(None, type="int64")
        assert pc.equal(data, null_scalar).to_pylist() == [None] * 4

    # (kernel, expected element-wise outcome of ``data <op> 2``)
    remaining_cases = [
        (pc.not_equal, [True, False, True, None]),
        (pc.less, [True, False, False, None]),
        (pc.less_equal, [True, True, False, None]),
        (pc.greater, [False, False, True, None]),
        (pc.greater_equal, [False, True, True, None]),
    ]
    for kernel, expected in remaining_cases:
        assert kernel(data, two).equals(make(expected))
示例#2
0
def test_compare_array(typ):
    """Exercise the pyarrow.compute comparison kernels: array vs. array.

    ``typ`` picks the container under test: ``"array"`` builds a plain
    ``pa.array``; any other value wraps the values in a single-chunk
    ``pa.chunked_array``.
    """
    use_plain_array = typ == "array"

    def make(values):
        if use_plain_array:
            return pa.array(values)
        return pa.chunked_array([values])

    left = make([1, 2, 3, 4, None])
    right = make([1, 1, 4, None, 4])

    # (kernel, expected element-wise outcome of ``left <op> right``);
    # a null on either side produces a null result.
    cases = [
        (pc.equal, [True, False, False, None, None]),
        (pc.not_equal, [False, True, True, None, None]),
        (pc.less, [False, False, True, None, None]),
        (pc.less_equal, [True, False, True, None, None]),
        (pc.greater, [False, True, False, None, None]),
        (pc.greater_equal, [True, True, False, None, None]),
    ]
    for kernel, expected in cases:
        assert kernel(left, right).equals(make(expected))
示例#3
0
def test_compare_scalar(typ):
    """Check the pyarrow.compute comparison kernels for integers vs. a scalar.

    ``typ`` selects the container under test: ``"array"`` builds a plain
    ``pa.array``; any other value wraps the values in a single-chunk
    ``pa.chunked_array``.
    """
    if typ == "array":

        def con(values):
            return pa.array(values)
    else:

        def con(values):
            return pa.chunked_array([values])

    arr = con([1, 2, 3, None])
    # Build the scalar directly instead of reducing a one-element array.
    scalar = pa.scalar(2)

    result = pc.equal(arr, scalar)
    assert result.equals(con([False, True, False, None]))

    result = pc.not_equal(arr, scalar)
    assert result.equals(con([True, False, True, None]))

    result = pc.less(arr, scalar)
    assert result.equals(con([True, False, False, None]))

    result = pc.less_equal(arr, scalar)
    assert result.equals(con([True, True, False, None]))

    result = pc.greater(arr, scalar)
    assert result.equals(con([False, False, True, None]))

    result = pc.greater_equal(arr, scalar)
    assert result.equals(con([False, True, True, None]))
示例#4
0
def test_compare_string_scalar(typ):
    """Exercise the pyarrow.compute comparison kernels: strings vs. a scalar.

    ``typ`` picks the container under test: ``"array"`` builds a plain
    ``pa.array``; any other value wraps the values in a single-chunk
    ``pa.chunked_array``.
    """
    is_plain_array = typ == "array"

    def make(values):
        if is_plain_array:
            return pa.array(values)
        return pa.chunked_array([values])

    letters = make(['a', 'b', 'c', None])
    pivot = pa.scalar('b')

    assert pc.equal(letters, pivot).equals(make([False, True, False, None]))

    if is_plain_array:
        # Comparing against a null string scalar must give null everywhere.
        null_scalar = pa.scalar(None, type="string")
        null_flags = pc.is_null(pc.equal(letters, null_scalar))
        assert null_flags.equals(make([True, True, True, True]))

    # (kernel, expected element-wise outcome of ``letters <op> 'b'``)
    remaining_cases = [
        (pc.not_equal, [True, False, True, None]),
        (pc.less, [True, False, False, None]),
        (pc.less_equal, [True, True, False, None]),
        (pc.greater, [False, False, True, None]),
        (pc.greater_equal, [False, True, True, None]),
    ]
    for kernel, expected in remaining_cases:
        assert kernel(letters, pivot).equals(make(expected))
示例#5
0
# Maps each supported operator token to the name of the ``compute`` kernel
# implementing it. Kernels are resolved by name at call time, so a kernel is
# only looked up when its operator is actually requested.
_BINARY_OP_KERNELS = {
    "+": "add_checked",
    "-": "subtract_checked",
    "*": "multiply_checked",
    "/": "divide_checked",
    "=": "equal",
    "==": "equal",
    "<>": "not_equal",
    "!=": "not_equal",
    "<": "less",
    "<=": "less_equal",
    ">": "greater",
    ">=": "greater_equal",
    "and": "and_",
    "or": "or_",
    "in": "is_in",
}


def binary_col(op, l, r):
    """
  Interpreter for executing binary operator expressions.

  Args:
      op: operator token such as "+", "<=", "and" or "in".
      l: left operand, passed as the first argument to the kernel.
      r: right operand, passed as the second argument to the kernel.

  Returns:
      Whatever the selected ``compute`` kernel returns for ``(l, r)``.

  Raises:
      NotImplementedError: if ``op`` is not a supported operator. This is a
          subclass of ``Exception``, so ``except Exception`` handlers still
          catch it.
  """
    try:
        kernel_name = _BINARY_OP_KERNELS[op]
    except (KeyError, TypeError):
        # TypeError covers unhashable ``op`` values; they are unsupported too.
        raise NotImplementedError(
            f"binary op not implemented: {op!r}"
        ) from None
    return getattr(compute, kernel_name)(l, r)
示例#6
0
def _replay_single(
    dname: str,
    factors: List[Factor],
    *,
    predicate: Optional[Factor] = None,
    batch_size: int = 40960,
    trim: bool = False,
    index_col: Optional[str] = None,
    n_factor_jobs: int = 1,
    verbose: bool = False,
) -> Tuple[pa.Table, Set[str]]:
    """Replay ``factors`` on the dataset ``dname`` and assemble a result table.

    Args:
        dname: Dataset location; passed to ``_native_replay`` and, when
            ``index_col`` is set, to ``pq.read_table``.
        factors: Factors to compute. Each one becomes an output column named
            ``str(factor)``, in the order given here.
        predicate: Optional extra factor replayed after ``factors``. Only rows
            where its value is greater than ``0.0`` are kept; the predicate
            itself is not an output column.
        batch_size: Forwarded to ``_native_replay``.
        trim: When true, drop the leading rows up to the largest
            ``ready_offset()`` among the data columns.
        index_col: Optional column read from ``dname`` and placed first in the
            output; it is also recorded as the pandas index in the schema
            metadata.
        n_factor_jobs: Forwarded to ``_native_replay`` as ``njobs``.
        verbose: When true, print every failed factor and its reason to
            ``stderr``.

    Returns:
        A ``(table, failed)`` pair, where ``failed`` holds ``str(factor)`` for
        every factor that could not be computed. Such factors still appear in
        ``table`` as all-null float64 columns.

    Raises:
        ValueError: If the predicate itself failed to compute.
    """
    has_predicate = predicate is not None

    # The predicate is replayed as one extra trailing column, so its position
    # in the replay result is ``len(factors)``.
    to_replay = [*factors, predicate] if has_predicate else factors
    predicate_pos = len(factors)
    replay_result = _native_replay(
        dname, to_replay, batch_size=batch_size, njobs=n_factor_jobs
    )

    table_datas, table_names = [], []

    if index_col is not None:
        index = pq.read_table(dname, columns=[index_col]).column(index_col)
        table_datas.append(index)
        table_names.append(index_col)

    predicate_values = None
    for i, (data_ptr, schema_ptr) in replay_result["succeeded"].items():
        arr = pa.Array._import_from_c(data_ptr, schema_ptr)

        if has_predicate and i == predicate_pos:  # is the predicate col
            predicate_values = arr
        else:
            table_datas.append(arr)
            table_names.append(str(factors[i]))

    # Fill in the failed columns with an all-null float64 placeholder.
    N = replay_result["nrows"]
    nanarr = pa.array(np.empty(N, "f8"), mask=np.ones(N, "b1"))

    for i, reason in replay_result["failed"].items():
        if has_predicate and i == predicate_pos:
            # Without the predicate the table cannot be filtered, so give up.
            raise ValueError(f"predicate failed to compute: {reason}")

        table_datas.append(nanarr)
        table_names.append(str(factors[i]))

        if verbose:
            print(f"{factors[i]} failed: {reason}", file=stderr)

    tb = pa.Table.from_arrays(
        table_datas,
        names=table_names,
    )

    if trim:
        # When present, the first column is the index and has no ready offset.
        data_starts = 1 if index_col is not None else 0

        ready_offset = np.max(
            [Factor(col).ready_offset() for col in tb.column_names[data_starts:]]
        )

        tb = tb.slice(ready_offset)

        # trim predicate as well so it stays aligned with the table rows
        if predicate_values is not None:
            predicate_values = predicate_values.slice(ready_offset)

    if has_predicate:
        assert (
            predicate_values is not None
        ), "predicate_values is none, this is not possible"

        # filter the table using the predicate
        tb = pc.filter(tb, pc.greater(predicate_values, 0.0))

    if index_col is not None:
        # sort the columns based on the order passed in
        tb = tb.select([index_col] + [str(f) for f in factors])

        # set the metadata for the index col, so that when `.to_pandas` is called,
        # the index col automatically becomes the index.
        # NOTE(review): `slice(0)` gives only an offset, so this appears to
        # convert the whole table to pandas just to derive metadata - confirm
        # whether an empty slice would be enough.
        header = tb.slice(0).to_pandas()
        header = header.set_index(index_col)
        _, _, metadata = pa.pandas_compat.dataframe_to_types(header, True)
        tb = tb.replace_schema_metadata(metadata)
    else:
        # sort the columns based on the order passed in
        tb = tb.select([str(f) for f in factors])

    return (
        tb,
        {str(factors[k]) for k in replay_result["failed"].keys()},
    )