Example #1
import pyarrow as pa
import pyarrow.compute as pc
import vaex.arrow.convert


def combine_missing(a, b):
    # return a copy of a with missing values of a and b combined
    if a.null_count > 0 or b.null_count > 0:
        a, b = vaex.arrow.convert.align(a, b)
        if isinstance(a, pa.ChunkedArray):
            # divide and conquer
            assert isinstance(b, pa.ChunkedArray)
            assert len(a.chunks) == len(b.chunks)
            return pa.chunked_array([
                combine_missing(ca, cb) for ca, cb in zip(a.chunks, b.chunks)
            ])
        if a.offset != 0:
            a = vaex.arrow.convert.trim_buffers_ipc(a)
        if b.offset != 0:
            b = vaex.arrow.convert.trim_buffers_ipc(b)
        assert a.offset == 0
        assert b.offset == 0
        # not optimal
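        # invert(is_null(a) | is_null(b)) yields a boolean array whose data buffer
        # is the combined validity bitmap: set where both a and b are valid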
        nulls = pc.invert(pc.or_(a.is_null(), b.is_null()))
        assert nulls.offset == 0
        nulls_buffer = nulls.buffers()[1]
        # this is not the case: no reason why it should be (TODO: open arrow issue)
        # assert nulls.buffers()[0] is None
        buffers = a.buffers()
        return pa.Array.from_buffers(a.type, len(a),
                                     [nulls_buffer] + buffers[1:])
    else:
        return a
Example #2
from pyarrow import compute


def unary_col(op, v):
    """
  interpretor for executing unary operator expressions on columnars
  """
    if op == "+":
        return v
    if op == "-":
        return compute.subtract(0.0, v)
    if op.lower() == "not":
        return compute.invert(v)
    raise Exception("unary op not implemented")
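
A quick illustration of how unary_col behaves (hypothetical inputs; a sketch assuming pyarrow is installed):

import pyarrow as pa

print(unary_col("-", pa.array([1.0, 2.0, None])))       # values negated: [-1, -2, null]
print(unary_col("not", pa.array([True, False, None])))  # booleans inverted: [false, true, null]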
Example #3
import pyarrow as pa
import pyarrow.compute as pc


def test_logical():
    a = pa.array([True, False, False, None])
    b = pa.array([True, True, False, True])
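    # the plain and_/or_ kernels propagate nulls; the *_kleene variants use
    # three-valued logic (e.g. null OR True is True, null AND False is False)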

    assert pc.and_(a, b) == pa.array([True, False, False, None])
    assert pc.and_kleene(a, b) == pa.array([True, False, False, None])

    assert pc.or_(a, b) == pa.array([True, True, False, None])
    assert pc.or_kleene(a, b) == pa.array([True, True, False, True])

    assert pc.xor(a, b) == pa.array([False, True, False, None])

    assert pc.invert(a) == pa.array([False, True, True, None])
Example #4
import pyarrow as pa
import pyarrow.compute as pc


def combine_missing(a, b):
    assert a.offset == 0
    if a.null_count > 0 or b.null_count > 0:
        # not optimal
        nulls = pc.invert(pc.or_(a.is_null(), b.is_null()))
        assert nulls.offset == 0
        nulls_buffer = nulls.buffers()[1]
        # this is not the case: no reason why it should be (TODO: open arrow issue)
        # assert nulls.buffers()[0] is None
    else:
        nulls_buffer = None
    buffers = a.buffers()
    return pa.Array.from_buffers(a.type, len(a), [nulls_buffer, buffers[1]])
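
For reference, a small check of what this version returns (hypothetical inputs; assumes the pa/pc imports above):

a = pa.array([1, 2, None, 4])
b = pa.array([None, 2, 3, 4])
print(combine_missing(a, b))  # values of a, null wherever a or b is null: [null, 2, null, 4]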
Example #5
File: array.py  Project: YarShev/pandas
    def __invert__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        if pa_version_under2p0:
            raise NotImplementedError(
                "__invert__ not implemented for pyarrow < 2.0")
        return type(self)(pc.invert(self._data))
Example #6

# remove unused columns
table = table.select(["ix", "x", "y", "title", "first_author_name", "date", "language"])

# truncate the title after 101 characters (matching display logic)
truncated_title = pc.utf8_replace_slice(table.column("title"), start=101, stop=1000, replacement="")
table = table.set_column(table.schema.get_field_index("title"), "title", truncated_title)

# ensure all dictionaries in the file use the same key/value mappings
table = table.unify_dictionaries()

# filter out non-numeric dates (e.g. null, "1850-1853")
# matches the hack in index.js:37
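# (pc.is_valid(col) is an equivalent, slightly more direct way to build this mask)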
mask = pc.invert(pc.is_null(table.column("date")))
table = table.filter(mask)

# sorting by the date improves the loading aesthetics
# comment this out to exactly match the original appearance
indices = pc.sort_indices(table, sort_keys=[("date", "ascending")])
table = pc.take(table, indices)

# after sorting, replace ix with an accurate row index
# (sort_indices on the already-sorted table yields the identity permutation 0..n-1)
indices = pc.sort_indices(table, sort_keys=[("date", "ascending")])
table = table.set_column(table.schema.get_field_index("ix"), "ix", pc.cast(indices, pa.uint32()))

temp_path.unlink()

local = fs.LocalFileSystem()
Example #7
import os

from pyarrow import csv
import pyarrow.compute as pc
import pyarrow.parquet as pq

table = csv.read_csv(
    "../../07-pandas/sec1-intro/yellow_tripdata_2020-01.csv.gz")

# year_column = pd.Series(np.full(len(table), 2020))
# month_column = pd.Series(np.full(len(table), 1))
# table = table.append_column("year", pa.Array.from_pandas(year_column))
# table = table.append_column("month", pa.Array.from_pandas(month_column))

table = table.filter(pc.invert(table["VendorID"].is_null()))
table = table.filter(pc.invert(table["passenger_count"].is_null()))

pq.write_to_dataset(table,
                    root_path="all.parquet",
                    partition_cols=["VendorID", "passenger_count"])
all_data = pq.read_table("all.parquet/")
dataset = pq.ParquetDataset("all.parquet/")
dir(dataset)
dataset.pieces
ds_all_data = dataset.read()
data_dir = "all.parquet/VendorID=1/passenger_count=3"
parquet_fname = os.listdir(data_dir)[0]
v1p3 = pq.read_table(f"{data_dir}/{parquet_fname}")
print(v1p3)
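
As an aside, the manual directory walk above can usually be replaced with a predicate on the partition columns; a sketch, assuming a pyarrow version where read_table accepts filters:

# read only the VendorID=1, passenger_count=3 partition via a filter expression
v1p3_alt = pq.read_table(
    "all.parquet/",
    filters=[("VendorID", "=", 1), ("passenger_count", "=", 3)],
)
print(v1p3_alt)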