import pyarrow as pa
import pyarrow.compute as pc

import vaex.arrow.convert


def combine_missing(a, b):
    # return a copy of a with the missing values of a and b combined
    if a.null_count > 0 or b.null_count > 0:
        a, b = vaex.arrow.convert.align(a, b)
        if isinstance(a, pa.ChunkedArray):
            # divide and conquer: combine chunk by chunk
            assert isinstance(b, pa.ChunkedArray)
            assert len(a.chunks) == len(b.chunks)
            return pa.chunked_array([
                combine_missing(ca, cb) for ca, cb in zip(a.chunks, b.chunks)
            ])
        if a.offset != 0:
            a = vaex.arrow.convert.trim_buffers_ipc(a)
        if b.offset != 0:
            b = vaex.arrow.convert.trim_buffers_ipc(b)
        assert a.offset == 0
        assert b.offset == 0
        # not optimal
        nulls = pc.invert(pc.or_(a.is_null(), b.is_null()))
        assert nulls.offset == 0
        nulls_buffer = nulls.buffers()[1]
        # this is not the case: no reason why it should be (TODO: open arrow issue)
        # assert nulls.buffers()[0] is None
        buffers = a.buffers()
        return pa.Array.from_buffers(a.type, len(a), [nulls_buffer] + buffers[1:])
    else:
        return a
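# For reference only: the same null-combination semantics expressed with plain
# compute calls instead of the buffer surgery above. combine_missing_simple is a
# hypothetical name used for illustration; pc.if_else assumes pyarrow >= 4.0.
import pyarrow as pa
import pyarrow.compute as pc

def combine_missing_simple(a, b):
    # keep a's value only where both a and b are valid, null elsewhere
    both_valid = pc.and_(pc.is_valid(a), pc.is_valid(b))
    return pc.if_else(both_valid, a, pa.scalar(None, a.type))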
import pyarrow.compute as compute


def unary_col(op, v):
    """Interpreter for executing unary operator expressions on columnar data."""
    if op == "+":
        return v
    if op == "-":
        return compute.subtract(0.0, v)
    if op.lower() == "not":
        return compute.invert(v)
    raise Exception("unary op not implemented")
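# Quick sanity check of the interpreter on small pyarrow arrays
# (expected results shown in the comments):
import pyarrow as pa

v = pa.array([1.0, -2.0, 3.0])
print(unary_col("-", v))        # [-1, 2, -3]

flags = pa.array([True, False, None])
print(unary_col("NOT", flags))  # [false, true, null]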
import pyarrow as pa
import pyarrow.compute as pc


def test_logical():
    a = pa.array([True, False, False, None])
    b = pa.array([True, True, False, True])
    assert pc.and_(a, b) == pa.array([True, False, False, None])
    assert pc.and_kleene(a, b) == pa.array([True, False, False, None])
    assert pc.or_(a, b) == pa.array([True, True, False, None])
    assert pc.or_kleene(a, b) == pa.array([True, True, False, True])
    assert pc.xor(a, b) == pa.array([False, True, False, None])
    assert pc.invert(a) == pa.array([False, True, True, None])
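# Kleene logic treats null as "unknown", so a null operand only survives when
# the result is genuinely undetermined; a minimal illustration:
import pyarrow as pa
import pyarrow.compute as pc

x = pa.array([False, True, None])
unknown = pa.array([None, None, None], type=pa.bool_())
assert pc.and_kleene(x, unknown) == pa.array([False, None, None])  # False AND unknown -> False
assert pc.or_kleene(x, unknown) == pa.array([None, True, None])    # True OR unknown -> True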
import pyarrow as pa
import pyarrow.compute as pc


def combine_missing(a, b):
    assert a.offset == 0
    if a.null_count > 0 or b.null_count > 0:
        # not optimal
        nulls = pc.invert(pc.or_(a.is_null(), b.is_null()))
        assert nulls.offset == 0
        nulls_buffer = nulls.buffers()[1]
        # this is not the case: no reason why it should be (TODO: open arrow issue)
        # assert nulls.buffers()[0] is None
    else:
        nulls_buffer = None
    buffers = a.buffers()
    return pa.Array.from_buffers(a.type, len(a), [nulls_buffer, buffers[1]])
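# Usage sketch for the flat-array variant above, assuming both inputs have the
# same length and zero offset:
import pyarrow as pa

a = pa.array([1, None, 3, 4])
b = pa.array([10.0, 20.0, None, 40.0])
print(combine_missing(a, b))  # [1, null, null, 4]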
def __invert__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
    if pa_version_under2p0:
        raise NotImplementedError(
            "__invert__ not implemented for pyarrow < 2.0")
    return type(self)(pc.invert(self._data))
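# Through the public pandas API this sits behind the ~ operator on Arrow-backed
# boolean columns; a minimal sketch assuming pandas >= 1.5 with ArrowDtype support:
import pandas as pd
import pyarrow as pa

s = pd.Series([True, False, None], dtype=pd.ArrowDtype(pa.bool_()))
print(~s)  # False, True, <NA> -- nulls propagate through pc.invert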
    ),
)

# remove unused columns
table = table.select(["ix", "x", "y", "title", "first_author_name", "date", "language"])

# truncate the title after 101 characters (matching the display logic)
truncated_title = pc.utf8_replace_slice(table.column("title"), start=101, stop=1000, replacement="")
table = table.set_column(table.schema.get_field_index("title"), "title", truncated_title)

# ensure all dictionaries in the file use the same key/value mappings
table = table.unify_dictionaries()

# filter out non-numeric dates (e.g. null, "1850-1853")
# matches the hack in index.js:37
mask = pc.invert(pc.is_null(table.column("date")))
table = table.filter(mask)

# sorting by the date improves the loading aesthetics
# comment this out to exactly match the original appearance
indices = pc.sort_indices(table, sort_keys=[("date", "ascending")])
table = pc.take(table, indices)

# after sorting, replace ix with an accurate row index
indices = pc.sort_indices(table, sort_keys=[("date", "ascending")])
table = table.set_column(table.schema.get_field_index("ix"), "ix", pc.cast(indices, pa.uint32()))

temp_path.unlink()
local = fs.LocalFileSystem()
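# Aside: the date mask above can be computed without the invert;
# pc.is_valid is the complement of pc.is_null:
mask = pc.is_valid(table.column("date"))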
import os

from pyarrow import csv
import pyarrow.compute as pc
import pyarrow.parquet as pq

table = csv.read_csv(
    "../../07-pandas/sec1-intro/yellow_tripdata_2020-01.csv.gz")

# year_column = pd.Series(np.full(len(table), 2020))
# month_column = pd.Series(np.full(len(table), 1))
# table = table.append_column("year", pa.Array.from_pandas(year_column))
# table = table.append_column("month", pa.Array.from_pandas(month_column))

# drop rows where the partition columns are null
table = table.filter(pc.invert(table["VendorID"].is_null()))
table = table.filter(pc.invert(table["passenger_count"].is_null()))

pq.write_to_dataset(table, root_path="all.parquet",
                    partition_cols=["VendorID", "passenger_count"])

all_data = pq.read_table("all.parquet/")

dataset = pq.ParquetDataset("all.parquet/")
dir(dataset)
dataset.pieces
ds_all_data = dataset.read()

data_dir = "all.parquet/VendorID=1/passenger_count=3"
parquet_fname = os.listdir(data_dir)[0]
v1p3 = pq.read_table(f"{data_dir}/{parquet_fname}")
print(v1p3)
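# The same single-partition read can be expressed with the pyarrow dataset API,
# letting Arrow resolve the hive-style directory layout instead of walking it
# with os.listdir; a sketch assuming a pyarrow version that ships pyarrow.dataset:
import pyarrow.dataset as ds

hive_ds = ds.dataset("all.parquet/", format="parquet", partitioning="hive")
v1p3 = hive_ds.to_table(
    filter=(ds.field("VendorID") == 1) & (ds.field("passenger_count") == 3)
)
print(v1p3)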