def set_primary_ipart(self, ipart, recurse=True):
    self.data.set_primary_ipart(ipart)
    if self.nullable:
        self.bitmask.set_primary_ipart(ipart)
    if recurse:
        if ty.is_string_dtype(self.dtype):
            # String columns store their characters indirectly: the first
            # child holds ranges into the second (character) child, so the
            # character column must be partitioned by the image of those
            # ranges under the new primary partition.
            ranges = self.children[0]
            values = self.children[1]
            ranges.set_primary_ipart(ipart)
            assert ty.is_range_dtype(ranges.data.dtype)
            values.set_primary_ipart(
                self.runtime.create_partition_by_image(
                    values.ispace,
                    self.cspace,
                    ranges.data,
                    ipart,
                    kind=legion.DISJOINT_COMPLETE_KIND,
                    range=True,
                )
            )
        else:
            for child in self.children:
                child.set_primary_ipart(ipart)
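
# Illustration: a minimal sketch, in plain Python, of the image
# partitioning used above. Assuming two pieces of the ranges child, each
# a list of (start, stop) pairs into the character buffer, the image of
# each piece is the set of character indices its ranges cover; the images
# are disjoint and together cover the buffer (DISJOINT_COMPLETE), so
# characters always land on the same piece as their owning strings.
def _example_partition_by_image():
    pieces = [[(0, 2), (2, 5)], [(5, 5), (5, 9)]]
    image = [set().union(*(range(lo, hi) for lo, hi in p)) for p in pieces]
    assert image[0] == set(range(0, 5))
    assert image[1] == set(range(5, 9))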
def create_future_from_scalar(self, scalar):
    if scalar.valid:
        if ty.is_string_dtype(scalar.dtype):
            # Wire format for strings: valid flag, dtype code, byte
            # length, then the UTF-8 payload. The length must be the
            # encoded byte length, not the character count, or non-ASCII
            # values would be silently truncated by struct.pack.
            payload = scalar._value.encode("utf-8")
            size = len(payload)
            buf = struct.pack(
                f"iiQ{size}s",
                scalar.valid,
                ty.encode_dtype(scalar.dtype),
                size,
                payload,
            )
        else:
            fmt = ty.to_format_string(scalar.dtype.storage_dtype)
            buf = struct.pack(
                "ii" + fmt,
                scalar.valid,
                ty.encode_dtype(scalar.dtype),
                scalar._value,
            )
    else:
        # Invalid (null) scalars carry only the header and a zero size.
        buf = struct.pack("iiQ", scalar.valid, ty.encode_dtype(scalar.dtype), 0)
    fut = Future()
    fut.set_value(self._runtime, buf, len(buf))
    return PandasFuture(self, fut, scalar.dtype, True)
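
# Illustration: a minimal sketch of the string scalar wire format above,
# using only the standard library. The dtype code 7 is a stand-in for
# whatever `ty.encode_dtype` would return: a 4-byte valid flag, a 4-byte
# dtype code, a uint64 byte length, then the UTF-8 payload.
def _example_scalar_wire_format():
    import struct

    payload = "hello".encode("utf-8")
    size = len(payload)
    buf = struct.pack(f"iiQ{size}s", 1, 7, size, payload)
    valid, code, size = struct.unpack("iiQ", buf[:16])
    (value,) = struct.unpack(f"{size}s", buf[16:16 + size])
    assert (valid, size, value.decode("utf-8")) == (1, 5, "hello")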
def all_to_offsets(self):
    children = [child.all_to_offsets() for child in self.children]
    if ty.is_string_dtype(self.dtype):
        # The ranges child of a string column is converted to a plain
        # offsets representation; other children recurse unchanged.
        children[0] = children[0].to_offsets()
    new_self = self.clone(shallow=True)
    new_self.set_primary_ipart(self.primary_ipart, recurse=False)
    new_self.children = children
    return new_self
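
# Illustration: a minimal sketch of the ranges-to-offsets conversion
# above, assuming ranges are stored as (start, stop) pairs over the
# character buffer. N contiguous ranges collapse into N + 1 monotone
# offsets, which is what the offsets representation relies on.
def _example_ranges_to_offsets():
    ranges = [(0, 5), (5, 5), (5, 8)]  # "hello", "", "abc"
    offsets = [start for start, _ in ranges] + [ranges[-1][1]]
    assert offsets == [0, 5, 5, 8]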
def binary_op(self, op, other):
    reverse = False
    if op in _REVERSED_OPS:
        # Reversed operators ("radd", "rsub", ...) map to their forward
        # counterparts with the operands swapped.
        op = op[1:]
        reverse = True

    rhs1 = self._columns
    if is_scalar(other):
        other = self._runtime.create_scalar(other, ty.infer_dtype(other))
        rhs2 = [other] * len(rhs1)
    else:
        rhs2 = other._columns

    results = []
    for rh1, rh2 in zip(rhs1, rhs2):
        # If the right operand is an integer scalar, convert it to the
        # left operand's dtype; otherwise find a common dtype for both.
        if isinstance(rh2, Scalar):
            if ty.is_integer_dtype(rh2.dtype):
                rh2 = rh2.astype(rh1.dtype)
            elif ty.is_categorical_dtype(rh1.dtype):
                rh2 = rh1.dtype.encode(rh2, unwrap=False, can_fail=True)
            else:
                common_dtype = ty.find_common_dtype(rh1.dtype, rh2.dtype)
                rh1 = rh1.astype(common_dtype)
                rh2 = rh2.astype(common_dtype)
        elif not (
            ty.is_categorical_dtype(rh1.dtype)
            or ty.is_categorical_dtype(rh2.dtype)
        ):
            common_dtype = ty.find_common_dtype(rh1.dtype, rh2.dtype)
            rh1 = rh1.astype(common_dtype)
            rh2 = rh2.astype(common_dtype)

        lh_dtype = ty.get_binop_result_type(op, rh1.dtype, rh2.dtype)

        if ty.is_string_dtype(rh1.dtype) and op in ("add", "mul"):
            raise err._unsupported_error(
                f"unsupported operand type(s) for {op}: "
                f"'{rh1.dtype}' and '{rh2.dtype}'"
            )

        if reverse:
            rh1, rh2 = rh2, rh1

        # The task expects the column operand on the left; if the swap
        # above put a scalar there, swap back and remember that we did.
        swapped = False
        if isinstance(rh1, Scalar):
            rh1, rh2 = rh2, rh1
            swapped = True

        results.append(rh1.binary_op(op, rh2, lh_dtype, swapped=swapped))

    return Table(self._runtime, self._index, results)
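
# Illustration: a minimal sketch of the reversed-operator handling
# above, with a hypothetical `_REVERSED_OPS` set standing in for the
# real one. "rsub" becomes "sub" with the operands swapped, which is
# exactly what `2.__rsub__(10)` (i.e. 10 - 2) requires.
def _example_reversed_op():
    _REVERSED_OPS = {"radd", "rsub"}  # hypothetical; mirrors the real set
    op, lhs, rhs = "rsub", 2, 10
    if op in _REVERSED_OPS:
        op = op[1:]
        lhs, rhs = rhs, lhs
    assert (op, lhs - rhs) == ("sub", 8)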
def _create_column(storage, dtype, ipart=None, nullable=False):
    column = storage.create_column(dtype, ipart=ipart, nullable=nullable)
    if ty.is_string_dtype(dtype):
        offset_storage = storage._runtime.create_output_storage()
        char_storage = storage._runtime.create_output_storage()
        column.add_child(offset_storage.create_column(ty.int32, nullable=False))
        column.add_child(char_storage.create_column(ty.int8, nullable=False))
        column = column.as_string_column()
    return column
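
# Illustration: a minimal sketch of the string layout built above.
# Strings are stored Arrow-style as an int32 offsets child plus an int8
# character child, so the i-th string is chars[offsets[i]:offsets[i + 1]].
def _example_string_layout():
    strings = ["ab", "", "xyz"]
    chars = "".join(strings).encode("utf-8")
    offsets = [0]
    for s in strings:
        offsets.append(offsets[-1] + len(s.encode("utf-8")))
    assert offsets == [0, 2, 2, 5]
    assert chars[offsets[1]:offsets[2]] == b""
    assert chars[offsets[2]:offsets[3]] == b"xyz"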
def get_scalar(self):
    buf = self._future.get_buffer()
    valid, type_code = unpack("ii", buf[:8])
    dtype = ty.code_to_dtype(type_code)
    assert not ty.is_string_dtype(dtype)
    assert dtype == self.dtype
    (value,) = unpack(ty.to_format_string(dtype), buf[8:8 + dtype.itemsize])
    return Scalar(self._runtime, dtype, bool(valid), value)
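
# Illustration: a minimal sketch of the fixed-size decode path above,
# assuming an int64 scalar and using 3 as a stand-in for whatever code
# `ty.encode_dtype` would assign: an 8-byte "ii" header followed by the
# value in its native format.
def _example_scalar_decode():
    import struct

    buf = struct.pack("iiq", 1, 3, -42)
    valid, type_code = struct.unpack("ii", buf[:8])
    (value,) = struct.unpack("q", buf[8:16])
    assert (bool(valid), type_code, value) == (True, 3, -42)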
def create_future(self, value, dtype=None):
    if ty.is_categorical_dtype(dtype):
        dtype = ty.string
    if ty.is_string_dtype(dtype):
        return self.create_future_from_string(value)
    pandas_dtype = (
        dtype.storage_dtype.to_pandas() if dtype is not None else None
    )
    result = Future()
    value = numpy.array(value, dtype=pandas_dtype)
    if ty.is_timestamp_dtype(dtype):
        value = value.view(dtype.storage_dtype.to_pandas())
    result.set_value(self._runtime, value.data, value.nbytes)
    return PandasFuture(self, result, dtype, ready=True)
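
# Illustration: a minimal sketch of the buffer hand-off above. Wrapping
# a Python value in a zero-dimensional numpy array exposes its raw bytes
# (`.data`) and byte count (`.nbytes`) in the requested storage dtype.
def _example_numpy_buffer():
    import numpy

    value = numpy.array(42, dtype="int64")
    assert value.nbytes == 8
    assert numpy.frombuffer(value.data, dtype="int64")[0] == 42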
def _create_output_column(self, storage, op, input):
    dtype = ty.get_reduction_result_type(op, input.dtype)
    nullable = self._nullable_output(input, op)
    output = storage.create_column(dtype, nullable=nullable)
    if ty.is_string_dtype(dtype):
        offset_storage = self._runtime.create_output_storage()
        char_storage = self._runtime.create_output_storage()
        output.add_child(offset_storage.create_column(ty.int32, nullable=False))
        output.add_child(char_storage.create_column(ty.int8, nullable=False))
        output = output.as_string_column()
    return output
def from_pandas(cls, runtime, column, pandas_index):
    volume = runtime.create_future(len(pandas_index), ty.int64)
    if isinstance(pandas_index, pandas.CategoricalIndex):
        # Categorical indices are stored by their codes, recomputed
        # against the sorted categories so that codes are deterministic.
        sorted_categories = pandas_index.categories.sort_values()
        column.data.from_numpy(
            pandas.CategoricalIndex(
                list(pandas_index), categories=sorted_categories
            ).values.codes.astype(np.int64)
        )
    elif ty.is_string_dtype(column.dtype):
        return cls(column, volume, pandas_index.name)
    else:
        column.data.from_numpy(pandas_index.values)
    return cls(column, volume, pandas_index.name)
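
# Illustration: a minimal sketch of the recoding above. Rebuilding a
# CategoricalIndex against sorted categories makes the codes independent
# of the order in which the categories originally appeared.
def _example_categorical_codes():
    import pandas

    idx = pandas.CategoricalIndex(["b", "a", "b"], categories=["b", "a"])
    sorted_categories = idx.categories.sort_values()
    recoded = pandas.CategoricalIndex(list(idx), categories=sorted_categories)
    assert list(recoded.codes) == [1, 0, 1]  # codes now follow ["a", "b"]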
def astype(self, result_dtype, **kwargs):
    if result_dtype == self.dtype:
        return self
    elif ty.is_timestamp_dtype(result_dtype):
        if self.dtype != ty.string:
            raise err._unsupported_error(
                f"astype to {result_dtype} is not yet supported. "
                "please use to_datetime instead"
            )
        else:
            fmt = "%Y-%m-%d %H:%M:%S"
            warnings.warn(
                f"astype from {self.dtype} to {result_dtype} currently "
                f"uses a fixed format string '{fmt}' to parse strings. "
                "please use to_datetime instead if you want the strings "
                "to be parsed differently."
            )
            return self.to_datetime(fmt)

    if self.dtype == ty.ts_ns:
        # Datetimes can only be converted to int64 or string. (The
        # original check compared against a generator expression, which
        # is always truthy, so the error below could never fire.)
        if result_dtype not in (ty.int64, ty.string):
            raise TypeError(
                "cannot astype a datetimelike from "
                f"datetime64[ns] to {result_dtype}"
            )
        if result_dtype == ty.int64:
            return self.cast_unsafe(result_dtype)

    runtime = self.runtime
    result = self.storage.create_column(
        result_dtype, ipart=self.primary_ipart, nullable=False
    )
    if ty.is_string_dtype(result_dtype):
        offsets_storage = runtime.create_output_storage()
        chars_storage = runtime.create_output_storage()
        result.add_child(offsets_storage.create_column(ty.int32, nullable=False))
        result.add_child(chars_storage.create_column(ty.int8, nullable=False))
        result = result.as_string_column()

    plan = Map(runtime, OpCode.ASTYPE)
    result.add_to_plan_output_only(plan)
    self.add_to_plan(plan, True)
    plan.execute(result.launch_domain)

    result.set_bitmask(self.bitmask)
    return result
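
# Illustration: a minimal sketch of the fixed-format parsing the warning
# above refers to, using only the standard library. Strings deviating
# from "%Y-%m-%d %H:%M:%S" fail to parse, which is why to_datetime with
# an explicit format is the recommended route.
def _example_fixed_format_parse():
    from datetime import datetime

    parsed = datetime.strptime("2021-06-01 12:30:00", "%Y-%m-%d %H:%M:%S")
    assert (parsed.year, parsed.hour) == (2021, 12)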
def create_scalar(self, value, dtype):
    if ty.is_categorical_dtype(dtype):
        dtype = ty.string
    # Sanitize the value to make it fit the dtype
    if value is not None:
        if isinstance(value, numpy.datetime64):
            value = value.view("int64")
        if ty.is_string_dtype(dtype):
            value = str(value)
        elif ty.is_integer_dtype(dtype):
            value = int(value)
    return Scalar(self, dtype, value is not None, 0 if value is None else value)
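
# Illustration: a minimal sketch of the datetime64 sanitization above.
# Viewing a numpy.datetime64 as int64 exposes the raw tick count in the
# value's own unit (days for "D", nanoseconds for "ns", and so on).
def _example_datetime64_view():
    import numpy

    value = numpy.datetime64("1970-01-02", "D")
    assert value.view("int64") == 1  # one day since the epoch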
def create_similar_column(self, to_imitate, nullable=None):
    if not to_imitate.partitioned:
        return to_imitate
    result = self.create_column(
        to_imitate.dtype,
        to_imitate.nullable if nullable is None else nullable,
        type(to_imitate),
    )
    children_to_copy = to_imitate.children
    if ty.is_string_dtype(to_imitate.dtype):
        storages = [
            self._runtime.create_output_storage() for _ in children_to_copy
        ]
    else:
        storages = [self] * len(children_to_copy)
    result.children = [
        storage.create_similar_column(child, nullable)
        for storage, child in zip(storages, children_to_copy)
    ]
    return result
def copy_if_else(self, cond, other=None, negate=False):
    assert cond.dtype == ty.bool

    has_other = other is not None
    other_is_scalar = not isinstance(other, Column)
    if has_other:
        if other_is_scalar:
            other = self.runtime.create_scalar(other, self.dtype)
        elif other.dtype != self.dtype:
            other = other.astype(self.dtype)

    runtime = self.runtime
    plan = Map(runtime, OpCode.COPY_IF_ELSE)

    # The result is nullable if the input is, if no replacement value is
    # given (unselected entries become nulls), or if a replacement column
    # is itself nullable.
    nullable = (
        self.nullable
        or not has_other
        or (not other_is_scalar and other.nullable)
    )
    lhs = self.storage.create_column(self.dtype, self.primary_ipart, nullable)
    if ty.is_string_dtype(self.dtype):
        offsets_storage = runtime.create_output_storage()
        chars_storage = runtime.create_output_storage()
        lhs.add_child(offsets_storage.create_column(ty.int32, nullable=False))
        lhs.add_child(chars_storage.create_column(ty.int8, nullable=False))
        lhs = lhs.as_string_column()

    lhs.add_to_plan_output_only(plan)
    self.add_to_plan(plan, True)
    cond.add_to_plan(plan, True)
    plan.add_scalar_arg(negate, ty.bool)
    plan.add_scalar_arg(has_other, ty.bool)
    if has_other:
        plan.add_scalar_arg(other_is_scalar, ty.bool)
        other.add_to_plan(plan, True)
    plan.execute(self.launch_domain)
    return lhs
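
# Illustration: a minimal sketch of the copy_if_else semantics above in
# plain numpy, assuming the condition selects entries kept from the
# input (with `negate` flipping the mask) and `other` fills the rest.
def _example_copy_if_else():
    import numpy

    values = numpy.array([1, 2, 3, 4])
    cond = numpy.array([True, False, True, False])
    other = 0
    negate = True
    keep = ~cond if negate else cond
    assert list(numpy.where(keep, values, other)) == [0, 2, 0, 4]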
def __init__(self, series):
    super(StringMethods, self).__init__(series)
    assert ty.is_string_dtype(self._column.dtype)
def read_parquet(path, columns, **kwargs):
    from legate.core import Rect

    from .runtime import _runtime as rt

    path = util.to_list_if_scalar(path)
    if len(path) == 1 and os.path.isdir(path[0]):
        from pyarrow.parquet import ParquetDataset

        ds = ParquetDataset(path)
        path = [piece.path for piece in ds.pieces]
    else:
        from pyarrow.parquet import ParquetFile

        ds = ParquetFile(path[0])
        if rt.debug:
            assert all(ParquetFile(p).schema == ds.schema for p in path)

    dedup_names = set()
    for name in ds.schema.names:
        if name in dedup_names:
            raise ValueError(
                "Duplicate column names in schema are not supported."
            )
        dedup_names.add(name)

    schema = ds.schema.to_arrow_schema()

    # Recover the index description from the pandas metadata, if any.
    # A materialized index is stored as regular columns referenced by
    # name; a RangeIndex is stored as a descriptor dictionary instead.
    index_descs = []
    index_materialized = False
    if b"pandas" in ds.metadata.metadata:
        import json

        pandas_metadata = json.loads(ds.metadata.metadata[b"pandas"])
        index_descs = pandas_metadata["index_columns"]
        index_materialized = len(index_descs) > 0 and all(
            isinstance(desc, str) for desc in index_descs
        )

    if columns is None:
        column_names = schema.names
    elif index_materialized:
        column_names = columns + index_descs
    else:
        column_names = columns

    for name in column_names:
        if name not in dedup_names:
            raise ValueError("Field named %s not found in the schema." % name)

    schema = [schema.field(name) for name in column_names]
    del columns

    storage = rt.create_output_storage()
    offsets_storage = None
    columns = []
    for column_info in schema:
        dtype = ty.to_legate_dtype(column_info.type)
        column = storage.create_column(dtype)
        if ty.is_string_dtype(dtype):
            # String columns get Arrow-style offsets and character
            # children; the offsets storage is shared across columns.
            if offsets_storage is None:
                offsets_storage = rt.create_output_storage()
            offsets_column = offsets_storage.create_column(
                ty.int32, nullable=False
            )
            chars_storage = rt.create_output_storage()
            char_column = chars_storage.create_column(ty.int8, nullable=False)
            column.add_child(offsets_column)
            column.add_child(char_column)
            column = column.as_string_column()
        columns.append(column)

    plan = Map(rt, OpCode.READ_PARQUET)
    plan.add_scalar_arg(len(path), ty.uint32)
    for f in path:
        plan.add_scalar_arg(f, ty.string)
    plan.add_scalar_arg(len(column_names), ty.uint32)
    for name in column_names:
        plan.add_scalar_arg(name, ty.string)
    plan.add_scalar_arg(len(columns), ty.uint32)
    for column in columns:
        column.add_to_plan_output_only(plan)
    counts = plan.execute(Rect([rt.num_pieces]))

    storage = plan.promote_output_storage(storage)
    rt.register_external_weighted_partition(storage.default_ipart, counts)
    del plan

    size = counts.cast(ty.int64).sum()

    if index_materialized:
        # Split the columns read from the file into index columns and
        # value columns.
        to_filter = set(index_descs)
        index_columns = []
        value_columns = []
        value_column_names = []
        for idx, name in enumerate(column_names):
            if name in to_filter:
                index_columns.append(columns[idx])
            else:
                value_columns.append(columns[idx])
                value_column_names.append(column_names[idx])

        sanitized_names = [
            None if name == f"__index_level_{level}__" else name
            for level, name in enumerate(index_descs)
        ]
        index = create_index_from_columns(index_columns, size, sanitized_names)
    else:
        value_columns = columns
        value_column_names = column_names
        if len(index_descs) > 0:
            assert len(index_descs) == 1
            index_desc = index_descs[0]
            name = index_desc["name"]
            start = rt.create_future(index_desc["start"], ty.int64)
            stop = rt.create_future(index_desc["stop"], ty.int64)
            step = rt.create_future(index_desc["step"], ty.int64)
            index = create_range_index(storage, size, name, start, stop, step)
        else:
            index = create_range_index(storage, size)

    from pandas import Index

    return {
        "frame": Table(rt, index, value_columns),
        "columns": Index(value_column_names),
    }
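
# Illustration: a minimal sketch of how the index description above is
# recovered from a Parquet file's pandas metadata (the path is
# hypothetical; the file is assumed to have been written by pandas). A
# DataFrame with a RangeIndex yields a descriptor dict; a materialized
# index yields column names such as "__index_level_0__".
def _example_pandas_metadata(path="data.parquet"):
    import json
    from pyarrow.parquet import ParquetFile

    metadata = ParquetFile(path).metadata.metadata
    if metadata and b"pandas" in metadata:
        return json.loads(metadata[b"pandas"])["index_columns"]
    return []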
def read_csv(
    paths,
    sep=None,
    usecols=None,
    dtypes=None,
    true_values=None,
    false_values=None,
    skiprows=0,
    skipfooter=0,
    nrows=None,
    na_values=None,
    skip_blank_lines=True,
    date_cols=(),
    compressions=None,
    quotechar='"',
    quoting=0,
    doublequote=True,
):
    from legate.core import Rect

    from .runtime import _runtime as rt

    storage = rt.create_output_storage()
    offsets_storage = None

    # Override the dtype for category columns, as they are not directly
    # handled by the CSV reader; they are read as strings and converted
    # to categories afterwards.
    storage_dtypes = [
        ty.string if dtype == "category" else dtype for dtype in dtypes
    ]
    columns = [storage.create_column(dtype) for dtype in storage_dtypes]
    for column in columns:
        if ty.is_string_dtype(column.dtype):
            if offsets_storage is None:
                offsets_storage = rt.create_output_storage()
            offsets_column = offsets_storage.create_column(
                ty.int32, nullable=False
            )
            chars_storage = rt.create_output_storage()
            char_column = chars_storage.create_column(ty.int8, nullable=False)
            column.add_child(offsets_column)
            column.add_child(char_column)
    columns = [
        column.as_string_column()
        if ty.is_string_dtype(column.dtype)
        else column
        for column in columns
    ]

    # TODO: Since Arrow doesn't support in-flight decompression, we
    # decompress any compressed files before tossing them to the reader.
    to_remove = []
    if not rt.has_gpus:
        paths, compressions, to_remove = _uncompress_files(paths, compressions)

    plan = Map(rt, OpCode.READ_CSV)
    plan.add_scalar_arg(len(paths), ty.uint32)
    for path in paths:
        plan.add_scalar_arg(path, ty.string)
    plan.add_scalar_arg(len(compressions), ty.uint32)
    for compression in compressions:
        plan.add_scalar_arg(compression.value, ty.int32)
    plan.add_scalar_arg(sep, ty.string)
    plan.add_scalar_arg(skiprows, ty.int32)
    plan.add_scalar_arg(skipfooter, ty.int32)
    _may_add_to_plan(plan, nrows, ty.int32)
    plan.add_scalar_arg(quotechar, ty.string)
    plan.add_scalar_arg(doublequote, ty.bool)
    plan.add_scalar_arg(skip_blank_lines, ty.bool)
    _may_add_to_plan(plan, true_values, ty.string)
    _may_add_to_plan(plan, false_values, ty.string)
    _may_add_to_plan(plan, na_values, ty.string)
    plan.add_scalar_arg(len(columns), ty.uint32)
    for column in columns:
        column.add_to_plan_output_only(plan)
    plan.add_scalar_arg(len(date_cols), ty.uint32)
    for idx in date_cols:
        plan.add_scalar_arg(idx, ty.int32)
    counts = plan.execute(Rect([rt.num_pieces]))

    storage = plan.promote_output_storage(storage)
    rt.register_external_weighted_partition(storage.default_ipart, counts)
    del plan

    columns = [
        column.to_category_column() if dtype == "category" else column
        for column, dtype in zip(columns, dtypes)
    ]

    size = counts.cast(ty.int64).sum()
    index = create_range_index(storage, size)

    # Clean up any temporary files produced by the decompression step,
    # but only after the reader tasks are known to have finished.
    if len(to_remove) > 0:
        counts.wait()
        for path in to_remove:
            os.remove(path)

    return Table(rt, index, columns)
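
# Illustration: a minimal sketch of the pre-read decompression step
# above, assuming gzip input (the helper name and path are
# hypothetical). Each compressed file is expanded into a temporary file
# whose path replaces the original; the temporary path is returned so
# the caller can os.remove() it once the readers have finished.
def _example_uncompress(path="data.csv.gz"):
    import gzip
    import shutil
    import tempfile

    with gzip.open(path, "rb") as src:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as dst:
            shutil.copyfileobj(src, dst)
            return dst.name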
def to_pandas(self, schema_only=False):
    if ty.is_string_dtype(self.dtype):
        return self.as_string_column().to_pandas(schema_only)
    else:
        raise ValueError("Unsupported dtype %s" % self.dtype)