def _create_index_space(self, rect):
    if not isinstance(rect, PandasFuture):
        # Eager case: the extent is known now, so build the domain directly
        if not isinstance(rect, Rect):
            rect = Rect([rect])
        handle = legion.legion_index_space_create_domain(
            self._runtime, self._context, rect.raw())
    else:
        # Deferred case: the extent is a future, so lift it to a domain
        # and let Legion create the index space from that future
        domain = self.launch_future_task(OpCode.LIFT_TO_DOMAIN, rect)
        handle = legion.legion_index_space_create_future(
            self._runtime, self._context, 1, domain.handle, 0)
    return IndexSpace(self._context, self._runtime, handle=handle)
def _drop_duplicates_one_step(runtime, inputs, subset, keep, radix=1):
    storage = runtime.create_output_storage()
    outputs = [storage.create_similar_column(column) for column in inputs]

    # Each reduction round shrinks the number of pieces by the radix
    num_pieces = (inputs[0].num_pieces + radix - 1) // radix
    launch_domain = Rect([num_pieces])

    plan = Map(runtime, OpCode.DROP_DUPLICATES_TREE)

    plan.add_scalar_arg(keep.value, ty.int32)
    plan.add_scalar_arg(len(subset), ty.uint32)
    for idx in subset:
        plan.add_scalar_arg(idx, ty.int32)
    plan.add_scalar_arg(radix, ty.uint32)
    # Each point task reads `radix` input chunks, selected by the radix
    # projection functors
    for r in range(radix):
        plan.add_scalar_arg(len(inputs), ty.uint32)
        proj_id = runtime.get_radix_functor_id(radix, r)
        for input in inputs:
            input.add_to_plan(plan, True, proj=proj_id)
    plan.add_scalar_arg(len(outputs), ty.uint32)
    for output in outputs:
        output.add_to_plan_output_only(plan)

    counts = plan.execute(launch_domain)
    storage = plan.promote_output_storage(storage)
    return (outputs, storage, counts, outputs[0].num_pieces)
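
# Illustrative sketch, not part of the runtime: a pure-Python model of the
# reduction step above. Each round, point task p consumes the `radix`
# adjacent chunks selected by the radix projection functors (chunks
# p * radix .. p * radix + radix - 1) and emits their de-duplicated union,
# so the chunk count shrinks by ceil-division until one chunk remains.
# `_model_drop_duplicates_tree` is a hypothetical name used only here.
def _model_drop_duplicates_tree(chunks, radix=4):
    while len(chunks) > 1:
        num_pieces = (len(chunks) + radix - 1) // radix  # same ceil-division
        merged = []
        for p in range(num_pieces):
            group = chunks[p * radix:(p + 1) * radix]
            seen, out = set(), []
            for chunk in group:
                for row in chunk:
                    if row not in seen:  # keep == "first" semantics
                        seen.add(row)
                        out.append(row)
            merged.append(out)
        chunks = merged
    return chunks[0]

# e.g. _model_drop_duplicates_tree([[1, 2], [2, 3], [3, 4], [1, 4], [5]], 2)
# returns [1, 2, 3, 4, 5]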
def _construct_groupby_output(self):
    result = self._perform_reduction()

    if self._method == GroupbyVariantCode.HASH:
        # The input table is already partitioned so that chunks have
        # disjoint keys, so we only need a single round of reduction
        return result
    elif self._method == GroupbyVariantCode.TREE:
        # If we do tree-based reduction, we need to repeat reduction
        # rounds until we reach the root of the tree
        self._radix = self._runtime.radix
        while self._num_pieces > 1:
            (self._keys, self._values, total_count) = result
            self._num_pieces = (
                self._num_pieces + self._radix - 1) // self._radix
            self._launch_domain = Rect([self._num_pieces])
            self._cspace = self._runtime.find_or_create_color_space(
                self._num_pieces)
            result = self._perform_reduction()
        return result
    else:
        assert False
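
# Illustrative sketch (hypothetical helper, not in the codebase): the TREE
# branch above performs one reduction per ceil-division of the piece count,
# i.e. roughly ceil(log_radix(num_pieces)) rounds. For instance, 256 pieces
# with radix 4 take 4 rounds: 256 -> 64 -> 16 -> 4 -> 1.
def _num_tree_reduction_rounds(num_pieces, radix):
    rounds = 0
    while num_pieces > 1:
        num_pieces = (num_pieces + radix - 1) // radix
        rounds += 1
    return rounds

assert _num_tree_reduction_rounds(256, 4) == 4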
def to_category_column_cpu(self, dtype):
    rt = self.runtime
    nullable = dtype is not None or self.nullable

    if dtype is None:
        # Local de-duplication
        storage = rt.create_output_storage()
        result_column = storage.create_similar_column(self, nullable=False)

        plan = Map(rt, OpCode.DROP_DUPLICATES_CATEGORIES)
        plan.add_scalar_arg(1, ty.uint32)
        result_column.add_to_plan_output_only(plan)
        self.add_to_plan(plan, True)
        plan.execute(self.launch_domain)
        del plan

        radix = rt.radix
        num_pieces = result_column.num_pieces
        while num_pieces > 1:
            # Global de-duplication
            num_pieces = (num_pieces + radix - 1) // radix

            local_dedup_column = result_column
            storage = rt.create_output_storage()
            result_column = storage.create_similar_column(self,
                                                          nullable=False)

            plan = Map(rt, OpCode.DROP_DUPLICATES_CATEGORIES)
            plan.add_scalar_arg(radix, ty.uint32)
            result_column.add_to_plan_output_only(plan)
            for r in range(radix):
                proj_id = rt.get_radix_functor_id(radix, r)
                local_dedup_column.add_to_plan(plan, True, proj=proj_id)

            launch_domain = Rect([num_pieces])
            plan.execute(launch_domain)
            del plan

        categories_column = result_column.as_replicated_column()
        dtype = ty.CategoricalDtype(categories_column)

    # Encode the input column against the categories column
    encode_result = self.storage.create_column(
        dtype, ipart=self.primary_ipart, nullable=nullable)
    encode_result.add_child(
        self.storage.create_column(
            ty.uint32,
            ipart=self.primary_ipart,
            nullable=False,
        ))

    plan = Map(rt, OpCode.ENCODE)
    encode_result.add_to_plan_output_only(plan)
    dtype.categories_column.add_to_plan(plan, True)
    self.add_to_plan(plan, True)
    plan.execute(self.launch_domain)
    del plan

    encode_result.add_child(dtype.categories_column)
    return encode_result.as_category_column()
def _shuffle_columns(self):
    (self._key_columns, self._sample_columns) = self._sample_keys()

    rt = self._runtime
    cspace = self._input_columns[0].cspace

    hist_ispace = rt.find_or_create_index_space(
        Rect([self._num_pieces, self._num_pieces]))
    hist_storage = rt.create_storage(hist_ispace)
    hist = hist_storage.create_new_field(ty.range64)
    hist_ipart = rt.create_row_partition(
        hist_ispace, cspace, self._num_pieces)

    # Build histogram using samples. Each point task
    # gets the whole set of samples and sorts them independently.
    plan = Map(rt, OpCode.BUILD_HISTOGRAM)

    plan.add_scalar_arg(self._num_pieces, ty.uint32)
    plan.add_scalar_arg(self._put_null_first, ty.bool)
    plan.add_scalar_arg(len(self._key_columns), ty.uint32)
    for asc in self._ascending:
        plan.add_scalar_arg(asc, ty.bool)

    # Need to broadcast the whole sample region
    samples = [sample.repartition(1) for sample in self._sample_columns]
    for column in samples:
        column.add_to_plan(plan, True, proj=None)
    for column in self._key_columns:
        column.add_to_plan(plan, True)
    plan.add_output(
        hist,
        Projection(hist_ipart),
        tag=PandasMappingTag.HISTOGRAM,
        flags=2,  # LEGION_NO_ACCESS_FLAG
    )

    plan.execute(self._launch_domain)
    del plan

    hist_ipart = rt.create_column_partition(
        hist_ispace, cspace, self._num_pieces)
    radix_ipart = rt.create_partition_by_image(
        self._input_columns[0].ispace,
        cspace,
        hist,
        hist_ipart,
        kind=legion.DISJOINT_COMPLETE_KIND,
        range=True,
    )

    # Change the primary partitions to shuffle the data
    input_columns = [
        column.all_to_ranges().clone() for column in self._input_columns
    ]
    for column in input_columns:
        column.set_primary_ipart(radix_ipart)
    input_columns = [column.all_to_offsets() for column in input_columns]

    return input_columns
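
# Illustrative sketch in pure Python (hypothetical names): the shuffle above
# is a sample sort. Every piece contributes key samples; each point task of
# BUILD_HISTOGRAM sorts the broadcast sample set, derives num_pieces - 1
# splitters, and records, for its own rows, one range per destination piece.
# create_partition_by_image then turns those ranges into the new primary
# partition. The model below assumes a single ascending key and at least
# num_pieces samples.
import bisect

def _model_build_histogram(local_keys, all_samples, num_pieces):
    samples = sorted(all_samples)
    # pick evenly spaced splitters out of the sorted sample set
    splitters = [
        samples[((i + 1) * len(samples)) // num_pieces - 1]
        for i in range(num_pieces - 1)
    ]
    buckets = [[] for _ in range(num_pieces)]
    for key in sorted(local_keys):
        buckets[bisect.bisect_left(splitters, key)].append(key)
    # bucket p holds the rows this piece would send to output piece p
    return buckets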
def _preload_libcudf(self):
    task = IndexTask(
        self.get_task_id(OpCode.LIBCUDF_INIT),
        Rect([self.num_pieces]),
        argmap=self.empty_argmap,
        mapper=self.mapper_id,
    )
    self.dispatch(task).wait()
def _finalize_nccl(self):
    task = IndexTask(
        self.get_task_id(OpCode.FINALIZE_NCCL),
        Rect([self.num_pieces]),
        argmap=self.empty_argmap,
        mapper=self.mapper_id,
    )
    nccl_comm = self._nccl_comm._future_map
    task.add_point_future(ArgumentMap(future_map=nccl_comm))
    self.dispatch(task).wait()
def _initialize_nccl(self):
    # Generate a unique NCCL id with a single task ...
    task = Task(
        self.get_task_id(OpCode.INIT_NCCL_ID),
        mapper=self.mapper_id,
    )
    self._nccl_id = self.dispatch(task)

    # ... then pass it to an index launch that creates one communicator
    # per piece. The fences order the collective initialization with
    # respect to surrounding tasks.
    task = IndexTask(
        self.get_task_id(OpCode.INIT_NCCL),
        Rect([self.num_pieces]),
        argmap=self.empty_argmap,
        mapper=self.mapper_id,
    )
    task.add_future(self._nccl_id)
    self.issue_fence()
    self._nccl_comm = self.dispatch(task).cast(ty.uint64)
    self.issue_fence()
def find_or_create_column_partition(self, cspace, num_columns):
    if cspace not in self._column_partitions:
        transform = Transform(2, 1)
        transform.trans[0, 0] = 1
        transform.trans[1, 0] = 0
        extent = Rect([1, num_columns])
        partitioner = PartitionByRestriction(transform, extent)
        part_id = self._next_column_partition_id
        self._next_column_partition_id = part_id + 1
        ipart = IndexPartition(
            self._legion_context,
            self._legion_runtime,
            self._ispace,
            cspace,
            partitioner,
            kind=legion.DISJOINT_COMPLETE_KIND,
            part_id=part_id,
        )
        self._column_partitions[cspace] = ipart
    return self._column_partitions[cspace]
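
# Illustrative sketch, not executed by the runtime: PartitionByRestriction
# assigns to color c the rectangle transform * c + extent. With the 2 x 1
# transform [[1], [0]] and the extent [(0, 0), (0, num_columns - 1)] built
# above, color c owns the 1 x num_columns rectangle starting at (c, 0).
def _model_restriction_subspace(color, transform_col, extent_lo, extent_hi):
    offset = [t * color for t in transform_col]
    lo = [o + l for o, l in zip(offset, extent_lo)]
    hi = [o + h for o, h in zip(offset, extent_hi)]
    return lo, hi

# e.g. _model_restriction_subspace(2, [1, 0], [0, 0], [0, 3])
# returns ([2, 0], [2, 3])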
def _hash_partition_cpu(self, columns, key_indices, needs_conversion):
    storage = self._runtime.create_storage(columns[0].ispace)
    out_columns = storage.create_isomorphic_columns(columns)

    # Keys that the partitioning task cannot hash directly are converted
    # to strings and appended as extra key columns
    _key_indices = list(key_indices)
    for idx in needs_conversion:
        _key_indices[key_indices.index(idx)] = len(columns)
        columns.append(columns[idx].astype(ty.string))
    key_indices = _key_indices

    num_pieces = columns[0].num_pieces
    launch_domain = columns[0].launch_domain
    cspace = columns[0].cspace

    hist_ispace = self._runtime.find_or_create_index_space(
        Rect([num_pieces, num_pieces]))
    hist_storage = self._runtime.create_storage(hist_ispace)
    hist = hist_storage.create_new_field(ty.range64)
    hist_ipart = self._runtime.create_row_partition(
        hist_ispace, cspace, num_pieces)

    plan = Map(self._runtime, OpCode.LOCAL_PARTITION)

    plan.add_output(
        hist,
        Projection(hist_ipart),
        tag=PandasMappingTag.HISTOGRAM,
        flags=2,  # LEGION_NO_ACCESS_FLAG
    )

    plan.add_scalar_arg(num_pieces, ty.uint32)
    plan.add_scalar_arg(len(key_indices), ty.uint32)
    for idx in key_indices:
        plan.add_scalar_arg(idx, ty.int32)
    plan.add_scalar_arg(len(columns), ty.uint32)
    for key in columns:
        key.add_to_plan(plan, True)
    plan.add_scalar_arg(len(out_columns), ty.uint32)
    for key in out_columns:
        key.add_to_plan_output_only(plan)

    plan.execute(launch_domain)
    del plan

    # Re-partition the histogram by destination and take the image of the
    # recorded ranges to obtain the shuffled primary partition
    hist_ipart = self._runtime.create_column_partition(
        hist_ispace, cspace, num_pieces)
    radix_ipart = self._runtime.create_partition_by_image(
        columns[0].ispace,
        cspace,
        hist,
        hist_ipart,
        kind=legion.DISJOINT_COMPLETE_KIND,
        range=True,
    )

    out_columns = [
        out_column.all_to_ranges().clone() for out_column in out_columns
    ]
    for out_column in out_columns:
        out_column.set_primary_ipart(radix_ipart)
    out_columns = [
        out_column.all_to_offsets() for out_column in out_columns
    ]

    return out_columns
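
# Illustrative sketch (hypothetical helper): each LOCAL_PARTITION point task
# above reorders its rows so that rows hashing to the same destination are
# contiguous and records one range per destination in its row of the
# histogram; the image of those ranges then yields the shuffled partition.
def _model_local_partition(rows, key_indices, num_pieces):
    buckets = [[] for _ in range(num_pieces)]
    for row in rows:
        key = tuple(row[i] for i in key_indices)
        buckets[hash(key) % num_pieces].append(row)
    reordered, ranges, offset = [], [], 0
    for bucket in buckets:
        ranges.append((offset, offset + len(bucket)))  # rows bound for piece p
        reordered.extend(bucket)
        offset += len(bucket)
    return reordered, ranges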
@property
def launch_domain(self):
    return Rect([self.num_pieces])
def read_parquet(path, columns, **kwargs):
    from legate.core import Rect

    from .runtime import _runtime as rt

    path = util.to_list_if_scalar(path)

    if len(path) == 1 and os.path.isdir(path[0]):
        from pyarrow.parquet import ParquetDataset

        ds = ParquetDataset(path)
        path = [piece.path for piece in ds.pieces]
    else:
        from pyarrow.parquet import ParquetFile

        ds = ParquetFile(path[0])
        if rt.debug:
            assert all(ParquetFile(p).schema == ds.schema for p in path)

    dedup_names = set()
    for name in ds.schema.names:
        if name in dedup_names:
            raise ValueError(
                "Duplicate column names in schema are not supported.")
        dedup_names.add(name)

    schema = ds.schema.to_arrow_schema()

    # Recover the index description from the pandas metadata, if any
    index_descs = []
    index_materialized = False
    if str.encode("pandas") in ds.metadata.metadata:
        import json

        pandas_metadata = json.loads(
            ds.metadata.metadata[str.encode("pandas")])
        index_descs = pandas_metadata["index_columns"]
        index_materialized = len(index_descs) > 0 and all(
            isinstance(desc, str) for desc in index_descs)

    if columns is None:
        column_names = schema.names
    elif index_materialized:
        column_names = columns + index_descs
    else:
        column_names = columns

    for name in column_names:
        if name not in dedup_names:
            raise ValueError(
                "Field named %s not found in the schema." % name)

    schema = [schema.field(name) for name in column_names]

    del columns

    # Create output columns, attaching offsets and character children
    # for string columns
    storage = rt.create_output_storage()
    offsets_storage = None
    columns = []
    for column_info in schema:
        dtype = ty.to_legate_dtype(column_info.type)
        column = storage.create_column(dtype)
        if ty.is_string_dtype(dtype):
            if offsets_storage is None:
                offsets_storage = rt.create_output_storage()
            offsets_column = offsets_storage.create_column(ty.int32,
                                                           nullable=False)
            chars_storage = rt.create_output_storage()
            char_column = chars_storage.create_column(ty.int8,
                                                      nullable=False)
            column.add_child(offsets_column)
            column.add_child(char_column)
            column = column.as_string_column()
        columns.append(column)

    plan = Map(rt, OpCode.READ_PARQUET)

    plan.add_scalar_arg(len(path), ty.uint32)
    for f in path:
        plan.add_scalar_arg(f, ty.string)
    plan.add_scalar_arg(len(column_names), ty.uint32)
    for name in column_names:
        plan.add_scalar_arg(name, ty.string)
    plan.add_scalar_arg(len(columns), ty.uint32)
    for column in columns:
        column.add_to_plan_output_only(plan)

    counts = plan.execute(Rect([rt.num_pieces]))

    storage = plan.promote_output_storage(storage)
    rt.register_external_weighted_partition(storage.default_ipart, counts)
    del plan

    size = counts.cast(ty.int64).sum()

    if index_materialized:
        # Split the materialized index columns from the value columns
        to_filter = set(index_descs)
        index_columns = []
        value_columns = []
        value_column_names = []
        for idx, name in enumerate(column_names):
            if name in to_filter:
                index_columns.append(columns[idx])
            else:
                value_columns.append(columns[idx])
                value_column_names.append(column_names[idx])

        sanitized_names = [
            None if name == f"__index_level_{level}__" else name
            for level, name in enumerate(index_descs)
        ]
        index = create_index_from_columns(
            index_columns, size, sanitized_names)
    else:
        value_columns = columns
        value_column_names = column_names
        if len(index_descs) > 0:
            # The index is described as a RangeIndex
            assert len(index_descs) == 1
            index_desc = index_descs[0]
            name = index_desc["name"]
            start = rt.create_future(index_desc["start"], ty.int64)
            stop = rt.create_future(index_desc["stop"], ty.int64)
            step = rt.create_future(index_desc["step"], ty.int64)
            index = create_range_index(
                storage, size, name, start, stop, step)
        else:
            index = create_range_index(storage, size)

    from pandas import Index

    return {
        "frame": Table(rt, index, value_columns),
        "columns": Index(value_column_names),
    }
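
# Standalone sketch using only pyarrow: the b"pandas" metadata blob consulted
# above describes the index. When the index is materialized, "index_columns"
# is a list of column names such as ["__index_level_0__"]; for a default
# RangeIndex, recent pyarrow versions instead store a single descriptor dict
# with "name", "start", "stop", and "step", which is what the
# non-materialized branch above reconstructs. `_inspect_pandas_metadata` is
# a hypothetical helper for illustration only.
def _inspect_pandas_metadata(df):
    import json

    import pyarrow as pa

    table = pa.Table.from_pandas(df)
    return json.loads(table.schema.metadata[b"pandas"])["index_columns"]

# e.g. for pandas.DataFrame({"a": [1, 2, 3]}) this returns something like
# [{"kind": "range", "name": None, "start": 0, "stop": 3, "step": 1}]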
def read_csv(
    paths,
    sep=None,
    usecols=None,
    dtypes=None,
    true_values=None,
    false_values=None,
    skiprows=0,
    skipfooter=0,
    nrows=None,
    na_values=None,
    skip_blank_lines=True,
    date_cols=False,
    compressions=None,
    quotechar='"',
    quoting=0,
    doublequote=True,
):
    from legate.core import Rect

    from .runtime import _runtime as rt

    storage = rt.create_output_storage()
    offsets_storage = None

    # Override the dtype for category columns, as they are not directly
    # handled by the CSV reader
    storage_dtypes = [
        ty.string if dtype == "category" else dtype for dtype in dtypes
    ]

    columns = [storage.create_column(dtype) for dtype in storage_dtypes]
    for column in columns:
        if ty.is_string_dtype(column.dtype):
            if offsets_storage is None:
                offsets_storage = rt.create_output_storage()
            offsets_column = offsets_storage.create_column(ty.int32,
                                                           nullable=False)
            chars_storage = rt.create_output_storage()
            char_column = chars_storage.create_column(ty.int8,
                                                      nullable=False)
            column.add_child(offsets_column)
            column.add_child(char_column)
    columns = [
        column.as_string_column()
        if ty.is_string_dtype(column.dtype) else column
        for column in columns
    ]

    # TODO: Since Arrow doesn't support in-flight decompression, we
    # decompress any compressed files before tossing them to the reader.
    to_remove = []
    if not rt.has_gpus:
        paths, compressions, to_remove = _uncompress_files(
            paths, compressions)

    plan = Map(rt, OpCode.READ_CSV)

    plan.add_scalar_arg(len(paths), ty.uint32)
    for path in paths:
        plan.add_scalar_arg(path, ty.string)
    plan.add_scalar_arg(len(compressions), ty.uint32)
    for compression in compressions:
        plan.add_scalar_arg(compression.value, ty.int32)
    plan.add_scalar_arg(sep, ty.string)
    plan.add_scalar_arg(skiprows, ty.int32)
    plan.add_scalar_arg(skipfooter, ty.int32)
    _may_add_to_plan(plan, nrows, ty.int32)
    plan.add_scalar_arg(quotechar, ty.string)
    plan.add_scalar_arg(doublequote, ty.bool)
    plan.add_scalar_arg(skip_blank_lines, ty.bool)
    _may_add_to_plan(plan, true_values, ty.string)
    _may_add_to_plan(plan, false_values, ty.string)
    _may_add_to_plan(plan, na_values, ty.string)
    plan.add_scalar_arg(len(columns), ty.uint32)
    for column in columns:
        column.add_to_plan_output_only(plan)
    plan.add_scalar_arg(len(date_cols), ty.uint32)
    for idx in date_cols:
        plan.add_scalar_arg(idx, ty.int32)

    counts = plan.execute(Rect([rt.num_pieces]))

    storage = plan.promote_output_storage(storage)
    rt.register_external_weighted_partition(storage.default_ipart, counts)
    del plan

    columns = [
        column.to_category_column() if dtype == "category" else column
        for column, dtype in zip(columns, dtypes)
    ]

    size = counts.cast(ty.int64).sum()

    index = create_range_index(storage, size)

    if len(to_remove) > 0:
        counts.wait()
        for path in to_remove:
            os.remove(path)

    return Table(rt, index, columns)
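
# Minimal sketch, under stated assumptions (gzip-only inputs and a
# compression enum whose "uncompressed" member is passed in), of what a
# helper like _uncompress_files has to return for the CPU path above:
# rewritten paths, matching compression codes, and the temporary files to
# delete once `counts` is ready. This is not the actual implementation.
import gzip
import os
import shutil
import tempfile

def _sketch_uncompress_files(paths, compressions, uncompressed):
    new_paths, new_compressions, to_remove = [], [], []
    for path, compression in zip(paths, compressions):
        if compression == uncompressed:
            new_paths.append(path)
            new_compressions.append(compression)
            continue
        fd, tmp = tempfile.mkstemp(suffix=".csv")
        os.close(fd)
        with gzip.open(path, "rb") as src, open(tmp, "wb") as dst:
            shutil.copyfileobj(src, dst)  # decompress to a temporary file
        new_paths.append(tmp)
        new_compressions.append(uncompressed)
        to_remove.append(tmp)
    return new_paths, new_compressions, to_remove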