Example #1
    def set_primary_ipart(self, ipart, recurse=True):
        self.data.set_primary_ipart(ipart)

        if self.nullable:
            self.bitmask.set_primary_ipart(ipart)

        if recurse:
            if ty.is_string_dtype(self.dtype):
                ranges = self.children[0]
                values = self.children[1]

                ranges.set_primary_ipart(ipart)

                assert ty.is_range_dtype(ranges.data.dtype)
                values.set_primary_ipart(
                    self.runtime.create_partition_by_image(
                        values.ispace,
                        self.cspace,
                        ranges.data,
                        ipart,
                        kind=legion.DISJOINT_COMPLETE_KIND,
                        range=True,
                    ))
            else:
                for child in self.children:
                    child.set_primary_ipart(ipart)
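
For the string case above, the character child cannot simply reuse the new primary partition: its pieces must follow wherever the ranges point. The code therefore derives the values partition as the image of ipart through the ranges column, declared disjoint and complete.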
Example #2
    def create_future_from_scalar(self, scalar):
        if scalar.valid:
            if ty.is_string_dtype(scalar.dtype):
                # Encode first so that size counts bytes, not characters
                encoded = scalar._value.encode("utf-8")
                size = len(encoded)
                buf = struct.pack(
                    f"iiQ{size}s",
                    scalar.valid,
                    ty.encode_dtype(scalar.dtype),
                    size,
                    encoded,
                )
            else:
                fmt = ty.to_format_string(scalar.dtype.storage_dtype)
                buf = struct.pack(
                    "ii" + fmt,
                    scalar.valid,
                    ty.encode_dtype(scalar.dtype),
                    scalar._value,
                )
        else:
            buf = struct.pack("iiQ", scalar.valid,
                              ty.encode_dtype(scalar.dtype), 0)

        fut = Future()
        fut.set_value(self._runtime, buf, len(buf))
        return PandasFuture(self, fut, scalar.dtype, True)
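
For reference, a minimal standalone sketch of the string layout used above, written with only the standard struct module; the dtype code here is hypothetical:

    import struct

    value = "hello".encode("utf-8")
    dtype_code = 7  # hypothetical code for the string dtype
    buf = struct.pack(f"iiQ{len(value)}s", 1, dtype_code, len(value), value)

    # The fixed 16-byte header, then the UTF-8 payload (cf. Example #6).
    valid, code, size = struct.unpack("iiQ", buf[:16])
    assert bool(valid) and buf[16:16 + size].decode("utf-8") == "hello"
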
Example #3
    def all_to_offsets(self):
        children = [child.all_to_offsets() for child in self.children]
        if ty.is_string_dtype(self.dtype):
            children[0] = children[0].to_offsets()
        new_self = self.clone(shallow=True)
        new_self.set_primary_ipart(self.primary_ipart, recurse=False)
        new_self.children = children
        return new_self
Example #4
    def binary_op(self, op, other):
        reverse = False
        if op in _REVERSED_OPS:
            op = op[1:]
            reverse = True

        # Perform binary operation
        rhs1 = self._columns
        if is_scalar(other):
            other = self._runtime.create_scalar(other, ty.infer_dtype(other))
            rhs2 = [other] * len(rhs1)
        else:
            rhs2 = other._columns

        results = []
        for rh1, rh2 in zip(rhs1, rhs2):
            # If the right operand is an integer scalar, convert it to the
            # left operand's dtype
            if isinstance(rh2, Scalar):
                if ty.is_integer_dtype(rh2.dtype):
                    rh2 = rh2.astype(rh1.dtype)
                elif ty.is_categorical_dtype(rh1.dtype):
                    rh2 = rh1.dtype.encode(rh2, unwrap=False, can_fail=True)
                else:
                    common_dtype = ty.find_common_dtype(rh1.dtype, rh2.dtype)
                    rh1 = rh1.astype(common_dtype)
                    rh2 = rh2.astype(common_dtype)

            elif not (
                ty.is_categorical_dtype(rh1.dtype)
                or ty.is_categorical_dtype(rh2.dtype)
            ):
                common_dtype = ty.find_common_dtype(rh1.dtype, rh2.dtype)
                rh1 = rh1.astype(common_dtype)
                rh2 = rh2.astype(common_dtype)

            lh_dtype = ty.get_binop_result_type(op, rh1.dtype, rh2.dtype)

            if ty.is_string_dtype(rh1.dtype) and op in (
                "add",
                "mul",
            ):
                raise err._unsupported_error(
                    f"unsupported operand type(s) for {op}: "
                    f"'{rh1.dtype}' and '{rh2.dtype}'"
                )

            if reverse:
                rh1, rh2 = rh2, rh1

            swapped = False
            if isinstance(rh1, Scalar):
                rh1, rh2 = rh2, rh1
                swapped = True

            results.append(rh1.binary_op(op, rh2, lh_dtype, swapped=swapped))

        return Table(self._runtime, self._index, results)
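
The reversed-operator handling relies on the "r" prefix convention from Python's reflected operators; a minimal sketch, assuming _REVERSED_OPS contains names such as "radd" (the real set lives alongside binary_op):

    # Hypothetical contents of _REVERSED_OPS, for illustration only.
    _REVERSED_OPS = {"radd", "rsub", "rmul", "rtruediv"}

    op, reverse = "radd", False
    if op in _REVERSED_OPS:
        op = op[1:]     # "radd" -> "add"
        reverse = True  # operands are swapped before dispatch
    assert (op, reverse) == ("add", True)
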
Example #5
def _create_column(storage, dtype, ipart=None, nullable=False):
    column = storage.create_column(dtype, ipart=ipart, nullable=nullable)
    if ty.is_string_dtype(dtype):
        offset_storage = storage._runtime.create_output_storage()
        char_storage = storage._runtime.create_output_storage()
        column.add_child(offset_storage.create_column(ty.int32,
                                                      nullable=False))
        column.add_child(char_storage.create_column(ty.int8, nullable=False))
        column = column.as_string_column()
    return column
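
Several examples in this set attach the same two children to a string column: an int32 offsets column and an int8 character column, i.e. the usual Arrow-style string layout. A minimal standalone sketch of that encoding, in plain Python with no legate types:

    def to_offsets_and_chars(strings):
        """Encode a list of strings as (offsets, chars), Arrow-style."""
        offsets, chars = [0], bytearray()
        for s in strings:
            chars += s.encode("utf-8")
            offsets.append(len(chars))
        return offsets, bytes(chars)

    offsets, chars = to_offsets_and_chars(["ab", "", "cde"])
    assert offsets == [0, 2, 2, 5] and chars == b"abcde"
    # String i spans chars[offsets[i]:offsets[i + 1]].
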
Example #6
    def get_scalar(self):
        buf = self._future.get_buffer()
        # Fixed 8-byte header: validity flag and dtype code
        valid, type_code = unpack("ii", buf[:8])
        dtype = ty.code_to_dtype(type_code)
        assert not ty.is_string_dtype(dtype)
        assert dtype == self.dtype
        (value,) = unpack(ty.to_format_string(dtype),
                          buf[8:8 + dtype.itemsize])
        return Scalar(self._runtime, dtype, bool(valid), value)
Example #7
    def create_future(self, value, dtype=None):
        if ty.is_categorical_dtype(dtype):
            dtype = ty.string
        if ty.is_string_dtype(dtype):
            return self.create_future_from_string(value)
        pandas_dtype = (dtype.storage_dtype.to_pandas()
                        if dtype is not None else None)
        result = Future()
        value = numpy.array(value, dtype=pandas_dtype)
        if ty.is_timestamp_dtype(dtype):
            value = value.view(dtype.storage_dtype.to_pandas())
        result.set_value(self._runtime, value.data, value.nbytes)
        return PandasFuture(self, result, dtype, ready=True)
Example #8
    def _create_output_column(self, storage, op, input):
        dtype = ty.get_reduction_result_type(op, input.dtype)
        nullable = self._nullable_output(input, op)
        output = storage.create_column(dtype, nullable=nullable)
        if ty.is_string_dtype(dtype):
            offset_storage = self._runtime.create_output_storage()
            char_storage = self._runtime.create_output_storage()
            output.add_child(
                offset_storage.create_column(ty.int32, nullable=False))
            output.add_child(
                char_storage.create_column(ty.int8, nullable=False))
            output = output.as_string_column()
        return output
Example #9
    def from_pandas(cls, runtime, column, pandas_index):
        volume = runtime.create_future(len(pandas_index), ty.int64)
        if isinstance(pandas_index, pandas.CategoricalIndex):
            sorted_categories = pandas_index.categories.sort_values()
            column.data.from_numpy(
                pandas.CategoricalIndex(
                    list(pandas_index),
                    categories=sorted_categories).values.codes.astype(
                        np.int64))
        elif not ty.is_string_dtype(column.dtype):
            column.data.from_numpy(pandas_index.values)
        return cls(column, volume, pandas_index.name)
Example #10
    def astype(self, result_dtype, **kwargs):
        if result_dtype == self.dtype:
            return self
        elif ty.is_timestamp_dtype(result_dtype):
            if self.dtype != ty.string:
                raise err._unsupported_error(
                    f"astype to {result_dtype} is not yet supported. "
                    "please use to_datetime instead")
            else:
                fmt = "%Y-%m-%d %H:%M:%S"
                warnings.warn(
                    f"astype from {self.dtype} to {result_dtype} currently "
                    f"uses a fixed format string '{fmt}' to parse strings. "
                    "please use to_datetime instead if you want the strings "
                    "to be parsed differently.")
                return self.to_datetime(fmt)

        if self.dtype == ty.ts_ns:
            if result_dtype not in (ty.int64, ty.string):
                raise TypeError("cannot astype a datetimelike from "
                                f"datetime64[ns] to {result_dtype}")

            if result_dtype == ty.int64:
                return self.cast_unsafe(result_dtype)

        runtime = self.runtime

        result = self.storage.create_column(result_dtype,
                                            ipart=self.primary_ipart,
                                            nullable=False)
        if ty.is_string_dtype(result_dtype):
            offsets_storage = runtime.create_output_storage()
            chars_storage = runtime.create_output_storage()
            result.add_child(
                offsets_storage.create_column(ty.int32, nullable=False))
            result.add_child(
                chars_storage.create_column(ty.int8, nullable=False))
            result = result.as_string_column()

        plan = Map(runtime, OpCode.ASTYPE)

        result.add_to_plan_output_only(plan)
        self.add_to_plan(plan, True)

        plan.execute(result.launch_domain)

        result.set_bitmask(self.bitmask)

        return result
Example #11
    def create_scalar(self, value, dtype):
        if ty.is_categorical_dtype(dtype):
            dtype = ty.string

        # Sanitize the value to make it fit to the dtype
        if value is not None:
            if isinstance(value, numpy.datetime64):
                value = value.view("int64")

            if ty.is_string_dtype(dtype):
                value = str(value)
            elif ty.is_integer_dtype(dtype):
                value = int(value)

        return Scalar(self, dtype, value is not None,
                      0 if value is None else value)
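
The numpy.datetime64 branch above stores timestamps by reinterpreting their underlying 8-byte integer; a quick illustration with plain NumPy:

    import numpy

    # view("int64") reinterprets the timestamp's bits as its raw tick
    # count, with no conversion.
    t = numpy.datetime64("1970-01-01T00:00:01", "s")
    assert t.view("int64") == 1
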
Example #12
    def create_similar_column(self, to_imitate, nullable=None):
        if not to_imitate.partitioned:
            return to_imitate
        result = self.create_column(
            to_imitate.dtype,
            to_imitate.nullable if nullable is None else nullable,
            type(to_imitate),
        )
        children_to_copy = to_imitate.children
        if ty.is_string_dtype(to_imitate.dtype):
            storages = [
                self._runtime.create_output_storage()
                for _ in children_to_copy
            ]
        else:
            storages = [self] * len(children_to_copy)
        result.children = [
            storage.create_similar_column(child, nullable)
            for storage, child in zip(storages, children_to_copy)
        ]
        return result
Example #13
    def copy_if_else(self, cond, other=None, negate=False):
        assert cond.dtype == ty.bool

        has_other = other is not None
        other_is_scalar = not isinstance(other, Column)

        if has_other:
            if other_is_scalar:
                other = self.runtime.create_scalar(other, self.dtype)
            elif other.dtype != self.dtype:
                other = other.astype(self.dtype)

        runtime = self.runtime

        plan = Map(runtime, OpCode.COPY_IF_ELSE)

        # The result is nullable if the input is, if unmatched rows have no
        # replacement value, or if the replacement column itself is nullable
        nullable = (self.nullable or not has_other
                    or (not other_is_scalar and other.nullable))
        lhs = self.storage.create_column(self.dtype, self.primary_ipart,
                                         nullable)
        if ty.is_string_dtype(self.dtype):
            offsets_storage = runtime.create_output_storage()
            chars_storage = runtime.create_output_storage()
            lhs.add_child(
                offsets_storage.create_column(ty.int32, nullable=False))
            lhs.add_child(chars_storage.create_column(ty.int8, nullable=False))
            lhs = lhs.as_string_column()

        lhs.add_to_plan_output_only(plan)
        self.add_to_plan(plan, True)
        cond.add_to_plan(plan, True)
        plan.add_scalar_arg(negate, ty.bool)
        plan.add_scalar_arg(has_other, ty.bool)
        if has_other:
            plan.add_scalar_arg(other_is_scalar, ty.bool)
            other.add_to_plan(plan, True)
        plan.execute(self.launch_domain)

        return lhs
Example #14
    def __init__(self, series):
        super(StringMethods, self).__init__(series)
        assert ty.is_string_dtype(self._column.dtype)
Example #15
def read_parquet(path, columns, **kwargs):
    from legate.core import Rect

    from .runtime import _runtime as rt

    path = util.to_list_if_scalar(path)

    if len(path) == 1 and os.path.isdir(path[0]):
        from pyarrow.parquet import ParquetDataset

        ds = ParquetDataset(path)
        path = [piece.path for piece in ds.pieces]
    else:
        from pyarrow.parquet import ParquetFile

        ds = ParquetFile(path[0])
        if rt.debug:
            assert all(ParquetFile(p).schema == ds.schema for p in path)

    dedup_names = set()
    for name in ds.schema.names:
        if name in dedup_names:
            raise ValueError(
                "Duplicate column names in schema are not supported.")
        dedup_names.add(name)

    schema = ds.schema.to_arrow_schema()
    index_descs = []
    index_materialized = False
    if b"pandas" in ds.metadata.metadata:
        import json

        pandas_metadata = json.loads(ds.metadata.metadata[b"pandas"])
        index_descs = pandas_metadata["index_columns"]
        index_materialized = len(index_descs) > 0 and all(
            isinstance(desc, str) for desc in index_descs)

    if columns is None:
        column_names = schema.names
    elif index_materialized:
        column_names = columns + index_descs
    else:
        column_names = columns

    for name in column_names:
        if name not in dedup_names:
            raise ValueError("Field named %s not found in the schema." % name)
    schema = [schema.field(name) for name in column_names]
    del columns

    storage = rt.create_output_storage()
    offsets_storage = None

    columns = []
    for column_info in schema:
        dtype = ty.to_legate_dtype(column_info.type)
        column = storage.create_column(dtype)
        if ty.is_string_dtype(dtype):
            if offsets_storage is None:
                offsets_storage = rt.create_output_storage()
            offsets_column = offsets_storage.create_column(ty.int32,
                                                           nullable=False)
            chars_storage = rt.create_output_storage()
            char_column = chars_storage.create_column(ty.int8, nullable=False)
            column.add_child(offsets_column)
            column.add_child(char_column)
            column = column.as_string_column()
        columns.append(column)

    plan = Map(rt, OpCode.READ_PARQUET)
    plan.add_scalar_arg(len(path), ty.uint32)
    for f in path:
        plan.add_scalar_arg(f, ty.string)
    plan.add_scalar_arg(len(column_names), ty.uint32)
    for name in column_names:
        plan.add_scalar_arg(name, ty.string)
    plan.add_scalar_arg(len(columns), ty.uint32)
    for column in columns:
        column.add_to_plan_output_only(plan)
    counts = plan.execute(Rect([rt.num_pieces]))
    storage = plan.promote_output_storage(storage)
    rt.register_external_weighted_partition(storage.default_ipart, counts)
    del plan

    size = counts.cast(ty.int64).sum()

    if index_materialized:
        to_filter = set(index_descs)

        index_columns = []
        value_columns = []
        value_column_names = []
        for idx, name in enumerate(column_names):
            if name in to_filter:
                index_columns.append(columns[idx])
            else:
                value_columns.append(columns[idx])
                value_column_names.append(column_names[idx])

        sanitized_names = [
            None if name == f"__index_level_{level}__" else name
            for level, name in enumerate(index_descs)
        ]
        index = create_index_from_columns(index_columns, size, sanitized_names)
    else:
        value_columns = columns
        value_column_names = column_names
        if len(index_descs) > 0:
            assert len(index_descs) == 1
            index_desc = index_descs[0]
            name = index_desc["name"]
            start = rt.create_future(index_desc["start"], ty.int64)
            stop = rt.create_future(index_desc["stop"], ty.int64)
            step = rt.create_future(index_desc["step"], ty.int64)
            index = create_range_index(storage, size, name, start, stop, step)
        else:
            index = create_range_index(storage, size)

    from pandas import Index

    return {
        "frame": Table(rt, index, value_columns),
        "columns": Index(value_column_names),
    }
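
A hypothetical call, assuming a directory of parquet files written with a pandas index (the path and column names are illustrative):

    result = read_parquet("data/parts/", columns=["price", "volume"])
    frame, names = result["frame"], result["columns"]  # Table, pandas Index
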
Example #16
def read_csv(
    paths,
    sep=None,
    usecols=None,
    dtypes=None,
    true_values=None,
    false_values=None,
    skiprows=0,
    skipfooter=0,
    nrows=None,
    na_values=None,
    skip_blank_lines=True,
    date_cols=(),  # indices of columns to parse as dates
    compressions=None,
    quotechar='"',
    quoting=0,
    doublequote=True,
):
    from legate.core import Rect

    from .runtime import _runtime as rt

    storage = rt.create_output_storage()
    offsets_storage = None

    # Override the dtype for category columns, as they are not directly
    # handled by the CSV reader
    storage_dtypes = [
        ty.string if dtype == "category" else dtype for dtype in dtypes
    ]
    columns = [storage.create_column(dtype) for dtype in storage_dtypes]
    for column in columns:
        if ty.is_string_dtype(column.dtype):
            if offsets_storage is None:
                offsets_storage = rt.create_output_storage()
            offsets_column = offsets_storage.create_column(ty.int32,
                                                           nullable=False)
            chars_storage = rt.create_output_storage()
            char_column = chars_storage.create_column(ty.int8, nullable=False)
            column.add_child(offsets_column)
            column.add_child(char_column)
    columns = [
        column.as_string_column()
        if ty.is_string_dtype(column.dtype) else column for column in columns
    ]

    # TODO: Since Arrow doesn't support in-flight decompression, we decompress
    #       any compressed files before tossing them to the reader.
    to_remove = []
    if not rt.has_gpus:
        paths, compressions, to_remove = _uncompress_files(paths, compressions)

    plan = Map(rt, OpCode.READ_CSV)
    plan.add_scalar_arg(len(paths), ty.uint32)
    for path in paths:
        plan.add_scalar_arg(path, ty.string)
    plan.add_scalar_arg(len(compressions), ty.uint32)
    for compression in compressions:
        plan.add_scalar_arg(compression.value, ty.int32)
    plan.add_scalar_arg(sep, ty.string)
    plan.add_scalar_arg(skiprows, ty.int32)
    plan.add_scalar_arg(skipfooter, ty.int32)
    _may_add_to_plan(plan, nrows, ty.int32)
    plan.add_scalar_arg(quotechar, ty.string)
    plan.add_scalar_arg(doublequote, ty.bool)
    plan.add_scalar_arg(skip_blank_lines, ty.bool)
    _may_add_to_plan(plan, true_values, ty.string)
    _may_add_to_plan(plan, false_values, ty.string)
    _may_add_to_plan(plan, na_values, ty.string)
    plan.add_scalar_arg(len(columns), ty.uint32)
    for column in columns:
        column.add_to_plan_output_only(plan)
    plan.add_scalar_arg(len(date_cols), ty.uint32)
    for idx in date_cols:
        plan.add_scalar_arg(idx, ty.int32)
    counts = plan.execute(Rect([rt.num_pieces]))
    storage = plan.promote_output_storage(storage)
    rt.register_external_weighted_partition(storage.default_ipart, counts)
    del plan

    columns = [
        column.to_category_column() if dtype == "category" else column
        for column, dtype in zip(columns, dtypes)
    ]

    size = counts.cast(ty.int64).sum()
    index = create_range_index(storage, size)

    if len(to_remove) > 0:
        counts.wait()
        for path in to_remove:
            os.remove(path)

    return Table(rt, index, columns)
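
A hypothetical invocation; the file name and dtypes are illustrative, and date_cols takes the indices of columns to parse as dates:

    table = read_csv(
        ["trades.csv"],
        sep=",",
        dtypes=[ty.int64, ty.float64, "category"],
        compressions=[],
        date_cols=[],
    )
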
Example #17
    def to_pandas(self, schema_only=False):
        if ty.is_string_dtype(self.dtype):
            return self.as_string_column().to_pandas(schema_only)
        else:
            raise ValueError("Unsupported dtype %s" % self.dtype)