Example #1
File: data.py Project: crytaljin/modin
    def from_arrow(cls, at):
        (
            new_frame,
            new_lengths,
            new_widths,
            unsupported_cols,
        ) = cls._frame_mgr_cls.from_arrow(at, return_dims=True)

        new_columns = pd.Index(data=at.column_names, dtype="O")
        new_index = pd.RangeIndex(at.num_rows)
        new_dtypes = pd.Series(
            [cls._arrow_type_to_dtype(col.type) for col in at.columns],
            index=new_columns,
        )

        if len(unsupported_cols) > 0:
            ErrorMessage.single_warning(
                f"Frame contains columns with unsupported data types: {unsupported_cols}. "
                "All operations with this frame will default to pandas!")

        return cls(
            partitions=new_frame,
            index=new_index,
            columns=new_columns,
            row_lengths=new_lengths,
            column_widths=new_widths,
            dtypes=new_dtypes,
            has_unsupported_data=len(unsupported_cols) > 0,
        )
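
For orientation, below is a minimal, self-contained sketch of the metadata derivation performed at the top of this classmethod, using only public pyarrow/pandas APIs. The Modin-internal partitioning step (`cls._frame_mgr_cls.from_arrow`) is omitted, and `to_pandas_dtype()` is used as a stand-in for the internal `cls._arrow_type_to_dtype` helper, which is an assumption about that helper's intent.

import pandas as pd
import pyarrow as pa

at = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})

new_columns = pd.Index(data=at.column_names, dtype="O")  # Index(['a', 'b'], dtype='object')
new_index = pd.RangeIndex(at.num_rows)                   # RangeIndex(start=0, stop=3, step=1)
new_dtypes = pd.Series(
    # to_pandas_dtype() is a public stand-in for cls._arrow_type_to_dtype (assumption)
    [col.type.to_pandas_dtype() for col in at.columns],
    index=new_columns,
)
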
Example #2
    def check_parameters_support(
        cls,
        filepath_or_buffer,
        read_kwargs: dict,
        skiprows_md: Union[Sequence, callable, int],
        header_size: int,
    ) -> bool:
        """
        Check support of only the general parameters of the `read_*` function.

        Parameters
        ----------
        filepath_or_buffer : str, path object or file-like object
            `filepath_or_buffer` parameter of `read_*` function.
        read_kwargs : dict
            Parameters of `read_*` function.
        skiprows_md : int, array or callable
            `skiprows` parameter modified for easier handling by Modin.
        header_size : int
            Number of rows that are used by the header.

        Returns
        -------
        bool
            Whether the passed parameters are supported.
        """
        skiprows = read_kwargs.get("skiprows")
        if isinstance(filepath_or_buffer, str):
            if not cls.file_exists(filepath_or_buffer):
                return False
        elif not cls.pathlib_or_pypath(filepath_or_buffer):
            return False

        if read_kwargs["chunksize"] is not None:
            return False

        skiprows_supported = True
        if is_list_like(skiprows_md) and skiprows_md[0] < header_size:
            skiprows_supported = False
        elif callable(skiprows):
            # check if `skiprows` callable gives True for any of header indices
            is_intersection = any(
                cls._get_skip_mask(pandas.RangeIndex(header_size), skiprows))
            if is_intersection:
                skiprows_supported = False

        if not skiprows_supported:
            ErrorMessage.single_warning(
                "Values of the `header` and `skiprows` parameters intersect. "
                "This case is unsupported by Modin, so the pandas implementation will be used."
            )
            return False

        return True
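
A standalone sketch of the header/skiprows intersection check from the code above. The plain generator expression stands in for the internal `cls._get_skip_mask` helper (an assumption about what that helper evaluates); everything else is public pandas API.

import pandas

header_size = 2                       # e.g. a two-row header
skiprows = lambda idx: idx in (0, 5)  # user-supplied callable passed as `skiprows`

# True if `skiprows` would drop any header row, in which case this check
# reports the parameters as unsupported and the read falls back to pandas.
is_intersection = any(skiprows(i) for i in pandas.RangeIndex(header_size))
print(is_intersection)  # True: row 0 is both a header row and a skipped row
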
Example #3
File: data.py Project: crytaljin/modin
    def from_pandas(cls, df):
        new_index = df.index
        new_columns = df.columns
        # If there is a non-trivial index, we put it into columns.
        # That's what we usually have for arrow tables and execution
        # results. An unnamed index is renamed to __index__. Also, all
        # columns get an 'F_' prefix to handle names unsupported in
        # OmniSci.
        if cls._is_trivial_index(df.index):
            index_cols = None
        else:
            orig_index_names = df.index.names
            orig_df = df

            index_cols = [
                f"__index__{i}_{'__None__' if n is None else n}"
                for i, n in enumerate(df.index.names)
            ]
            df.index.names = index_cols
            df = df.reset_index()

            orig_df.index.names = orig_index_names
        new_dtypes = df.dtypes
        df = df.add_prefix("F_")

        (
            new_parts,
            new_lengths,
            new_widths,
            unsupported_cols,
        ) = cls._frame_mgr_cls.from_pandas(df, True)

        if len(unsupported_cols) > 0:
            ErrorMessage.single_warning(
                f"Frame contains columns with unsupported data types: {unsupported_cols}. "
                "All operations with this frame will default to pandas!")

        return cls(
            new_parts,
            new_index,
            new_columns,
            new_lengths,
            new_widths,
            dtypes=new_dtypes,
            index_cols=index_cols,
            has_unsupported_data=len(unsupported_cols) > 0,
        )
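
The index handling above can be reproduced in isolation with plain pandas. This sketch shows a non-trivial (here unnamed) index being renamed to the `__index__...` form, moved into regular columns, and the whole frame receiving the 'F_' prefix; the save/restore of the original index names and the Modin partitioning call are left out.

import pandas as pd

df = pd.DataFrame({"x": [10, 20]}, index=pd.Index(["a", "b"]))  # unnamed, non-trivial index

index_cols = [
    f"__index__{i}_{'__None__' if n is None else n}"
    for i, n in enumerate(df.index.names)
]                                      # ['__index__0___None__']
df.index.names = index_cols
df = df.reset_index()                  # the index becomes a regular column
df = df.add_prefix("F_")               # prefix handles names unsupported in OmniSci

print(df.columns.tolist())             # ['F___index__0___None__', 'F_x']
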
Example #4
    def cast_to_compatible_types(table):
        """
        Cast PyArrow table to be fully compatible with OmniSci.

        Parameters
        ----------
        table : pyarrow.Table
            Source table.

        Returns
        -------
        pyarrow.Table
            Table with types fully compatible with OmniSci.
        """
        schema = table.schema
        new_schema = schema
        need_cast = False
        uint_to_int_cast = False
        new_cols = {}
        uint_to_int_map = {
            pa.uint8(): pa.int16(),
            pa.uint16(): pa.int32(),
            pa.uint32(): pa.int64(),
            pa.uint64(): pa.int64(),  # May cause overflow
        }
        for i, field in enumerate(schema):
            # Currently OmniSci doesn't support Arrow table import with
            # dictionary columns. Here we cast dictionaries until support
            # is in place.
            # https://github.com/modin-project/modin/issues/1738
            if pa.types.is_dictionary(field.type):
                # Conversion for dictionary of null type to string is not supported
                # in Arrow. Build new column for this case for now.
                if pa.types.is_null(field.type.value_type):
                    mask = np.full(table.num_rows, True, dtype=bool)
                    new_col_data = np.empty(table.num_rows, dtype=str)
                    new_col = pa.array(new_col_data, pa.string(), mask)
                    new_cols[i] = new_col
                else:
                    need_cast = True
                new_field = pa.field(field.name, pa.string(), field.nullable,
                                     field.metadata)
                new_schema = new_schema.set(i, new_field)
            # OmniSci doesn't support importing Arrow's date type:
            # https://github.com/omnisci/omniscidb/issues/678
            elif pa.types.is_date(field.type):
                # Arrow's date is the number of days since the UNIX-epoch, so we can convert it
                # to a timestamp[s] (number of seconds since the UNIX-epoch) without losing precision
                new_field = pa.field(field.name, pa.timestamp("s"),
                                     field.nullable, field.metadata)
                new_schema = new_schema.set(i, new_field)
                need_cast = True
            # OmniSci doesn't support unsigned types
            elif pa.types.is_unsigned_integer(field.type):
                new_field = pa.field(
                    field.name,
                    uint_to_int_map[field.type],
                    field.nullable,
                    field.metadata,
                )
                new_schema = new_schema.set(i, new_field)
                need_cast = True
                uint_to_int_cast = True

        # Such a cast may affect the data, so we have to issue a warning about it
        if uint_to_int_cast:
            ErrorMessage.single_warning(
                "OmniSci does not support unsigned integer types; such columns will be cast to a signed equivalent."
            )

        for i, col in new_cols.items():
            table = table.set_column(i, new_schema[i], col)

        if need_cast:
            try:
                table = table.cast(new_schema)
            except pa.lib.ArrowInvalid as e:
                raise (
                    OverflowError if uint_to_int_cast else RuntimeError
                )(
                    "An error occurred while trying to convert dtypes unsupported by OmniSci "
                    f"to supported ones; the schema to cast was:\n{new_schema}."
                ) from e

        return table
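
A minimal sketch of the rewrite-schema-then-cast technique used above, reduced to the unsigned-integer branch and using only public pyarrow APIs; the dictionary and date branches and the Modin warning machinery are omitted.

import pyarrow as pa

table = pa.table({"u": pa.array([1, 2, 3], type=pa.uint8())})

uint_to_int_map = {
    pa.uint8(): pa.int16(),
    pa.uint16(): pa.int32(),
    pa.uint32(): pa.int64(),
    pa.uint64(): pa.int64(),  # may overflow
}

new_schema = table.schema
for i, field in enumerate(table.schema):
    if pa.types.is_unsigned_integer(field.type):
        new_schema = new_schema.set(
            i,
            pa.field(field.name, uint_to_int_map[field.type], field.nullable, field.metadata),
        )

table = table.cast(new_schema)
print(table.schema)  # u: int16
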
Example #5
File: utils.py Project: timgates42/modin
def initialize_ray(
    override_is_cluster=False,
    override_redis_address: str = None,
    override_redis_password: str = None,
):
    """
    Initialize Ray based on parameters, environment variables and internal defaults.

    Parameters
    ----------
    override_is_cluster : bool, optional
        Whether to override the detection of Modin being run in a cluster
        and always assume this runs on the cluster head node.
        This also overrides Ray worker detection and always runs the function,
        not only from the main thread.
        If not specified, the $MODIN_RAY_CLUSTER env variable is used.
    override_redis_address : str, optional
        What Redis address to connect to when running in a Ray cluster.
        If not specified, $MODIN_REDIS_ADDRESS is used.
    override_redis_password : str, optional
        What password to use when connecting to Redis.
        If not specified, a new random one is generated.
    """
    import ray

    if threading.current_thread().name == "MainThread" or override_is_cluster:
        import secrets

        cluster = override_is_cluster or IsRayCluster.get()
        redis_address = override_redis_address or RayRedisAddress.get()
        redis_password = override_redis_password or secrets.token_hex(32)

        if cluster:
            # We only start ray in a cluster setting for the head node.
            ray.init(
                address=redis_address or "auto",
                include_dashboard=False,
                ignore_reinit_error=True,
                _redis_password=redis_password,
                logging_level=100,
            )
        else:
            object_store_memory = Memory.get()
            plasma_directory = RayPlasmaDir.get()
            if IsOutOfCore.get():
                if plasma_directory is None:
                    from tempfile import gettempdir

                    plasma_directory = gettempdir()
                # We may have already set the memory from the environment variable;
                # we don't want to overwrite that value if we have.
                if object_store_memory is None:
                    # Round down to the nearest Gigabyte.
                    mem_bytes = ray.utils.get_system_memory() // 10**9 * 10**9
                    # Default to 8x memory for out of core
                    object_store_memory = 8 * mem_bytes
            # In case anything failed above, we can still improve the memory for Modin.
            if object_store_memory is None:
                # Round down to the nearest Gigabyte.
                object_store_memory = int(
                    0.6 * ray.utils.get_system_memory() // 10**9 * 10**9)
                # If the memory pool is smaller than 2GB, just use the default in ray.
                if object_store_memory == 0:
                    object_store_memory = None
            else:
                object_store_memory = int(object_store_memory)
            ray.init(
                num_cpus=CpuCount.get(),
                include_dashboard=False,
                ignore_reinit_error=True,
                _plasma_directory=plasma_directory,
                object_store_memory=object_store_memory,
                address=redis_address,
                _redis_password=redis_password,
                logging_level=100,
                _memory=object_store_memory,
                _lru_evict=True,
            )

            global_node = ray.worker._global_node
            # Check only for head node
            if global_node.head:
                import psutil
                from modin.error_message import ErrorMessage

                ray_session_dir = os.path.dirname(global_node._session_dir)
                ray_free_space_GB = psutil.disk_usage(
                    ray_session_dir).free // 10**9
                ErrorMessage.single_warning(
                    f"Modin Ray engine was started with {ray_free_space_GB} GB of free space available. "
                    "If this is not enough for your application, please set the environment variable "
                    "MODIN_ON_RAY_PLASMA_DIR=/directory/without/space/limiting"
                )

        _move_stdlib_ahead_of_site_packages()
        ray.worker.global_worker.run_function_on_all_workers(
            _move_stdlib_ahead_of_site_packages)

        ray.worker.global_worker.run_function_on_all_workers(_import_pandas)
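
As a standalone illustration of the object-store sizing arithmetic in the non-cluster branch, the sketch below rounds 60% of system memory down to a whole gigabyte. psutil.virtual_memory().total stands in for ray.utils.get_system_memory(), which is an assumption (that helper is private to the Ray versions this snippet targets).

import psutil

# Stand-in for ray.utils.get_system_memory() (assumption); psutil is already used above.
system_memory = psutil.virtual_memory().total

# Round 60% of system memory down to the nearest Gigabyte, as in initialize_ray().
object_store_memory = int(0.6 * system_memory // 10**9 * 10**9)

# If the rounded pool comes out to zero, let Ray pick its own default.
if object_store_memory == 0:
    object_store_memory = None

print(object_store_memory)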