def from_arrow(cls, at):
    """
    Build a frame from an Arrow table.

    Parameters
    ----------
    at : Arrow table-like
        Source table; must expose ``column_names``, ``num_rows`` and
        ``columns`` (each column having a ``type`` attribute).

    Returns
    -------
    cls
        New frame created from the partitions produced by
        ``cls._frame_mgr_cls.from_arrow``.
    """
    # The partition manager returns the partitions, their dimensions and the
    # list of columns whose types it cannot handle.
    imported = cls._frame_mgr_cls.from_arrow(at, return_dims=True)
    partitions, row_lengths, column_widths, unsupported_cols = imported

    columns = pd.Index(data=at.column_names, dtype="O")
    index = pd.RangeIndex(at.num_rows)
    arrow_types = (column.type for column in at.columns)
    dtypes = pd.Series(
        [cls._arrow_type_to_dtype(arrow_type) for arrow_type in arrow_types],
        index=columns,
    )

    has_unsupported = len(unsupported_cols) > 0
    if has_unsupported:
        ErrorMessage.single_warning(
            f"Frame contain columns with unsupported data-types: {unsupported_cols}. "
            "All operations with this frame will be default to pandas!"
        )

    return cls(
        partitions=partitions,
        index=index,
        columns=columns,
        row_lengths=row_lengths,
        column_widths=column_widths,
        dtypes=dtypes,
        has_unsupported_data=has_unsupported,
    )
def check_parameters_support(
    cls,
    filepath_or_buffer,
    read_kwargs: dict,
    skiprows_md: Union[Sequence, callable, int],
    header_size: int,
) -> bool:
    """
    Check support of only general parameters of `read_*` function.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        `filepath_or_buffer` parameter of `read_*` function.
    read_kwargs : dict
        Parameters of `read_*` function. Must contain the ``"chunksize"`` key;
        ``"skiprows"`` is optional.
    skiprows_md : int, array or callable
        `skiprows` parameter modified for easier handling by Modin.
    header_size : int
        Number of rows that are used by header.

    Returns
    -------
    bool
        Whether passed parameters are supported or not.

    Raises
    ------
    KeyError
        If `read_kwargs` has no ``"chunksize"`` entry.
    """
    skiprows = read_kwargs.get("skiprows")

    # String paths must point to an existing file; anything else must be
    # accepted by ``cls.pathlib_or_pypath``.
    if isinstance(filepath_or_buffer, str):
        if not cls.file_exists(filepath_or_buffer):
            return False
    elif not cls.pathlib_or_pypath(filepath_or_buffer):
        return False

    if read_kwargs["chunksize"] is not None:
        return False

    skiprows_supported = True
    # Only the first element is compared with the header size. An empty
    # list-like skips nothing, so it cannot intersect with the header;
    # guarding on the length also avoids an IndexError on ``skiprows_md[0]``.
    if (
        is_list_like(skiprows_md)
        and len(skiprows_md) > 0
        and skiprows_md[0] < header_size
    ):
        skiprows_supported = False
    elif callable(skiprows):
        # check if `skiprows` callable gives True for any of header indices
        is_intersection = any(
            cls._get_skip_mask(pandas.RangeIndex(header_size), skiprows)
        )
        if is_intersection:
            skiprows_supported = False

    if not skiprows_supported:
        ErrorMessage.single_warning(
            "Values of `header` and `skiprows` parameters have intersections. "
            + "This case is unsupported by Modin, so pandas implementation will be used"
        )
        return False

    return True
def from_pandas(cls, df):
    """
    Build a frame from a pandas DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. Its index names are changed temporarily while the index
        is moved into columns and are restored before this method returns or
        raises.

    Returns
    -------
    cls
        New frame created from the partitions produced by
        ``cls._frame_mgr_cls.from_pandas``.
    """
    new_index = df.index
    new_columns = df.columns

    # If there is non-trivial index, we put it into columns.
    # That's what we usually have for arrow tables and execution
    # result. Unnamed index is renamed to __index__. Also all
    # columns get 'F_' prefix to handle names unsupported in
    # OmniSci.
    if cls._is_trivial_index(df.index):
        index_cols = None
    else:
        orig_index_names = df.index.names
        orig_df = df
        index_cols = [
            f"__index__{i}_{'__None__' if n is None else n}"
            for i, n in enumerate(df.index.names)
        ]
        df.index.names = index_cols
        try:
            df = df.reset_index()
        finally:
            # Restore the caller's index names even when `reset_index` raises
            # (e.g. a column already uses one of the generated names), so the
            # input frame is never left modified.
            orig_df.index.names = orig_index_names

    # Dtypes are captured before the 'F_' prefix is added to the column names.
    new_dtypes = df.dtypes
    df = df.add_prefix("F_")

    (
        new_parts,
        new_lengths,
        new_widths,
        unsupported_cols,
    ) = cls._frame_mgr_cls.from_pandas(df, True)

    has_unsupported_data = len(unsupported_cols) > 0
    if has_unsupported_data:
        ErrorMessage.single_warning(
            f"Frame contain columns with unsupported data-types: {unsupported_cols}. "
            "All operations with this frame will be default to pandas!"
        )

    return cls(
        new_parts,
        new_index,
        new_columns,
        new_lengths,
        new_widths,
        dtypes=new_dtypes,
        index_cols=index_cols,
        has_unsupported_data=has_unsupported_data,
    )
def cast_to_compatible_types(table):
    """
    Cast PyArrow table to be fully compatible with OmniSci.

    Dictionary columns are converted to strings, date columns to
    ``timestamp[s]`` and unsigned integers to signed integers.

    Parameters
    ----------
    table : pyarrow.Table
        Source table.

    Returns
    -------
    pyarrow.Table
        Table with fully compatible types with OmniSci.

    Raises
    ------
    OverflowError
        If the cast fails and at least one unsigned integer column was
        being converted.
    RuntimeError
        If the cast fails and no unsigned integer column was involved.
    """
    unsigned_to_signed = {
        pa.uint8(): pa.int16(),
        pa.uint16(): pa.int32(),
        pa.uint32(): pa.int64(),
        pa.uint64(): pa.int64(),  # May cause overflow
    }

    source_schema = table.schema
    target_schema = source_schema
    rebuilt_columns = {}
    cast_required = False
    has_unsigned = False

    for position, field in enumerate(source_schema):
        field_type = field.type

        if pa.types.is_dictionary(field_type):
            # Currently OmniSci doesn't support Arrow table import with
            # dictionary columns. Here we cast dictionaries until support
            # is in place.
            # https://github.com/modin-project/modin/issues/1738
            if pa.types.is_null(field_type.value_type):
                # Conversion for dictionary of null type to string is not
                # supported in Arrow, so a fully masked string column is
                # built to replace it instead of casting.
                null_mask = np.full(table.num_rows, True, dtype=bool)
                placeholder = np.empty(table.num_rows, dtype=str)
                rebuilt_columns[position] = pa.array(
                    placeholder, pa.string(), null_mask
                )
            else:
                cast_required = True
            target_type = pa.string()
        elif pa.types.is_date(field_type):
            # OmniSci doesn't support importing Arrow's date type:
            # https://github.com/omnisci/omniscidb/issues/678
            # Arrow's date is the number of days since the UNIX-epoch, so we
            # can convert it to a timestamp[s] (number of seconds since the
            # UNIX-epoch) without losing precision.
            target_type = pa.timestamp("s")
            cast_required = True
        elif pa.types.is_unsigned_integer(field_type):
            # OmniSci doesn't support unsigned types
            target_type = unsigned_to_signed[field_type]
            cast_required = True
            has_unsigned = True
        else:
            # Field is already compatible, keep it as is.
            continue

        replacement = pa.field(
            field.name, target_type, field.nullable, field.metadata
        )
        target_schema = target_schema.set(position, replacement)

    # Such cast may affect the data, so we have to raise a warning about it
    if has_unsigned:
        ErrorMessage.single_warning(
            "OmniSci does not support unsigned integer types, such types will be rounded up to the signed equivalent."
        )

    # Swap in the columns that had to be rebuilt rather than cast.
    for position, column in rebuilt_columns.items():
        table = table.set_column(position, target_schema[position], column)

    if not cast_required:
        return table

    try:
        table = table.cast(target_schema)
    except pa.lib.ArrowInvalid as err:
        error_type = OverflowError if has_unsigned else RuntimeError
        raise error_type(
            "An error occurred when trying to convert unsupported by OmniSci 'dtypes' "
            + f"to the supported ones, the schema to cast was: \n{target_schema}."
        ) from err

    return table
def initialize_ray(
    override_is_cluster=False,
    override_redis_address: str = None,
    override_redis_password: str = None,
):
    """
    Initialize Ray based on parameters, configuration values and internal defaults.

    Ray is started only when called from the thread named "MainThread" or
    when `override_is_cluster` is truthy; otherwise the function does nothing
    besides importing ``ray``.

    Parameters
    ----------
    override_is_cluster : bool, optional
        Whether to override the detection of Modin being run in a cluster
        and always assume this runs on cluster head node.
        This also overrides Ray worker detection and always runs the function,
        not only from main thread.
        If not specified, the ``IsRayCluster`` config value is used
        (documented upstream as the $MODIN_RAY_CLUSTER env variable).
    override_redis_address : str, optional
        What Redis address to connect to when running in Ray cluster.
        If not specified, the ``RayRedisAddress`` config value is used
        (documented upstream as $MODIN_REDIS_ADDRESS).
    override_redis_password : str, optional
        What password to use when connecting to Redis.
        If not specified, a new random one is generated.
    """
    import ray

    if threading.current_thread().name == "MainThread" or override_is_cluster:
        import secrets

        # Explicit overrides win; otherwise fall back to the config values and
        # to a freshly generated random password.
        cluster = override_is_cluster or IsRayCluster.get()
        redis_address = override_redis_address or RayRedisAddress.get()
        redis_password = override_redis_password or secrets.token_hex(32)

        if cluster:
            # We only start ray in a cluster setting for the head node.
            ray.init(
                address=redis_address or "auto",
                include_dashboard=False,
                ignore_reinit_error=True,
                _redis_password=redis_password,
                logging_level=100,
            )
        else:
            object_store_memory = Memory.get()
            plasma_directory = RayPlasmaDir.get()
            if IsOutOfCore.get():
                if plasma_directory is None:
                    from tempfile import gettempdir

                    plasma_directory = gettempdir()
                # We may have already set the memory from the environment variable, we don't
                # want to overwrite that value if we have.
                if object_store_memory is None:
                    # Round down to the nearest Gigabyte.
                    mem_bytes = ray.utils.get_system_memory() // 10**9 * 10**9
                    # Default to 8x memory for out of core
                    object_store_memory = 8 * mem_bytes
            # In case anything failed above, we can still improve the memory for Modin.
            if object_store_memory is None:
                # Use 60% of the system memory, rounded down to the nearest Gigabyte.
                object_store_memory = int(
                    0.6 * ray.utils.get_system_memory() // 10**9 * 10**9
                )
                # A zero result means 60% of the system memory is below 1 GB;
                # pass None instead (presumably so that Ray picks its own default).
                if object_store_memory == 0:
                    object_store_memory = None
            else:
                # A value was configured explicitly (or computed for out-of-core
                # mode); make sure it is passed to Ray as an integer.
                object_store_memory = int(object_store_memory)
            ray.init(
                num_cpus=CpuCount.get(),
                include_dashboard=False,
                ignore_reinit_error=True,
                _plasma_directory=plasma_directory,
                object_store_memory=object_store_memory,
                address=redis_address,
                _redis_password=redis_password,
                logging_level=100,
                _memory=object_store_memory,
                _lru_evict=True,
            )

            global_node = ray.worker._global_node
            # Check only for head node
            if global_node.head:
                import psutil
                from modin.error_message import ErrorMessage

                # Report the free disk space of the directory that contains
                # the Ray session directory, in whole Gigabytes.
                ray_session_dir = os.path.dirname(global_node._session_dir)
                ray_free_space_GB = psutil.disk_usage(ray_session_dir).free // 10**9
                ErrorMessage.single_warning(
                    f"Modin Ray engine was started with {ray_free_space_GB} GB free space avaliable, "
                    "if it is not enough for your application, please set environment variable "
                    "MODIN_ON_RAY_PLASMA_DIR=/directory/without/space/limiting"
                )

        # Apply the import-path fix in this process first, then on every worker.
        _move_stdlib_ahead_of_site_packages()
        ray.worker.global_worker.run_function_on_all_workers(
            _move_stdlib_ahead_of_site_packages
        )

        ray.worker.global_worker.run_function_on_all_workers(_import_pandas)