def test_factory_switch():
    """
    Check that changing the engine or the backend switches the dispatched factory.

    Notes
    -----
    Every configuration change is reverted in a ``finally`` clause, so a failing
    assertion does not leave the "Test" engine or backend active for the tests
    that run afterwards.
    """
    Engine.put("Test")
    try:
        assert FactoryDispatcher.get_factory() == PandasOnTestFactory
        assert FactoryDispatcher.get_factory().io_cls == "Foo"
    finally:
        Engine.put("Python")  # revert engine to default

    Backend.put("Test")
    try:
        assert FactoryDispatcher.get_factory() == TestOnPythonFactory
        assert FactoryDispatcher.get_factory().io_cls == "Bar"
    finally:
        Backend.put("Pandas")  # revert backend to default
def from_partitions(
    partitions, axis, index=None, columns=None, row_lengths=None, column_widths=None
):
    """
    Build a DataFrame out of already existing remote partitions.

    Parameters
    ----------
    partitions : list
        Either a list of Ray.ObjectRef/Dask.Future objects pointing to partitions
        (which one depends on the engine in use), or a list of tuples pairing
        a Ray.ObjectRef/Dask.Future to a node ip address with a Ray.ObjectRef/Dask.Future
        to a partition (i.e. ``[(Ray.ObjectRef/Dask.Future, Ray.ObjectRef/Dask.Future), ...]``).
    axis : {None, 0 or 1}
        Tells how the passed ``partitions`` have to be interpreted:

        * ``axis=0`` - ``partitions`` are row partitions
        * ``axis=1`` - ``partitions`` are column partitions
        * ``axis=None`` - ``partitions`` is a 2D list of partitions
    index : sequence, optional
        Index of the resulting DataFrame. Computed from the partitions when omitted.
    columns : sequence, optional
        Columns of the resulting DataFrame. Computed from the partitions when omitted.
    row_lengths : list, optional
        The "height" of each block partition along the rows.
        Computed when omitted.
    column_widths : list, optional
        The "width" of each block partition along the columns.
        Computed when omitted.

    Returns
    -------
    modin.pandas.DataFrame
        DataFrame instance built on top of the passed partitions.

    Raises
    ------
    ValueError
        If ``axis`` is not one of ``None``, ``0`` or ``1``.

    Notes
    -----
    Passing `index`, `columns`, `row_lengths` and `column_widths` avoids the extra
    metadata computations that are otherwise triggered while creating the DataFrame.
    """
    from modin.data_management.factories.dispatcher import FactoryDispatcher

    factory = FactoryDispatcher.get_factory()
    frame_cls = factory.io_cls.frame_cls
    mgr_cls = factory.io_cls.frame_cls._partition_mgr_cls
    part_cls = factory.io_cls.frame_cls._partition_mgr_cls._partition_class

    def _wrap(item, has_ip):
        # Turn one input element into a partition object; an element that
        # comes with an ip is an ``(ip, partition)`` pair.
        if has_ip:
            ip, partition = item
            return part_cls(partition, ip=ip)
        return part_cls(item)

    # The internal Modin Frame class stores its partitions as a 2D NumPy array,
    # so the input is first laid out as a 2D grid whatever `axis` says.
    if axis is None:
        # Already a 2D list: keep the layout as it is.
        has_ip = isinstance(partitions[0][0], tuple)
        grid = [[_wrap(item, has_ip) for item in row] for row in partitions]
    elif axis == 0:
        # Row partitions: every partition becomes a separate row of the grid.
        has_ip = isinstance(partitions[0], tuple)
        grid = [[_wrap(item, has_ip)] for item in partitions]
    elif axis == 1:
        # Column partitions: all partitions are placed into a single row of the grid.
        has_ip = isinstance(partitions[0], tuple)
        grid = [[_wrap(item, has_ip) for item in partitions]]
    else:
        raise ValueError(
            f"Got unacceptable value of axis {axis}. Possible values are {0}, {1} or {None}."
        )
    parts = np.array(grid)

    # Remember which labels were not supplied before the names are rebound.
    index_missing = index is None
    columns_missing = columns is None
    if index_missing:
        index = mgr_cls.get_indices(0, parts, lambda df: df.axes[0])
    if columns_missing:
        columns = mgr_cls.get_indices(1, parts, lambda df: df.axes[1])

    frame = frame_cls(
        parts,
        index,
        columns,
        row_lengths=row_lengths,
        column_widths=column_widths,
    )

    # Labels that were just computed from the partitions need no synchronization;
    # when both axes were computed the call is skipped entirely.
    if not (index_missing and columns_missing):
        if index_missing:
            sync_axis = 1
        elif columns_missing:
            sync_axis = 0
        else:
            sync_axis = None
        frame.synchronize_labels(axis=sync_axis)

    return DataFrame(query_compiler=PandasQueryCompiler(frame))
def test_set_backends():
    """Check that ``set_backends("Bar", "Foo")`` makes the dispatcher return ``FooOnBarFactory``."""
    set_backends("Bar", "Foo")
    selected_factory = FactoryDispatcher.get_factory()
    assert selected_factory == FooOnBarFactory
def test_default_factory():
    """Check that the dispatched factory derives from ``BaseFactory`` and has a truthy ``io_cls``."""
    default_factory = FactoryDispatcher.get_factory()
    assert issubclass(default_factory, factories.BaseFactory)
    io_class = FactoryDispatcher.get_factory().io_cls
    assert io_class
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import numpy as np import pandas import pytest import modin.pandas as pd from modin.distributed.dataframe.pandas import unwrap_partitions, from_partitions from modin.config import Engine, NPartitions from modin.pandas.test.utils import df_equals from modin.pandas.indexing import compute_sliced_len from modin.data_management.factories.dispatcher import FactoryDispatcher PartitionClass = (FactoryDispatcher.get_factory().io_cls.frame_cls. _partition_mgr_cls._partition_class) if Engine.get() == "Ray": import ray put_func = ray.put get_func = ray.get FutureType = ray.ObjectRef elif Engine.get() == "Dask": from distributed.client import default_client from distributed import Future put_func = lambda x: default_client().scatter(x) # noqa: E731 get_func = lambda x: x.result() # noqa: E731 FutureType = Future elif Engine.get() == "Python":
def from_partitions(partitions, axis):
    """
    Build a DataFrame out of already existing remote partitions.

    Parameters
    ----------
    partitions : list
        Either a list of Ray.ObjectRef/Dask.Future objects pointing to partitions
        (which one depends on the engine in use), or a list of tuples pairing
        a Ray.ObjectRef/Dask.Future to a node ip address with a Ray.ObjectRef/Dask.Future
        to a partition (i.e. ``[(Ray.ObjectRef/Dask.Future, Ray.ObjectRef/Dask.Future), ...]``).
    axis : {None, 0 or 1}
        Tells how the passed ``partitions`` have to be interpreted:

        * ``axis=0`` - ``partitions`` are row partitions
        * ``axis=1`` - ``partitions`` are column partitions
        * ``axis=None`` - ``partitions`` is a 2D list of partitions

    Returns
    -------
    modin.pandas.DataFrame
        DataFrame instance built on top of the passed partitions.

    Raises
    ------
    ValueError
        If ``axis`` is not one of ``None``, ``0`` or ``1``.
    """
    from modin.data_management.factories.dispatcher import FactoryDispatcher

    factory = FactoryDispatcher.get_factory()
    partition_class = factory.io_cls.frame_cls._partition_mgr_cls._partition_class
    partition_frame_class = factory.io_cls.frame_cls
    partition_mgr_class = factory.io_cls.frame_cls._partition_mgr_cls

    # The internal Modin Frame class keeps partitions in a 2D NumPy array, so the
    # input is first normalized to a 2D list and converted in a single place below.
    if axis is None:
        # Already a 2D list of partitions.
        with_ips = isinstance(partitions[0][0], tuple)
        rows = partitions
    elif axis == 0:
        # Row partitions: each one forms its own row.
        with_ips = isinstance(partitions[0], tuple)
        rows = [[item] for item in partitions]
    elif axis == 1:
        # Column partitions: all of them form one single row.
        with_ips = isinstance(partitions[0], tuple)
        rows = [partitions]
    else:
        raise ValueError(
            f"Got unacceptable value of axis {axis}. Possible values are {0}, {1} or {None}."
        )

    if with_ips:
        # Elements are ``(ip, partition)`` pairs.
        wrapped = [
            [partition_class(partition, ip=ip) for ip, partition in row]
            for row in rows
        ]
    else:
        wrapped = [[partition_class(partition) for partition in row] for row in rows]
    parts = np.array(wrapped)

    # Both labels are always derived from the partitions themselves.
    index = partition_mgr_class.get_indices(0, parts, lambda df: df.axes[0])
    columns = partition_mgr_class.get_indices(1, parts, lambda df: df.axes[1])

    frame = partition_frame_class(parts, index, columns)
    return DataFrame(query_compiler=PandasQueryCompiler(frame))