def call_deploy(cls, fname, col_partitions, **kwargs):
    """
    Launch one remote parsing task per group of columns.

    Parameters
    ----------
    fname : str, path object or file-like object
        Name of the file to read.
    col_partitions : list
        Groups of column names; each group is handed to its own task.
    **kwargs : dict
        Extra parameters forwarded to the parse function.

    Returns
    -------
    np.ndarray
        Transposed array holding, for every column group, the references
        returned by ``cls.deploy``.
    """
    task_refs = []
    for cols in col_partitions:
        # Every task is asked for ``NPartitions.get() + 2`` return values.
        num_returns = NPartitions.get() + 2
        parse_kwargs = dict(
            fname=fname,
            columns=cols,
            num_splits=NPartitions.get(),
            **kwargs,
        )
        task_refs.append(cls.deploy(cls.parse, num_returns, parse_kwargs))
    return np.array(task_refs).T
def call_deploy(cls, fname, col_partitions, **kwargs):
    """
    Deploy ``cls.parse`` once for each entry of `col_partitions`.

    Parameters
    ----------
    fname : str, path object or file-like object
        File passed to every task as ``fname``.
    col_partitions : list
        Column groups; each one is passed to a task as ``columns``.
    **kwargs : dict
        Additional keyword arguments merged into the task arguments.

    Returns
    -------
    np.ndarray
        Transpose of the array built from the per-group ``cls.deploy`` results.
    """

    def _deploy_for(cols):
        # Build the arguments for a single column group and launch its task.
        return cls.deploy(
            cls.parse,
            NPartitions.get() + 2,
            dict(
                fname=fname,
                columns=cols,
                num_splits=NPartitions.get(),
                **kwargs,
            ),
        )

    return np.array(list(map(_deploy_for, col_partitions))).T
def binary_operation(cls, axis, left, func, right):
    """
    Apply a function that needs the partitions of two frames at once.

    Parameters
    ----------
    axis : {0, 1}
        When truthy the operands are grouped with ``cls.row_partitions``,
        otherwise with ``cls.column_partitions``.
    left : np.ndarray
        The partitions of the left frame.
    func : callable
        The function to apply.
    right : np.ndarray
        The partitions of the right frame.

    Returns
    -------
    np.ndarray
        A NumPy array with the new partitions (transposed when `axis` is falsy).
    """
    to_axis_partitions = cls.row_partitions if axis else cls.column_partitions
    left_partitions = to_axis_partitions(left)
    right_partitions = to_axis_partitions(right)
    func = cls.preprocess_func(func)

    applied = []
    for position, left_part in enumerate(left_partitions):
        # Pair each left axis partition with the right one at the same position.
        applied.append(
            left_part.apply(
                func,
                num_splits=NPartitions.get(),
                other_axis_partition=right_partitions[position],
            )
        )
    result = np.array(applied)
    if axis:
        return result
    return result.T
def build_index(cls, partition_ids):
    """
    Compute index and its split sizes of resulting Modin DataFrame.

    Parameters
    ----------
    partition_ids : list
        Array with references to the partitions data. The object
        materialized from ``partition_ids[-2][0]`` is used either as the
        index length (when it is an ``int``) or as the index itself.

    Returns
    -------
    index : pandas.Index
        Index of resulting Modin DataFrame.
    row_lengths : list
        List with lengths of index chunks; always non-negative.
    """
    num_partitions = NPartitions.get()
    index_len = (
        0 if len(partition_ids) == 0 else cls.materialize(partition_ids[-2][0])
    )
    if isinstance(index_len, int):
        index = pandas.RangeIndex(index_len)
    else:
        index = index_len
        index_len = len(index)

    index_chunksize = compute_chunksize(index_len, num_partitions)
    if index_chunksize > index_len:
        # Everything fits into the first chunk; the rest stay empty.
        row_lengths = [index_len] + [0] * (num_partitions - 1)
    else:
        # Hand out full chunks while rows remain. Clamping matters when the
        # chunk size was rounded up: e.g. 5 rows over 4 partitions with a
        # chunk of 2 used to give [2, 2, 2, -1]; it now gives [2, 2, 1, 0].
        row_lengths = [
            min(index_chunksize, max(0, index_len - index_chunksize * i))
            for i in range(num_partitions - 1)
        ]
        # The last chunk takes whatever is left, so the lengths always sum
        # to ``index_len`` and never go negative.
        row_lengths.append(index_len - sum(row_lengths))
    return index, row_lengths
def binary_operation(cls, axis, left, func, right):
    """
    Apply a two-operand function over matching axis partitions.

    Parameters
    ----------
    axis : int
        Truthy selects ``cls.row_partitions`` for both operands, falsy
        selects ``cls.column_partitions``.
    left : NumPy array
        The partitions of the left operand.
    func : callable
        The function to apply; it is first passed through
        ``cls.preprocess_func``.
    right : NumPy array
        The partitions of the right operand.

    Returns
    -------
    NumPy array
        Array built from the per-partition ``apply`` results, transposed
        when `axis` is falsy.
    """
    if axis:
        left_parts = cls.row_partitions(left)
        right_parts = cls.row_partitions(right)
    else:
        left_parts = cls.column_partitions(left)
        right_parts = cls.column_partitions(right)
    prepared = cls.preprocess_func(func)

    def _apply_pair(position):
        # Combine the left and right axis partitions found at `position`.
        return left_parts[position].apply(
            prepared,
            num_splits=NPartitions.get(),
            other_axis_partition=right_parts[position],
        )

    new_partitions = np.array(
        [_apply_pair(position) for position in range(len(left_parts))]
    )
    return new_partitions if axis else new_partitions.T
def build_columns(cls, columns):
    """
    Split columns into chunks that should be read by workers.

    Parameters
    ----------
    columns : list
        List of columns that should be read from file.

    Returns
    -------
    col_partitions : list
        List of lists with columns for reading by workers.
    column_widths : list
        List with lengths of `col_partitions` subarrays
        (number of columns that should be read by workers).
    """
    num_columns = len(columns)
    if num_columns == 0:
        # Nothing to distribute. Without this guard the chunk size below
        # would be 0 and ``range(0, 0, 0)`` raises ``ValueError``.
        return [], []

    num_partitions = NPartitions.get()
    # Ceiling division: the smallest chunk size that covers every column
    # with at most ``num_partitions`` chunks.
    column_splits, remainder = divmod(num_columns, num_partitions)
    if remainder:
        column_splits += 1

    col_partitions = [
        columns[i : i + column_splits]
        for i in range(0, num_columns, column_splits)
    ]
    column_widths = [len(c) for c in col_partitions]
    return col_partitions, column_widths
def from_pandas(cls, df, return_dims=False):
    """
    Return the partitions built from a pandas DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to slice into row/column blocks.
    return_dims : bool, default: False
        When True, also return the row lengths and column widths.

    Returns
    -------
    np.ndarray or (np.ndarray, list, list)
        Array of the objects produced by ``cls._partition_class.put``,
        optionally followed by row lengths and column widths.
    """

    def advance(bar, partition):
        # Tick the progress bar (when enabled) and pass the partition through.
        if ProgressBar.get():
            bar.update(1)
        return partition

    def axis_lengths(total, chunk):
        # Full chunks everywhere except, possibly, a shorter trailing one.
        return [
            chunk if start + chunk < total else total % chunk or chunk
            for start in range(0, total, chunk)
        ]

    num_splits = NPartitions.get()
    put = cls._partition_class.put
    row_chunksize, col_chunksize = compute_chunksize(df, num_splits)

    if os.environ.get("DEBUG_PROGRESS_BAR", "False") == "True":
        bar_format = "{l_bar}{bar}{r_bar}"
    else:
        bar_format = "{desc}: {percentage:3.0f}%{bar} Elapsed time: {elapsed}, estimated remaining time: {remaining}"

    pbar = None
    if ProgressBar.get():
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            try:
                from tqdm.autonotebook import tqdm as tqdm_notebook
            except ImportError:
                raise ImportError(
                    "Please pip install tqdm to use the progress bar")
        expected_rows = max(1, round(len(df) / row_chunksize))
        expected_cols = max(1, round(len(df.columns) / col_chunksize))
        pbar = tqdm_notebook(
            total=round(expected_rows * expected_cols),
            desc="Distributing Dataframe",
            bar_format=bar_format,
        )

    parts = []
    for i in range(0, len(df), row_chunksize):
        row_of_parts = []
        for j in range(0, len(df.columns), col_chunksize):
            block = df.iloc[i:i + row_chunksize, j:j + col_chunksize].copy()
            row_of_parts.append(advance(pbar, put(block)))
        parts.append(row_of_parts)

    if ProgressBar.get():
        pbar.close()

    if not return_dims:
        return np.array(parts)
    row_lengths = axis_lengths(len(df), row_chunksize)
    col_widths = axis_lengths(len(df.columns), col_chunksize)
    return np.array(parts), row_lengths, col_widths
def _read(cls, filepath_or_buffer, **kwargs): """ In experimental mode, we can use `*` in the filename. Note: the number of partitions is equal to the number of input files. """ if not (isinstance(filepath_or_buffer, str) and "*" in filepath_or_buffer): warnings.warn("Defaulting to Modin core implementation") return cls.single_worker_read( filepath_or_buffer, single_worker_read=True, **kwargs, ) filepath_or_buffer = sorted(glob.glob(filepath_or_buffer)) if len(filepath_or_buffer) == 0: raise ValueError( f"There are no files matching the pattern: {filepath_or_buffer}" ) partition_ids = [] lengths_ids = [] widths_ids = [] if len(filepath_or_buffer) != NPartitions.get(): # do we need to do a repartitioning? warnings.warn("can be inefficient partitioning") for file_name in filepath_or_buffer: partition_id = cls.deploy( cls.parse, 3, dict( fname=file_name, **kwargs, ), ) partition_ids.append(partition_id[:-2]) lengths_ids.append(partition_id[-2]) widths_ids.append(partition_id[-1]) lengths = cls.materialize(lengths_ids) widths = cls.materialize(widths_ids) # while num_splits is 1, need only one value partition_ids = cls.build_partition(partition_ids, lengths, [widths[0]]) new_index = cls.frame_cls._partition_mgr_cls.get_indices( 0, partition_ids, lambda df: df.axes[0] ) new_columns = cls.frame_cls._partition_mgr_cls.get_indices( 1, partition_ids, lambda df: df.axes[1] ) return cls.query_compiler_cls( cls.frame_cls(partition_ids, new_index, new_columns) )
def test_explode_all_partitions(column, ignore_index):
    """Check ``explode`` on input that has enough rows to fill all partitions."""
    # explode should expand every row in the input data into two rows. It's
    # especially important that the input data has list-like elements that
    # must be expanded at the boundaries of the partitions, e.g. at row 31.
    num_rows = NPartitions.get() * MinPartitionSize.get()
    data = {
        "A": [[3, 4]] * num_rows,
        "C": [["a", "b"]] * num_rows,
    }

    def explode(df):
        return df.explode(column, ignore_index=ignore_index)

    modin_and_pandas_dfs = create_test_dfs(data)
    eval_general(*modin_and_pandas_dfs, explode)
def build_columns(cls, columns):
    """
    Split `columns` into consecutive groups, at most one per partition.

    Parameters
    ----------
    columns : list
        Column labels to distribute.

    Returns
    -------
    col_partitions : list
        Consecutive slices of `columns`.
    column_widths : list
        Length of every slice in `col_partitions`.
    """
    total = len(columns)
    if total == 0:
        # An empty input would make the chunk size 0 and ``range`` rejects a
        # zero step, so report "no partitions" explicitly instead.
        return [], []

    num_partitions = NPartitions.get()
    # Round the per-partition chunk size up so no column is left over.
    column_splits = total // num_partitions
    if total % num_partitions != 0:
        column_splits += 1

    col_partitions = [
        columns[start:start + column_splits]
        for start in range(0, total, column_splits)
    ]
    column_widths = [len(group) for group in col_partitions]
    return col_partitions, column_widths
def _define_metadata(
    cls,
    df: pandas.DataFrame,
    column_names: ColumnNamesTypes,
) -> Tuple[list, int]:
    """
    Work out how the columns are spread across partitions.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame to split.
    column_names : ColumnNamesTypes
        Column names of df.

    Returns
    -------
    column_widths : list
        Number of columns assigned to each partition.
    num_splits : int
        The maximum number of splits to separate the DataFrame into.
    """
    n_columns = len(column_names)
    # Never ask for more column splits than there are columns (but at least 1).
    num_splits = min(n_columns or 1, NPartitions.get())
    column_chunksize = compute_chunksize(df, num_splits, axis=1)

    if column_chunksize > n_columns:
        # A single split avoids serializing a bunch of empty objects.
        return [n_columns], 1

    # Fill partitions with `column_chunksize` columns each; e.g. with
    # num_splits == 4, 80 columns and a chunk size of 32 the widths are
    # [32, 32, 16, 0].
    column_widths = []
    for i in range(num_splits):
        if n_columns > (column_chunksize * (i + 1)):
            width = column_chunksize
        elif n_columns < (column_chunksize * i):
            width = 0
        else:
            width = n_columns - (column_chunksize * i)
        column_widths.append(width)
    return column_widths, num_splits
def build_index(cls, partition_ids):
    """
    Compute the index of the resulting frame and how it splits into chunks.

    Parameters
    ----------
    partition_ids : list
        References to the partitions data. The object materialized from
        ``partition_ids[-2][0]`` is used either as the index length (when it
        is an ``int``) or as the index itself. An empty list yields an
        empty index.

    Returns
    -------
    index : pandas.Index
        Index of the resulting frame.
    row_lengths : list
        Non-negative lengths of the index chunks, one per partition.
    """
    num_partitions = NPartitions.get()
    # Guard the empty case: indexing ``partition_ids[-2]`` would otherwise
    # raise ``IndexError`` when nothing was read.
    index_len = (
        0 if len(partition_ids) == 0 else cls.materialize(partition_ids[-2][0])
    )
    if isinstance(index_len, int):
        index = pandas.RangeIndex(index_len)
    else:
        index = index_len
        index_len = len(index)

    index_chunksize = compute_chunksize(
        pandas.DataFrame(index=index), num_partitions, axis=0
    )
    if index_chunksize > index_len:
        # Everything fits into the first chunk; the rest stay empty.
        row_lengths = [index_len] + [0] * (num_partitions - 1)
    else:
        # Clamp every chunk to the rows that are still left. With a
        # rounded-up chunk size (5 rows, 4 partitions, chunk of 2) the
        # unclamped formula produced [2, 2, 2, -1].
        row_lengths = [
            min(index_chunksize, max(0, index_len - index_chunksize * i))
            for i in range(num_partitions - 1)
        ]
        # The final chunk absorbs the remainder so the total is ``index_len``.
        row_lengths.append(index_len - sum(row_lengths))
    return index, row_lengths
def from_pandas(cls, df, return_dims=False):
    """
    Return the partitions built from a pandas DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to slice into row/column blocks.
    return_dims : bool, default: False
        When True, also return the row lengths and column widths.

    Returns
    -------
    np.ndarray or (np.ndarray, list, list)
        Array of the objects produced by ``cls._partition_class.put``,
        optionally followed by row lengths and column widths.
    """
    num_splits = NPartitions.get()
    put_func = cls._partition_class.put
    row_chunksize, col_chunksize = compute_chunksize(df, num_splits)

    parts = []
    for row_start in range(0, len(df), row_chunksize):
        row_stop = row_start + row_chunksize
        parts.append(
            [
                put_func(
                    df.iloc[row_start:row_stop,
                            col_start:col_start + col_chunksize].copy()
                )
                for col_start in range(0, len(df.columns), col_chunksize)
            ]
        )

    if not return_dims:
        return np.array(parts)

    def chunk_sizes(total, chunk):
        # Full chunks everywhere except, possibly, a shorter trailing one.
        sizes = []
        for start in range(0, total, chunk):
            if start + chunk < total:
                sizes.append(chunk)
            else:
                sizes.append(total % chunk or chunk)
        return sizes

    row_lengths = chunk_sizes(len(df), row_chunksize)
    col_widths = chunk_sizes(len(df.columns), col_chunksize)
    return np.array(parts), row_lengths, col_widths
def read(cls, filepath_or_buffer, **kwargs):
    """
    Read a fixed-width file into a query compiler, in parallel when possible.

    The call is delegated to ``cls.single_worker_read`` when the input is not
    an existing path, when the compression is not one of the handled kinds,
    when ``chunksize`` is given, when ``infer_nrows`` exceeds 100, or when
    ``skiprows`` is neither ``None`` nor an ``int``. Otherwise the file is
    split with ``cls.partitioned_file`` and every split is parsed by a task
    launched through ``cls.deploy``.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        Input to read.
    **kwargs : dict
        ``pandas.read_fwf`` keyword arguments. ``nrows``, ``skiprows`` and
        ``parse_dates`` are popped from this dict along the way.

    Returns
    -------
    object
        Result of ``cls.single_worker_read`` on a fallback path, otherwise
        the object built by ``cls.query_compiler_cls`` (or a single column of
        it when ``squeeze`` is set and only one column was read).
    """
    filepath_or_buffer = cls.get_path_or_buffer(filepath_or_buffer)
    if isinstance(filepath_or_buffer, str):
        if not cls.file_exists(filepath_or_buffer):
            return cls.single_worker_read(filepath_or_buffer, **kwargs)
        filepath_or_buffer = cls.get_path(filepath_or_buffer)
    elif not cls.pathlib_or_pypath(filepath_or_buffer):
        return cls.single_worker_read(filepath_or_buffer, **kwargs)
    compression_type = cls.infer_compression(
        filepath_or_buffer, kwargs.get("compression", "infer"))
    if compression_type is not None:
        if (compression_type == "gzip" or compression_type == "bz2"
                or compression_type == "xz"):
            kwargs["compression"] = compression_type
        elif (compression_type == "zip" and sys.version_info[0] == 3
              and sys.version_info[1] >= 7):
            # need python3.7 to .seek and .tell ZipExtFile
            kwargs["compression"] = compression_type
        else:
            return cls.single_worker_read(filepath_or_buffer, **kwargs)

    chunksize = kwargs.get("chunksize")
    if chunksize is not None:
        return cls.single_worker_read(filepath_or_buffer, **kwargs)

    # If infer_nrows is a significant portion of the number of rows, pandas may be
    # faster.
    infer_nrows = kwargs.get("infer_nrows", 100)
    if infer_nrows > 100:
        return cls.single_worker_read(filepath_or_buffer, **kwargs)

    skiprows = kwargs.get("skiprows")
    if skiprows is not None and not isinstance(skiprows, int):
        return cls.single_worker_read(filepath_or_buffer, **kwargs)

    # `nrows` is removed from kwargs here and handed to `partitioned_file` below.
    nrows = kwargs.pop("nrows", None)
    names = kwargs.get("names", None)
    index_col = kwargs.get("index_col", None)
    if names is None:
        # For the sake of the empty df, we assume no `index_col` to get the correct
        # column names before we build the index. Because we pass `names` in, this
        # step has to happen without removing the `index_col` otherwise it will not
        # be assigned correctly
        names = pandas.read_fwf(
            filepath_or_buffer,
            **dict(kwargs, usecols=None, nrows=0, skipfooter=0, index_col=None),
        ).columns
    # Zero-row read: only the column labels and index name are used from it.
    empty_pd_df = pandas.read_fwf(filepath_or_buffer,
                                  **dict(kwargs, nrows=0, skipfooter=0))
    column_names = empty_pd_df.columns
    skipfooter = kwargs.get("skipfooter", None)
    skiprows = kwargs.pop("skiprows", None)
    usecols = kwargs.get("usecols", None)
    usecols_md = _validate_usecols_arg(usecols)
    if usecols is not None and usecols_md[1] != "integer":
        # Non-integer `usecols` are translated into positions within the
        # full set of columns.
        del kwargs["usecols"]
        all_cols = pandas.read_fwf(
            cls.file_open(filepath_or_buffer, "rb"),
            **dict(kwargs, nrows=0, skipfooter=0),
        ).columns
        usecols = all_cols.get_indexer_for(list(usecols_md[0]))
    parse_dates = kwargs.pop("parse_dates", False)
    # Arguments every partition task receives: header/skiprows/skipfooter are
    # neutralized because they are handled here, once, for the whole file.
    partition_kwargs = dict(
        kwargs,
        header=None,
        names=names,
        skipfooter=0,
        skiprows=None,
        parse_dates=parse_dates,
        usecols=usecols,
    )
    encoding = kwargs.get("encoding", None)
    quotechar = kwargs.get(
        "quotechar", '"').encode(encoding if encoding is not None else "UTF-8")
    is_quoting = kwargs.get("quoting", "") != QUOTE_NONE
    with cls.file_open(filepath_or_buffer, "rb", compression_type) as f:
        # Skip the header since we already have the header information and skip the
        # rows we are told to skip.
        if isinstance(skiprows, int) or skiprows is None:
            if skiprows is None:
                skiprows = 0
            header = kwargs.get("header", "infer")
            if header == "infer" and kwargs.get("names", None) is None:
                skiprows += 1
            elif isinstance(header, int):
                skiprows += header + 1
            elif hasattr(header, "__iter__") and not isinstance(header, str):
                skiprows += max(header) + 1
        if kwargs.get("encoding", None) is not None:
            partition_kwargs["skiprows"] = 1
        # Launch tasks to read partitions
        partition_ids = []
        index_ids = []
        dtypes_ids = []
        # Max number of partitions available
        num_partitions = NPartitions.get()
        # This is the number of splits for the columns
        num_splits = min(len(column_names), num_partitions)
        # Metadata
        column_chunksize = compute_chunksize(empty_pd_df, num_splits, axis=1)
        if column_chunksize > len(column_names):
            column_widths = [len(column_names)]
            # This prevents us from unnecessarily serializing a bunch of empty
            # objects.
            num_splits = 1
        else:
            column_widths = [
                column_chunksize
                if len(column_names) > (column_chunksize * (i + 1))
                else 0
                if len(column_names) < (column_chunksize * i)
                else len(column_names) - (column_chunksize * i)
                for i in range(num_splits)
            ]

        args = {
            "fname": filepath_or_buffer,
            "num_splits": num_splits,
            **partition_kwargs,
        }

        splits = cls.partitioned_file(
            f,
            num_partitions=num_partitions,
            nrows=nrows,
            skiprows=skiprows,
            quotechar=quotechar,
            is_quoting=is_quoting,
        )
        for start, end in splits:
            args.update({"start": start, "end": end})
            # Each task returns `num_splits` data objects followed by two
            # extra objects, collected into `index_ids` and `dtypes_ids`.
            partition_id = cls.deploy(cls.parse, num_splits + 2, args)
            partition_ids.append(partition_id[:-2])
            index_ids.append(partition_id[-2])
            dtypes_ids.append(partition_id[-1])

    # Compute the index based on a sum of the lengths of each partition (by default)
    # or based on the column(s) that were requested.
    if index_col is None:
        row_lengths = cls.materialize(index_ids)
        new_index = pandas.RangeIndex(sum(row_lengths))
        # pandas has a really weird edge case here.
        if kwargs.get("names", None) is not None and skiprows > 1:
            new_index = pandas.RangeIndex(skiprows - 1,
                                          new_index.stop + skiprows - 1)
    else:
        index_objs = cls.materialize(index_ids)
        row_lengths = [len(o) for o in index_objs]
        new_index = index_objs[0].append(index_objs[1:])
        new_index.name = empty_pd_df.index.name

    # Compute dtypes by getting collecting and combining all of the partitions. The
    # reported dtypes from differing rows can be different based on the inference in
    # the limited data seen by each worker. We use pandas to compute the exact dtype
    # over the whole column for each column. The index is set below.
    dtypes = cls.get_dtypes(dtypes_ids)

    partition_ids = cls.build_partition(partition_ids, row_lengths,
                                        column_widths)
    # If parse_dates is present, the column names that we have might not be
    # the same length as the returned column names. If we do need to modify
    # the column names, we remove the old names from the column names and
    # insert the new one at the front of the Index.
    # NOTE(review): `parse_dates` was popped with a default of False, so this
    # condition holds even when the caller did not pass it — confirm intent.
    if parse_dates is not None:
        # We have to recompute the column widths if `parse_dates` is set because
        # we are not guaranteed to have the correct information regarding how many
        # columns are on each partition.
        column_widths = None
        # Check if is list of lists
        if isinstance(parse_dates, list) and isinstance(
                parse_dates[0], list):
            for group in parse_dates:
                new_col_name = "_".join(group)
                column_names = column_names.drop(group).insert(
                    0, new_col_name)
        # Check if it is a dictionary
        elif isinstance(parse_dates, dict):
            for new_col_name, group in parse_dates.items():
                column_names = column_names.drop(group).insert(
                    0, new_col_name)
    # Set the index for the dtypes to the column names
    if isinstance(dtypes, pandas.Series):
        dtypes.index = column_names
    else:
        dtypes = pandas.Series(dtypes, index=column_names)
    new_frame = cls.frame_cls(
        partition_ids,
        new_index,
        column_names,
        row_lengths,
        column_widths,
        dtypes=dtypes,
    )
    new_query_compiler = cls.query_compiler_cls(new_frame)

    if skipfooter:
        # The footer was read along with the data; drop its rows from the end.
        new_query_compiler = new_query_compiler.drop(
            new_query_compiler.index[-skipfooter:])
    if kwargs.get("squeeze", False) and len(
            new_query_compiler.columns) == 1:
        return new_query_compiler[new_query_compiler.columns[0]]
    if index_col is None:
        new_query_compiler._modin_frame._apply_index_objs(axis=0)
    return new_query_compiler
def _read(cls, sql, con, index_col=None, **kwargs): """ Read a SQL query or database table into a query compiler. Parameters ---------- sql : str or SQLAlchemy Selectable (select or text object) SQL query to be executed or a table name. con : SQLAlchemy connectable, str, or sqlite3 connection Connection object to database. index_col : str or list of str, optional Column(s) to set as index(MultiIndex). **kwargs : dict Parameters to pass into `pandas.read_sql` function. Returns ------- BaseQueryCompiler Query compiler with imported data for further processing. """ try: import psycopg2 as pg if isinstance(con, pg.extensions.connection): con = "postgresql+psycopg2://{}:{}@{}{}/{}".format( # Table in DB con.info.user, # <Username>: for DB con.info.password, # Password for DB con.info.host if con.info.host != "/tmp" else "", # @<Hostname> (":" + str(con.info.port)) if con.info.host != "/tmp" else "", # <port> con.info.dbname, # Table in DB ) except ImportError: pass # In the case that we are given a SQLAlchemy Connection or Engine, the objects # are not pickleable. We have to convert it to the URL string and connect from # each of the workers. 
if not isinstance(con, str): warnings.warn( "To use parallel implementation of `read_sql`, pass the sqlalchemy" "connection string instead of {}.".format(type(con))) return cls.single_worker_read(sql, con=con, index_col=index_col, **kwargs) row_cnt_query = "SELECT COUNT(*) FROM ({}) as foo".format(sql) row_cnt = pandas.read_sql(row_cnt_query, con).squeeze() cols_names_df = pandas.read_sql( "SELECT * FROM ({}) as foo LIMIT 0".format(sql), con, index_col=index_col) cols_names = cols_names_df.columns num_partitions = NPartitions.get() partition_ids = [] index_ids = [] dtype_ids = [] limit = math.ceil(row_cnt / num_partitions) for part in range(num_partitions): offset = part * limit query = "SELECT * FROM ({}) as foo LIMIT {} OFFSET {}".format( sql, limit, offset) partition_id = cls.deploy( cls.parse, num_partitions + 2, dict( num_splits=num_partitions, sql=query, con=con, index_col=index_col, **kwargs, ), ) partition_ids.append( [cls.frame_partition_cls(obj) for obj in partition_id[:-2]]) index_ids.append(partition_id[-2]) dtype_ids.append(partition_ids[-1]) if index_col is None: # sum all lens returned from partitions index_lens = cls.materialize(index_ids) new_index = pandas.RangeIndex(sum(index_lens)) else: # concat index returned from partitions index_lst = [ x for part_index in cls.materialize(index_ids) for x in part_index ] new_index = pandas.Index(index_lst).set_names(index_col) new_frame = cls.frame_cls(np.array(partition_ids), new_index, cols_names) new_frame.synchronize_labels(axis=0) return cls.query_compiler_cls(new_frame)
def read_sql(
    cls,
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    columns=None,
    chunksize=None,
    partition_column=None,
    lower_bound=None,
    upper_bound=None,
    max_sessions=None,
):
    """
    Read SQL query or database table into a DataFrame.

    The function is extended with `Spark-like parameters
    <https://spark.apache.org/docs/2.0.0/api/R/read.jdbc.html>`_ such as
    ``partition_column``, ``lower_bound`` and ``upper_bound``, which let the
    user specify how the imported data is partitioned.

    Parameters
    ----------
    sql : str or SQLAlchemy Selectable (select or text object)
        SQL query to be executed or a table name.
    con : SQLAlchemy connectable or str
        Connection to database (sqlite3 connections are not supported).
    index_col : str or list of str, optional
        Column(s) to set as index(MultiIndex).
    coerce_float : bool, default: True
        Attempts to convert values of non-string, non-numeric objects
        (like decimal.Decimal) to floating point.
    params : list, tuple or dict, optional
        Parameters to pass to the ``execute`` method; the syntax is database
        driver dependent (see PEP 249's paramstyle).
    parse_dates : list or dict, optional
        List of column names to parse as dates, a dict of
        `{column_name: format string}`, or a dict of
        `{column_name: arg dict}` for ``pandas.to_datetime``.
    columns : list, optional
        Column names to select from SQL table (only used when reading a table).
    chunksize : int, optional
        If specified, return an iterator where `chunksize` is the number
        of rows to include in each chunk.
    partition_column : str, optional
        Column name used for data partitioning between the workers
        (MUST be an INTEGER column).
    lower_bound : int, optional
        The minimum value to be requested from the `partition_column`.
    upper_bound : int, optional
        The maximum value to be requested from the `partition_column`.
    max_sessions : int, optional
        The maximum number of simultaneous connections allowed to use.

    Returns
    -------
    BaseQueryCompiler
        A new query compiler with imported data for further processing.
    """
    from .sql import is_distributed, get_query_info

    if not is_distributed(partition_column, lower_bound, upper_bound):
        warnings.warn("Defaulting to Modin core implementation")
        return PandasOnRayIO.read_sql(
            sql,
            con,
            index_col,
            coerce_float=coerce_float,
            params=params,
            parse_dates=parse_dates,
            columns=columns,
            chunksize=chunksize,
        )

    # starts the distributed alternative
    cols_names, query = get_query_info(sql, con, partition_column)
    num_parts = min(NPartitions.get(), max_sessions if max_sessions else 1)
    num_splits = min(len(cols_names), num_parts)

    # Spread the inclusive [lower_bound, upper_bound] range over the tasks;
    # the first `extra` tasks each take one additional value.
    span = (upper_bound - lower_bound) + 1
    base_size, extra = divmod(span, num_parts)

    partition_ids = []
    index_ids = []
    start = lower_bound
    for part in range(num_parts):
        size = base_size + 1 if part < extra else base_size
        end = start + size - 1
        remote_reader = _read_sql_with_offset_pandas_on_ray.options(
            num_returns=num_splits + 1)
        partition_id = remote_reader.remote(
            partition_column,
            start,
            end,
            num_splits,
            query,
            con,
            index_col,
            coerce_float,
            params,
            parse_dates,
            columns,
            chunksize,
        )
        # The last returned object goes to `index_ids`; the others are data.
        partition_ids.append(
            [PandasOnRayDataframePartition(obj) for obj in partition_id[:-1]]
        )
        index_ids.append(partition_id[-1])
        start = end + 1

    new_index = pandas.RangeIndex(sum(ray.get(index_ids)))
    new_frame = cls.frame_cls(np.array(partition_ids), new_index, cols_names)
    new_query_compiler = cls.query_compiler_cls(new_frame)
    new_query_compiler._modin_frame.synchronize_labels(axis=0)
    return new_query_compiler
import os
import logging
import modin.pandas as pd
import pandas
import numpy as np
import uuid

# Bounds named for random data generation (used outside this excerpt).
RAND_LOW = 0
RAND_HIGH = 100
# Fixed seed so generated data is reproducible between runs.
random_state = np.random.RandomState(seed=42)

# Number of partitions: taken from `modin.config` when it can be imported,
# otherwise from the `DEFAULT_NPARTITIONS` attribute of `modin.pandas`.
try:
    from modin.config import NPartitions

    NPARTITIONS = NPartitions.get()
except ImportError:
    NPARTITIONS = pd.DEFAULT_NPARTITIONS

# Benchmark implementation and dataset size: taken from `modin.config` when
# available, otherwise from environment variables with defaults.
try:
    from modin.config import TestDatasetSize, AsvImplementation

    ASV_USE_IMPL = AsvImplementation.get()
    # A falsy configured size falls back to "Small".
    ASV_DATASET_SIZE = TestDatasetSize.get() or "Small"
except ImportError:
    # The same benchmarking code can be run for different versions of Modin, so in
    # case of an error importing important variables, we'll just use predefined values
    ASV_USE_IMPL = os.environ.get("MODIN_ASV_USE_IMPL", "modin")
    ASV_DATASET_SIZE = os.environ.get("MODIN_TEST_DATASET_SIZE", "Small")

# Only these two implementations are accepted.
assert ASV_USE_IMPL in ("modin", "pandas")
def _read(cls, path_or_buf, **kwargs):
    """
    Read data from `path_or_buf` according to the passed `read_json` `kwargs` parameters.

    The parallel path is taken only for an existing path with ``lines=True``;
    every other input is delegated to ``cls.single_worker_read``.

    Parameters
    ----------
    path_or_buf : str, path object or file-like object
        `path_or_buf` parameter of `read_json` function.
    **kwargs : dict
        Parameters of `read_json` function.

    Returns
    -------
    BaseQueryCompiler
        Query compiler with imported data for further processing.
    """
    path_or_buf = cls.get_path_or_buffer(path_or_buf)
    if isinstance(path_or_buf, str):
        if not cls.file_exists(path_or_buf):
            return cls.single_worker_read(path_or_buf, **kwargs)
        path_or_buf = cls.get_path(path_or_buf)
    elif not cls.pathlib_or_pypath(path_or_buf):
        return cls.single_worker_read(path_or_buf, **kwargs)
    if not kwargs.get("lines", False):
        return cls.single_worker_read(path_or_buf, **kwargs)

    # Open the file the same way for the column probe and for partitioning,
    # so a caller-supplied `compression` is honoured in both places.
    compression = kwargs.get("compression", "infer")

    # The first line alone is parsed to discover the column labels.
    with OpenFile(path_or_buf, "rb", compression) as f:
        columns = pandas.read_json(BytesIO(b"" + f.readline()),
                                   lines=True).columns
    kwargs["columns"] = columns
    empty_pd_df = pandas.DataFrame(columns=columns)

    with OpenFile(path_or_buf, "rb", compression) as f:
        partition_ids = []
        index_ids = []
        dtypes_ids = []

        column_widths, num_splits = cls._define_metadata(
            empty_pd_df, columns)

        args = {"fname": path_or_buf, "num_splits": num_splits, **kwargs}

        splits = cls.partitioned_file(
            f,
            num_partitions=NPartitions.get(),
        )
        for start, end in splits:
            args.update({"start": start, "end": end})
            partition_id = cls.deploy(cls.parse,
                                      num_returns=num_splits + 3,
                                      **args)
            partition_ids.append(partition_id[:-3])
            index_ids.append(partition_id[-3])
            dtypes_ids.append(partition_id[-2])

    # partition_id[-1] contains the columns for each partition, which will be useful
    # for implementing when `lines=False`.
    row_lengths = cls.materialize(index_ids)
    new_index = pandas.RangeIndex(sum(row_lengths))

    dtypes = cls.get_dtypes(dtypes_ids)
    partition_ids = cls.build_partition(partition_ids, row_lengths,
                                        column_widths)

    # Label the dtypes with the column names discovered above.
    if isinstance(dtypes, pandas.Series):
        dtypes.index = columns
    else:
        dtypes = pandas.Series(dtypes, index=columns)

    new_frame = cls.frame_cls(
        np.array(partition_ids),
        new_index,
        columns,
        row_lengths,
        column_widths,
        dtypes=dtypes,
    )
    new_frame.synchronize_labels(axis=0)
    return cls.query_compiler_cls(new_frame)
import numpy as np import pandas from .utils import generate_dataframe, RAND_LOW, RAND_HIGH, random_string from modin.config import NPartitions try: from modin.config import TestDatasetSize, AsvImplementation ASV_USE_IMPL = AsvImplementation.get() ASV_DATASET_SIZE = TestDatasetSize.get() except ImportError: # The same benchmarking code can be run for different versions of Modin, so in # case of an error importing important variables, we'll just use predefined values ASV_USE_IMPL = "modin" ASV_DATASET_SIZE = "Big" if NPartitions.get() >= 32 else "Small" if ASV_DATASET_SIZE == "Big": BINARY_OP_DATA_SIZE = [ (5000, 5000, 5000, 5000), # the case extremely inefficient # (20, 500_000, 10, 1_000_000), (500_000, 20, 1_000_000, 10), ] UNARY_OP_DATA_SIZE = [ (5000, 5000), # the case extremely inefficient # (10, 1_000_000), (1_000_000, 10), ] else:
def _read(cls, io, **kwargs):
    """
    Read a single Excel sheet into a query compiler, in parallel when possible.

    The call is delegated to ``cls.single_worker_read`` when an engine other
    than ``openpyxl`` is requested, when Python is older than 3.7, or when
    `sheet_name` is ``None`` or a list. Otherwise the sheet XML is cut into
    byte ranges that end on a ``</row>`` tag and every range is parsed by a
    task launched through ``cls.deploy``.

    Parameters
    ----------
    io : str or file-like object
        Workbook to read.
    **kwargs : dict
        ``read_excel`` keyword arguments. This dict is also updated with the
        values passed on to the tasks (``sheet_name``, ``_header``, ``fname``,
        ``skiprows``, ``num_splits``, ``start``, ``end``).

    Returns
    -------
    object
        Result of ``cls.single_worker_read`` on a fallback path, otherwise
        the object built by ``cls.query_compiler_cls``.

    Raises
    ------
    ValueError
        If the requested sheet cannot be found in the archive.
    """
    if (kwargs.get("engine", None) is not None
            and kwargs.get("engine") != "openpyxl"):
        warnings.warn(
            "Modin only implements parallel `read_excel` with `openpyxl` engine, "
            'please specify `engine=None` or `engine="openpyxl"` to '
            "use Modin's parallel implementation.")
        return cls.single_worker_read(io, **kwargs)
    if sys.version_info < (3, 7):
        warnings.warn(
            "Python 3.7 or higher required for parallel `read_excel`.")
        return cls.single_worker_read(io, **kwargs)

    from zipfile import ZipFile
    from openpyxl.worksheet.worksheet import Worksheet
    from openpyxl.worksheet._reader import WorksheetReader
    from openpyxl.reader.excel import ExcelReader
    from modin.backends.pandas.parsers import PandasExcelParser

    sheet_name = kwargs.get("sheet_name", 0)
    if sheet_name is None or isinstance(sheet_name, list):
        warnings.warn(
            "`read_excel` functionality is only implemented for a single sheet at a "
            "time. Multiple sheet reading coming soon!")
        return cls.single_worker_read(io, **kwargs)

    # NOTE(review): the address in the message below looks like it was
    # replaced by an e-mail obfuscation filter — confirm the intended text.
    warnings.warn("Parallel `read_excel` is a new feature! Please email "
                  "[email protected] if you run into any problems.")

    # NOTE: ExcelReader() in read-only mode does not close file handle by itself
    # work around that by passing file object if we received some path
    io_file = open(io, "rb") if isinstance(io, str) else io
    try:
        ex = ExcelReader(io_file, read_only=True)
        ex.read()
        wb = ex.wb

        # Get shared strings
        ex.read_manifest()
        ex.read_strings()
        ws = Worksheet(wb)
    finally:
        if isinstance(io, str):
            # close only if it were us who opened the object
            io_file.close()

    pandas_kw = dict(kwargs)  # preserve original kwargs
    with ZipFile(io) as z:
        from io import BytesIO

        # Convert index to sheet name in file
        if isinstance(sheet_name, int):
            sheet_name = "sheet{}".format(sheet_name + 1)
        else:
            sheet_name = "sheet{}".format(
                wb.sheetnames.index(sheet_name) + 1)
        # Archive members may spell the name as "sheetN" or "SheetN".
        if any(sheet_name.lower() in name for name in z.namelist()):
            sheet_name = sheet_name.lower()
        elif any(sheet_name.title() in name for name in z.namelist()):
            sheet_name = sheet_name.title()
        else:
            raise ValueError("Sheet {} not found".format(
                sheet_name.lower()))
        # Pass this value to the workers
        kwargs["sheet_name"] = sheet_name

        # Load the whole sheet XML into memory so it can be seeked freely.
        f = z.open("xl/worksheets/{}.xml".format(sheet_name))
        f = BytesIO(f.read())
        total_bytes = cls.file_size(f)

        num_partitions = NPartitions.get()
        # Read some bytes from the sheet so we can extract the XML header and first
        # line. We need to make sure we get the first line of the data as well
        # because that is where the column names are. The header information will
        # be extracted and sent to all of the nodes.
        sheet_block = f.read(EXCEL_READ_BLOCK_SIZE)
        end_of_row_tag = b"</row>"
        while end_of_row_tag not in sheet_block:
            sheet_block += f.read(EXCEL_READ_BLOCK_SIZE)
        idx_of_header_end = sheet_block.index(end_of_row_tag) + len(
            end_of_row_tag)
        sheet_header = sheet_block[:idx_of_header_end]
        # Reset the file pointer to begin at the end of the header information.
        f.seek(idx_of_header_end)
        kwargs["_header"] = sheet_header
        footer = b"</sheetData></worksheet>"
        # Use openpyxml to parse the data
        reader = WorksheetReader(ws, BytesIO(sheet_header + footer),
                                 ex.shared_strings, False)
        # Attach cells to the worksheet
        reader.bind_cells()
        data = PandasExcelParser.get_sheet_data(
            ws, kwargs.get("convert_float", True))
        # Extract column names from parsed data.
        column_names = pandas.Index(data[0])
        index_col = kwargs.get("index_col", None)
        # Remove column names that are specified as `index_col`
        if index_col is not None:
            column_names = column_names.drop(column_names[index_col])

        if not all(column_names):
            # some column names are empty, use pandas reader to take the names from it
            pandas_kw["nrows"] = 1
            df = pandas.read_excel(io, **pandas_kw)
            column_names = df.columns

        # Compute partition metadata upfront so it is uniform for all partitions
        chunk_size = max(1, (total_bytes - f.tell()) // num_partitions)
        num_splits = min(len(column_names), num_partitions)
        kwargs["fname"] = io
        # Skiprows will be used to inform a partition how many rows come before it.
        kwargs["skiprows"] = 0
        rows_to_skip = 0
        data_ids = []
        index_ids = []
        dtypes_ids = []

        # Compute column metadata
        column_chunksize = compute_chunksize(
            pandas.DataFrame(columns=column_names), num_splits, axis=1)
        if column_chunksize > len(column_names):
            column_widths = [len(column_names)]
            # This prevents us from unnecessarily serializing a bunch of empty
            # objects.
            num_splits = 1
        else:
            column_widths = [
                column_chunksize
                if len(column_names) > (column_chunksize * (i + 1))
                else 0
                if len(column_names) < (column_chunksize * i)
                else len(column_names) - (column_chunksize * i)
                for i in range(num_splits)
            ]
        kwargs["num_splits"] = num_splits

        while f.tell() < total_bytes:
            # `args` is the same dict object as `kwargs`; it is updated in
            # place before every deploy call.
            args = kwargs
            args["skiprows"] = rows_to_skip
            args["start"] = f.tell()
            chunk = f.read(chunk_size)
            # This edge case can happen when we have reached the end of the data
            # but not the end of the file.
            if b"<row" not in chunk:
                break
            row_close_tag = b"</row>"
            row_count = re.subn(row_close_tag, b"", chunk)[1]

            # Make sure we are reading at least one row.
            while row_count == 0:
                chunk += f.read(chunk_size)
                row_count += re.subn(row_close_tag, b"", chunk)[1]

            # Move the file position back to just after the last complete row
            # so the next chunk starts on a row boundary.
            last_index = chunk.rindex(row_close_tag)
            f.seek(-(len(chunk) - last_index) + len(row_close_tag), 1)
            args["end"] = f.tell()

            # If there is no data, exit before triggering computation.
            if b"</row>" not in chunk and b"</sheetData>" in chunk:
                break
            # We need to make sure we include all rows, even those that have no
            # data. Getting the number of the last row will turn into the number of
            # skipped rows, so if there are any rows missing between the last row
            # seen here and the first row the next partition reads, the parser will
            # have to include those rows in that specific partition to match the
            # expected behavior. We subtract 1 here because the header is included
            # in the skip values, and we do not want to skip the header.
            rows_to_skip = (int(chunk[:last_index + len(row_close_tag)].
                                split(b'<row r="')[-1].split(b'"')[0]) - 1)
            remote_results_list = cls.deploy(cls.parse, num_splits + 2,
                                             args)
            data_ids.append(remote_results_list[:-2])
            index_ids.append(remote_results_list[-2])
            dtypes_ids.append(remote_results_list[-1])

            # The end of the spreadsheet
            if b"</sheetData>" in chunk:
                break

    # Compute the index based on a sum of the lengths of each partition (by default)
    # or based on the column(s) that were requested.
    if index_col is None:
        row_lengths = cls.materialize(index_ids)
        new_index = pandas.RangeIndex(sum(row_lengths))
    else:
        index_objs = cls.materialize(index_ids)
        row_lengths = [len(o) for o in index_objs]
        new_index = index_objs[0].append(index_objs[1:])

    # Compute dtypes by getting collecting and combining all of the partitions. The
    # reported dtypes from differing rows can be different based on the inference in
    # the limited data seen by each worker. We use pandas to compute the exact dtype
    # over the whole column for each column. The index is set below.
    dtypes = cls.get_dtypes(dtypes_ids)

    data_ids = cls.build_partition(data_ids, row_lengths, column_widths)
    # Set the index for the dtypes to the column names
    if isinstance(dtypes, pandas.Series):
        dtypes.index = column_names
    else:
        dtypes = pandas.Series(dtypes, index=column_names)
    new_frame = cls.frame_cls(
        data_ids,
        new_index,
        column_names,
        row_lengths,
        column_widths,
        dtypes=dtypes,
    )
    new_query_compiler = cls.query_compiler_cls(new_frame)
    if index_col is None:
        new_query_compiler._modin_frame._apply_index_objs(axis=0)
    return new_query_compiler
def read_sql(
    cls,
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    columns=None,
    chunksize=None,
    partition_column=None,
    lower_bound=None,
    upper_bound=None,
    max_sessions=None,
):
    """Read SQL query or database table into a query compiler.

    When `partition_column`, `lower_bound` and `upper_bound` describe a
    distributable read (as decided by ``is_distributed``), the inclusive range
    ``[lower_bound, upper_bound]`` is cut into contiguous sub-ranges and one
    remote task is launched per sub-range. Otherwise the call is forwarded to
    ``PandasOnRayIO.read_sql``.

    Args:
        sql: string or SQLAlchemy Selectable (select or text object) SQL query to be
            executed or a table name.
        con: SQLAlchemy connectable (engine/connection) or database string URI or
            DBAPI2 connection (fallback mode).
        index_col: Column(s) to set as index(MultiIndex).
        coerce_float: Attempts to convert values of non-string, non-numeric objects
            (like decimal.Decimal) to floating point, useful for SQL result sets.
        params: List of parameters to pass to execute method. The syntax used to pass
            parameters is database driver dependent. Check your database driver
            documentation for which of the five syntax styles, described in PEP 249's
            paramstyle, is supported.
        parse_dates:
            - List of column names to parse as dates.
            - Dict of ``{column_name: format string}`` where format string is
              strftime compatible in case of parsing string times, or is one of
              (D, s, ns, ms, us) in case of parsing integer timestamps.
            - Dict of ``{column_name: arg dict}``, where the arg dict corresponds
              to the keyword arguments of :func:`pandas.to_datetime`.
              Especially useful with databases without native Datetime support,
              such as SQLite.
        columns: List of column names to select from SQL table (only used when
            reading a table).
        chunksize: If specified, return an iterator where `chunksize` is the number
            of rows to include in each chunk.
        partition_column: column used to share the data between the workers
            (MUST be an INTEGER column).
        lower_bound: the minimum value to be requested from the partition_column.
        upper_bound: the maximum value to be requested from the partition_column.
        max_sessions: the maximum number of simultaneous connections allowed to use.
            When omitted (or falsy) a single session is used.

    Returns:
        The object produced by ``cls.query_compiler_cls`` wrapping the new frame
        (distributed path), or whatever ``PandasOnRayIO.read_sql`` returns
        (fallback path).
    """
    from .sql import is_distributed, get_query_info

    if not is_distributed(partition_column, lower_bound, upper_bound):
        warnings.warn("Defaulting to Modin core implementation")
        return PandasOnRayIO.read_sql(
            sql,
            con,
            index_col,
            coerce_float=coerce_float,
            params=params,
            parse_dates=parse_dates,
            columns=columns,
            chunksize=chunksize,
        )
    # starts the distributed alternative
    cols_names, query = get_query_info(sql, con, partition_column)
    # `max_sessions` defaults to None and `min(int, None)` raises TypeError,
    # so an unspecified limit is treated as a single session.
    num_parts = min(NPartitions.get(), max_sessions if max_sessions else 1)
    num_splits = min(len(cols_names), num_parts)
    # Number of distinct values in the inclusive [lower_bound, upper_bound] range.
    diff = (upper_bound - lower_bound) + 1
    # Every task gets `min_size` values; the first `rest` tasks get one extra.
    min_size, rest = divmod(diff, num_parts)
    partition_ids = []
    index_ids = []
    end = lower_bound - 1
    for part in range(num_parts):
        if rest:
            size = min_size + 1
            rest -= 1
        else:
            size = min_size
        start = end + 1
        end = start + size - 1
        # The task returns `num_splits` data objects followed by one index object.
        partition_id = _read_sql_with_offset_pandas_on_ray._remote(
            args=(
                partition_column,
                start,
                end,
                num_splits,
                query,
                con,
                index_col,
                coerce_float,
                params,
                parse_dates,
                columns,
                chunksize,
            ),
            num_returns=num_splits + 1,
        )
        partition_ids.append(
            [PandasOnRayFramePartition(obj) for obj in partition_id[:-1]]
        )
        index_ids.append(partition_id[-1])
    # The materialized index objects are summed to size a fresh RangeIndex.
    new_index = pandas.RangeIndex(sum(ray.get(index_ids)))
    return cls.query_compiler_cls(
        cls.frame_cls(np.array(partition_ids), new_index, cols_names)
    )
def partitioned_file(
    cls,
    f,
    num_partitions: int = None,
    nrows: int = None,
    skiprows: int = None,
    quotechar: bytes = b'"',
    is_quoting: bool = True,
):
    """
    Compute byte ranges of `f` that each partition should read.

    Parameters
    ----------
    f : file-like object
        File to be partitioned.
    num_partitions : int, optional
        For what number of partitions split a file.
        If not specified grabs the value from `modin.config.NPartitions.get()`.
    nrows : int, optional
        Number of rows of file to read.
    skiprows : array or callable, optional
        Specifies rows to skip.
    quotechar : bytes, default: b'"'
        Indicate quote in a file.
    is_quoting : bool, default: True
        Whether or not to consider quotes.

    Returns
    -------
    list
        List of ``(start, end)`` tuples with the beginning and the end
        offsets of every chunk.
    """
    if num_partitions is None:
        num_partitions = NPartitions.get()

    skip = cls.rows_skipper_builder(f, quotechar, is_quoting=is_quoting)
    chunks = []
    total_bytes = cls.file_size(f)
    skip(skiprows)
    chunk_start = f.tell()

    if not nrows:
        # No row limit: walk the file in steps of roughly equal byte size.
        step = max(1, num_partitions, total_bytes // num_partitions)
        while f.tell() < total_bytes:
            balanced = cls.offset(
                f,
                offset_size=step,
                quotechar=quotechar,
                is_quoting=is_quoting,
            )
            chunk_end = f.tell()
            chunks.append((chunk_start, chunk_end))
            chunk_start = chunk_end
            # add outside_quotes
            if is_quoting and not balanced:
                warnings.warn("File has mismatched quotes")
        return chunks

    # Row limit given: walk the file in steps of roughly equal row count.
    rows_seen = 0
    step = max(1, num_partitions, nrows // num_partitions)
    while f.tell() < total_bytes and rows_seen < nrows:
        # Never ask for more rows than are still allowed
        # (it's possible only if is_quoting==True).
        step = min(step, nrows - rows_seen)
        balanced, rows_read = cls._read_rows(
            f,
            nrows=step,
            quotechar=quotechar,
            is_quoting=is_quoting,
        )
        chunk_end = f.tell()
        chunks.append((chunk_start, chunk_end))
        chunk_start = chunk_end
        rows_seen += rows_read
        # add outside_quotes
        if is_quoting and not balanced:
            warnings.warn("File has mismatched quotes")
    return chunks
def _read(cls, filepath_or_buffer, **kwargs): filepath_or_buffer_md = (cls.get_path(filepath_or_buffer) if isinstance(filepath_or_buffer, str) else cls.get_path_or_buffer(filepath_or_buffer)) compression_infered = cls.infer_compression(filepath_or_buffer, kwargs.get("compression")) use_modin_impl = cls._read_csv_check_support(filepath_or_buffer, kwargs, compression_infered) if not use_modin_impl: return cls.single_worker_read(filepath_or_buffer, **kwargs) # Getting frequently used read_csv kwargs names = kwargs.get("names", None) index_col = kwargs.get("index_col", None) encoding = kwargs.get("encoding", None) skiprows = kwargs.get("skiprows") is_quoting = kwargs.get("quoting", "") != QUOTE_NONE quotechar = kwargs.get( "quotechar", '"').encode(encoding if encoding is not None else "UTF-8") # Define header size for further skipping (Header can be skipped because header # information will be obtained further from empty_df, so no need to handle it # by workers) header_size = cls._define_header_size( kwargs.get("header", "infer"), names, ) # Since skiprows can be only integer here (non-integer skiprows trigger fallback # to pandas implementation for now) we can process header_size and skiprows # simultaneously skiprows = skiprows + header_size if skiprows else header_size # Now we need to define parameters, which are common for all partitions. These # parameters can be `sniffed` from empty dataframes created further if names is None: # For the sake of the empty df, we assume no `index_col` to get the correct # column names before we build the index. 
Because we pass `names` in, this # step has to happen without removing the `index_col` otherwise it will not # be assigned correctly names = pandas.read_csv( filepath_or_buffer, **dict(kwargs, usecols=None, nrows=0, skipfooter=0, index_col=None), ).columns elif index_col is None and not kwargs.get("usecols", None): # When names is set to some list that is smaller than the number of columns # in the file, the first columns are built as a hierarchical index. empty_pd_df = pandas.read_csv(filepath_or_buffer, nrows=0, encoding=encoding) num_cols = len(empty_pd_df.columns) if num_cols > len(names): index_col = list(range(num_cols - len(names))) if len(index_col) == 1: index_col = index_col[0] empty_pd_df = pandas.read_csv( filepath_or_buffer, **dict(kwargs, nrows=0, skipfooter=0, index_col=index_col), ) column_names = empty_pd_df.columns # Max number of partitions available num_partitions = NPartitions.get() # This is the number of splits for the columns num_splits = min(len(column_names), num_partitions) # Metadata definition column_widths, num_splits = cls._define_metadata( empty_pd_df, num_splits, column_names) # kwargs that will be passed to the workers partition_kwargs = dict( kwargs, fname=filepath_or_buffer_md, num_splits=num_splits, header=None, names=names, skipfooter=0, skiprows=1 if encoding is not None else None, nrows=None, compression=compression_infered, index_col=index_col, ) with cls.file_open(filepath_or_buffer_md, "rb", compression_infered) as f: splits = cls.partitioned_file( f, num_partitions=num_partitions, nrows=kwargs.get("nrows", None), skiprows=skiprows, quotechar=quotechar, is_quoting=is_quoting, ) partition_ids, index_ids, dtypes_ids = cls._launch_tasks( splits, **partition_kwargs) new_query_compiler = cls._get_new_qc( partition_ids=partition_ids, index_ids=index_ids, dtypes_ids=dtypes_ids, index_col_md=index_col, index_name=empty_pd_df.index.name, column_widths=column_widths, column_names=column_names, squeeze=kwargs.get("squeeze", False), 
skipfooter=kwargs.get("skipfooter", None), parse_dates=kwargs.get("parse_dates", False), ) return new_query_compiler
def _read(cls, filepath_or_buffer, columns, custom_parser, **kwargs):
    r"""
    Read data from `filepath_or_buffer` according to the passed `read_custom_text` `kwargs` parameters.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        `filepath_or_buffer` parameter of `read_custom_text` function.
    columns : list or callable(file-like object, \*\*kwargs) -> list
        Column names of list type or callable that create column names from opened file
        and passed `kwargs`.
    custom_parser : callable(file-like object, \*\*kwargs) -> pandas.DataFrame
        Function that takes as input a part of the `filepath_or_buffer` file loaded into
        memory in file-like object form.
    **kwargs : dict
        Parameters of `read_custom_text` function. Must contain the
        "compression", "is_quoting" and "nrows" keys.

    Returns
    -------
    BaseQueryCompiler
        Query compiler with imported data for further processing.
    """
    if isinstance(filepath_or_buffer, str):
        filepath_or_buffer_md = cls.get_path(filepath_or_buffer)
    else:
        filepath_or_buffer_md = cls.get_path_or_buffer(filepath_or_buffer)
    compression_infered = cls.infer_compression(
        filepath_or_buffer, kwargs["compression"]
    )

    with OpenFile(filepath_or_buffer_md, "rb", compression_infered) as f:
        partitions_count = NPartitions.get()
        # "is_quoting" is consumed here so it is not forwarded to `columns`
        # or to the workers through `kwargs`.
        quoting_flag = kwargs.pop("is_quoting")
        row_limit = kwargs["nrows"]
        splits = cls.partitioned_file(
            f,
            num_partitions=partitions_count,
            is_quoting=quoting_flag,
            nrows=row_limit,
        )

    # Column names may be computed lazily from the file contents.
    if callable(columns):
        with OpenFile(filepath_or_buffer_md, "rb", compression_infered) as f:
            columns = columns(f, **kwargs)
    if not isinstance(columns, pandas.Index):
        columns = pandas.Index(columns)

    empty_pd_df = pandas.DataFrame(columns=columns)
    index_name = empty_pd_df.index.name
    column_widths, num_splits = cls._define_metadata(empty_pd_df, columns)

    # kwargs that will be passed to the workers
    partition_kwargs = dict(
        kwargs,
        fname=filepath_or_buffer_md,
        num_splits=num_splits,
        nrows=None,
        compression=compression_infered,
    )

    partition_ids, index_ids, dtypes_ids = cls._launch_tasks(
        splits, callback=custom_parser, **partition_kwargs
    )

    return cls._get_new_qc(
        partition_ids=partition_ids,
        index_ids=index_ids,
        dtypes_ids=dtypes_ids,
        index_col=None,
        index_name=index_name,
        column_widths=column_widths,
        column_names=columns,
        nrows=kwargs["nrows"],
    )
def partitioned_file(
    cls,
    files,
    fnames: List[str],
    num_partitions: int = None,
    nrows: int = None,
    skiprows: int = None,
    skip_header: int = None,
    quotechar: bytes = b'"',
    is_quoting: bool = True,
) -> List[List[Tuple[str, int, int]]]:
    """
    Compute chunk sizes in bytes for every partition.

    Parameters
    ----------
    files : file or list of files
        File(s) to be partitioned.
    fnames : str or list of str
        File name(s) to be partitioned.
    num_partitions : int, optional
        For what number of partitions split a file.
        If not specified grabs the value from `modin.config.NPartitions.get()`.
    nrows : int, optional
        Number of rows of file to read.
    skiprows : int, optional
        Number of data rows to skip, counted across all files.
    skip_header : int, optional
        Number of header rows to skip at the beginning of every file.
    quotechar : bytes, default: b'"'
        Indicate quote in a file.
    is_quoting : bool, default: True
        Whether or not to consider quotes.

    Returns
    -------
    list
        List, where each element of the list is a list of tuples. The inner lists
        of tuples contains the data file name of the chunk, chunk start offset, and
        chunk end offsets for its corresponding file.

    Notes
    -----
    The logic gets really complicated if we try to use the
    `TextFileDispatcher.partitioned_file`.

    A split is measured in rows when `nrows` is given and in bytes otherwise;
    the two units are never mixed.
    """
    if not isinstance(files, list):
        files = [files]
    # A bare string would otherwise be zipped character by character.
    if isinstance(fnames, str):
        fnames = [fnames]

    if num_partitions is None:
        num_partitions = NPartitions.get()

    file_sizes = [cls.file_size(f) for f in files]
    partition_size = max(
        1, num_partitions, (nrows if nrows else sum(file_sizes)) // num_partitions
    )

    header_rows = skip_header if skip_header else 0
    rows_left_to_skip = skiprows if skiprows else 0

    result = []
    split_result = []
    split_size = 0
    read_rows_counter = 0
    for f, fname, f_size in zip(files, fnames, file_sizes):
        if rows_left_to_skip or header_rows:
            skip_amount = rows_left_to_skip + header_rows

            # TODO(williamma12): Handle when skiprows > number of rows in file. Currently returns empty df.
            outside_quotes, read_rows = cls._read_rows(
                f,
                nrows=skip_amount,
                quotechar=quotechar,
                is_quoting=is_quoting,
            )
            if rows_left_to_skip:
                # Only data rows count against `skiprows`. Subtracting the header
                # rows as well would drive the counter negative and cancel the
                # header skip of every following file.
                rows_left_to_skip -= max(0, read_rows - header_rows)
                if rows_left_to_skip > 0:
                    # We have more rows to skip than the amount read in the file.
                    continue

        start = f.tell()

        while f.tell() < f_size:
            if split_size >= partition_size:
                # Create a new split when the split has reached partition_size.
                # This is mainly used when we are reading row-wise partitioned files.
                result.append(split_result)
                split_result = []
                split_size = 0

            # We calculate the amount that we need to read based off of how much
            # of the split we have already read.
            read_size = partition_size - split_size

            if nrows:
                if read_rows_counter >= nrows:
                    # Finish when we have read enough rows.
                    if split_result:
                        # Add last split into the result.
                        result.append(split_result)
                    return result
                # Ensure that we will not read more than nrows.
                read_size = min(read_size, nrows - read_rows_counter)

                outside_quotes, read_rows = cls._read_rows(
                    f,
                    nrows=read_size,
                    quotechar=quotechar,
                    is_quoting=is_quoting,
                )
                # Row mode: the split grows by the number of rows read.
                split_size += read_rows
                read_rows_counter += read_rows
            else:
                outside_quotes = cls.offset(
                    f,
                    offset_size=read_size,
                    quotechar=quotechar,
                    is_quoting=is_quoting,
                )
                # Byte mode: the split grows by the number of bytes consumed.
                split_size += f.tell() - start

            split_result.append((fname, start, f.tell()))
            start = f.tell()

            # Add outside_quotes.
            if is_quoting and not outside_quotes:
                warnings.warn("File has mismatched quotes")

    # Add last split into the result.
    if split_result:
        result.append(split_result)

    return result
def partitioned_file(
    cls,
    f,
    num_partitions: int = None,
    nrows: int = None,
    skiprows: int = None,
    quotechar: bytes = b'"',
    is_quoting: bool = True,
    encoding: str = None,
    newline: bytes = None,
    header_size: int = 0,
    pre_reading: int = 0,
):
    """
    Compute chunk sizes in bytes for every partition.

    Parameters
    ----------
    f : file-like object
        File handle of file to be partitioned.
    num_partitions : int, optional
        For what number of partitions split a file.
        If not specified grabs the value from `modin.config.NPartitions.get()`
        (minus one when `pre_reading` is set, since the pre-read rows take a
        chunk of their own). Values below 1 are treated as 1.
    nrows : int, optional
        Number of rows of file to read.
    skiprows : int, optional
        Specifies rows to skip.
    quotechar : bytes, default: b'"'
        Indicate quote in a file.
    is_quoting : bool, default: True
        Whether or not to consider quotes.
    encoding : str, optional
        Encoding of `f`.
    newline : bytes, optional
        Byte or sequence of bytes indicating line endings.
    header_size : int, default: 0
        Number of rows, that occupied by header.
    pre_reading : int, default: 0
        Number of rows between header and skipped rows, that should be read.

    Returns
    -------
    list
        List with the next elements:
            int : partition start read byte
            int : partition end read byte
    """
    read_rows_counter = 0
    outside_quotes = True

    if num_partitions is None:
        num_partitions = (
            NPartitions.get() - 1 if pre_reading else NPartitions.get()
        )
    # `NPartitions.get() - 1` (or an explicit 0) would otherwise reach the
    # `// num_partitions` below and raise ZeroDivisionError.
    num_partitions = max(1, num_partitions)

    rows_skipper = cls.rows_skipper_builder(
        f, quotechar, is_quoting=is_quoting, encoding=encoding, newline=newline
    )
    result = []

    file_size = cls.file_size(f)

    rows_skipper(header_size)

    if pre_reading:
        # Rows located between the header and the skipped rows get a dedicated
        # chunk and count towards `nrows`.
        pre_reading_start = f.tell()
        outside_quotes, read_rows = cls._read_rows(
            f,
            nrows=pre_reading,
            quotechar=quotechar,
            is_quoting=is_quoting,
            outside_quotes=outside_quotes,
            encoding=encoding,
            newline=newline,
        )
        read_rows_counter += read_rows

        result.append((pre_reading_start, f.tell()))

        # add outside_quotes
        if is_quoting and not outside_quotes:
            warnings.warn("File has mismatched quotes")

    rows_skipper(skiprows)

    start = f.tell()

    if nrows:
        # Row-limited read: chunks hold roughly equal numbers of rows.
        partition_size = max(1, num_partitions, nrows // num_partitions)
        while f.tell() < file_size and read_rows_counter < nrows:
            if read_rows_counter + partition_size > nrows:
                # it's possible only if is_quoting==True
                partition_size = nrows - read_rows_counter
            outside_quotes, read_rows = cls._read_rows(
                f,
                nrows=partition_size,
                quotechar=quotechar,
                is_quoting=is_quoting,
                encoding=encoding,
                newline=newline,
            )
            result.append((start, f.tell()))
            start = f.tell()
            read_rows_counter += read_rows

            # add outside_quotes
            if is_quoting and not outside_quotes:
                warnings.warn("File has mismatched quotes")
    else:
        # Unlimited read: chunks hold roughly equal numbers of bytes.
        partition_size = max(1, num_partitions, file_size // num_partitions)
        while f.tell() < file_size:
            outside_quotes = cls.offset(
                f,
                offset_size=partition_size,
                quotechar=quotechar,
                is_quoting=is_quoting,
                encoding=encoding,
                newline=newline,
            )

            result.append((start, f.tell()))
            start = f.tell()

            # add outside_quotes
            if is_quoting and not outside_quotes:
                warnings.warn("File has mismatched quotes")

    return result
def _read(cls, filepath_or_buffer, **kwargs):
    """
    Read data from multiple `.csv` files passed with `filepath_or_buffer` simultaneously.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        `filepath_or_buffer` parameter of ``read_csv`` function.
    **kwargs : dict
        Parameters of ``read_csv`` function.

    Returns
    -------
    new_query_compiler : BaseQueryCompiler
        Query compiler with imported data for further processing.

    Notes
    -----
    The call is handed to ``cls.single_worker_read`` when the path does not
    exist, when the input is neither a string nor accepted by
    ``cls.pathlib_or_pypath``, when the inferred compression is not one of
    gzip/bz2/xz/zip, when `chunksize` is set, or when `skiprows` is not an
    integer.

    `kwargs` is modified along the way ("nrows", "skiprows", "parse_dates"
    and possibly "usecols" are removed, "compression" and "index_col" may be
    set), so the order of the statements below matters.
    """
    # Ensures that the file is a string file path. Otherwise, default to pandas.
    filepath_or_buffer = cls.get_path_or_buffer(filepath_or_buffer)
    if isinstance(filepath_or_buffer, str):
        # os.altsep == None on Linux
        is_folder = any(
            filepath_or_buffer.endswith(sep) for sep in (os.sep, os.altsep) if sep
        )
        if "*" not in filepath_or_buffer and not is_folder:
            warnings.warn(
                "Shell-style wildcard '*' must be in the filename pattern in order to read multiple "
                + f"files at once. Did you forget it? Passed filename: '{filepath_or_buffer}'"
            )
        if not cls.file_exists(filepath_or_buffer):
            return cls.single_worker_read(filepath_or_buffer, **kwargs)
        # NOTE(review): from here on the value is indexed with [0] and iterated,
        # so `cls.get_path` presumably returns a list of paths - confirm.
        filepath_or_buffer = cls.get_path(filepath_or_buffer)
    elif not cls.pathlib_or_pypath(filepath_or_buffer):
        return cls.single_worker_read(filepath_or_buffer, **kwargs)

    # We read multiple csv files when the file path is a list of absolute file paths.
    # We assume that all of the files will be essentially replicas of the
    # first file but with different data values.
    glob_filepaths = filepath_or_buffer
    filepath_or_buffer = filepath_or_buffer[0]

    # Compression is inferred from the first file only.
    compression_type = cls.infer_compression(
        filepath_or_buffer, kwargs.get("compression")
    )
    if compression_type is not None:
        if (
            compression_type == "gzip"
            or compression_type == "bz2"
            or compression_type == "xz"
        ):
            kwargs["compression"] = compression_type
        elif (
            compression_type == "zip"
            and sys.version_info[0] == 3
            and sys.version_info[1] >= 7
        ):
            # need python3.7 to .seek and .tell ZipExtFile
            kwargs["compression"] = compression_type
        else:
            return cls.single_worker_read(filepath_or_buffer, **kwargs)

    chunksize = kwargs.get("chunksize")
    if chunksize is not None:
        return cls.single_worker_read(filepath_or_buffer, **kwargs)

    skiprows = kwargs.get("skiprows")
    if skiprows is not None and not isinstance(skiprows, int):
        return cls.single_worker_read(filepath_or_buffer, **kwargs)

    # `nrows` is removed from `kwargs` and handed to `partitioned_file` below.
    nrows = kwargs.pop("nrows", None)
    names = kwargs.get("names", lib.no_default)
    index_col = kwargs.get("index_col", None)
    usecols = kwargs.get("usecols", None)
    encoding = kwargs.get("encoding", None)
    if names in [lib.no_default, None]:
        # For the sake of the empty df, we assume no `index_col` to get the correct
        # column names before we build the index. Because we pass `names` in, this
        # step has to happen without removing the `index_col` otherwise it will not
        # be assigned correctly.
        names = pandas.read_csv(
            filepath_or_buffer,
            **dict(kwargs, usecols=None, nrows=0, skipfooter=0, index_col=None),
        ).columns
    elif index_col is None and not usecols:
        # When names is set to some list that is smaller than the number of columns
        # in the file, the first columns are built as a hierarchical index.
        empty_pd_df = pandas.read_csv(
            filepath_or_buffer, nrows=0, encoding=encoding
        )
        num_cols = len(empty_pd_df.columns)
        if num_cols > len(names):
            index_col = list(range(num_cols - len(names)))
            if len(index_col) == 1:
                index_col = index_col[0]
            kwargs["index_col"] = index_col
    # Zero-row read of the first file: provides the final column labels and
    # the index name used further down.
    empty_pd_df = pandas.read_csv(
        filepath_or_buffer, **dict(kwargs, nrows=0, skipfooter=0)
    )
    column_names = empty_pd_df.columns
    skipfooter = kwargs.get("skipfooter", None)
    # `skiprows` is removed from `kwargs`; the rows are skipped by
    # `partitioned_file` instead.
    skiprows = kwargs.pop("skiprows", None)
    usecols_md = cls._validate_usecols_arg(usecols)
    if usecols is not None and usecols_md[1] != "integer":
        # Non-integer `usecols` are translated to positions within the full
        # column list of the first file.
        del kwargs["usecols"]
        all_cols = pandas.read_csv(
            OpenFile(filepath_or_buffer, "rb"),
            **dict(kwargs, nrows=0, skipfooter=0),
        ).columns
        usecols = all_cols.get_indexer_for(list(usecols_md[0]))
    parse_dates = kwargs.pop("parse_dates", False)
    # kwargs that will be passed to the workers
    partition_kwargs = dict(
        kwargs,
        header=None,
        names=names,
        skipfooter=0,
        skiprows=None,
        parse_dates=parse_dates,
        usecols=usecols,
    )
    encoding = kwargs.get("encoding", None)
    quotechar = kwargs.get("quotechar", '"').encode(
        encoding if encoding is not None else "UTF-8"
    )
    is_quoting = kwargs.get("quoting", "") != csv.QUOTE_NONE

    # ExitStack keeps every file open until the tasks have been launched.
    with ExitStack() as stack:
        files = [
            stack.enter_context(OpenFile(fname, "rb", compression_type))
            for fname in glob_filepaths
        ]

        # Skip the header since we already have the header information and skip the
        # rows we are told to skip.
        if isinstance(skiprows, int) or skiprows is None:
            if skiprows is None:
                skiprows = 0
            header = kwargs.get("header", "infer")
            if header == "infer" and kwargs.get("names", lib.no_default) in [
                lib.no_default,
                None,
            ]:
                skip_header = 1
            elif isinstance(header, int):
                skip_header = header + 1
            elif hasattr(header, "__iter__") and not isinstance(header, str):
                skip_header = max(header) + 1
            else:
                skip_header = 0
        if kwargs.get("encoding", None) is not None:
            partition_kwargs["skiprows"] = 1
        # Launch tasks to read partitions
        partition_ids = []
        index_ids = []
        dtypes_ids = []
        column_widths, num_splits = cls._define_metadata(empty_pd_df, column_names)
        args = {
            "num_splits": num_splits,
            **partition_kwargs,
        }

        splits = cls.partitioned_file(
            files,
            glob_filepaths,
            num_partitions=NPartitions.get(),
            nrows=nrows,
            skiprows=skiprows,
            skip_header=skip_header,
            quotechar=quotechar,
            is_quoting=is_quoting,
        )

        for chunks in splits:
            args.update({"chunks": chunks})
            # Each task result is unpacked as: data objects, then one index
            # object, then one dtypes object.
            partition_id = cls.deploy(cls.parse, num_returns=num_splits + 2, **args)
            partition_ids.append(partition_id[:-2])
            index_ids.append(partition_id[-2])
            dtypes_ids.append(partition_id[-1])

    # Compute the index based on a sum of the lengths of each partition (by default)
    # or based on the column(s) that were requested.
    if index_col is None:
        row_lengths = cls.materialize(index_ids)
        new_index = pandas.RangeIndex(sum(row_lengths))
    else:
        index_objs = cls.materialize(index_ids)
        row_lengths = [len(o) for o in index_objs]
        new_index = index_objs[0].append(index_objs[1:])
        new_index.name = empty_pd_df.index.name

    # Compute dtypes by getting collecting and combining all of the partitions. The
    # reported dtypes from differing rows can be different based on the inference in
    # the limited data seen by each worker. We use pandas to compute the exact dtype
    # over the whole column for each column. The index is set below.
    dtypes = cls.get_dtypes(dtypes_ids) if len(dtypes_ids) > 0 else None

    partition_ids = cls.build_partition(partition_ids, row_lengths, column_widths)
    # If parse_dates is present, the column names that we have might not be
    # the same length as the returned column names. If we do need to modify
    # the column names, we remove the old names from the column names and
    # insert the new one at the front of the Index.
    if parse_dates is not None:
        # We have to recompute the column widths if `parse_dates` is set because
        # we are not guaranteed to have the correct information regarding how many
        # columns are on each partition.
        column_widths = None
        if isinstance(parse_dates, list):
            for date in parse_dates:
                # Lists within the parse_dates list are sequences of
                # CSV columns that are parsed together as a single date
                # column. They can be a list of either string column names
                # or integer column indices. e.g. if parse_dates is
                # [[1, 2]] and columns at indices 1 and 2 are "b" and "c",
                # the output dataframe has the single date column "b_c". If
                # parse_dates is [["a", 1]] and the column at index 1 is
                # named "b", the output dataframe has the single date
                # column "a_b".
                if isinstance(date, list):
                    # Integer entries are replaced by column names in place,
                    # i.e. inside the list object taken from `parse_dates`.
                    for i, part in enumerate(date):
                        if isinstance(part, int):
                            date[i] = column_names[part]
                    new_col_name = "_".join(date)
                    column_names = column_names.drop(date).insert(0, new_col_name)
        elif isinstance(parse_dates, dict):
            for new_col_name, group in parse_dates.items():
                column_names = column_names.drop(group).insert(0, new_col_name)
    # Set the index for the dtypes to the column names
    if isinstance(dtypes, pandas.Series):
        dtypes.index = column_names
    else:
        dtypes = pandas.Series(dtypes, index=column_names)
    new_frame = cls.frame_cls(
        partition_ids,
        new_index,
        column_names,
        row_lengths,
        column_widths,
        dtypes=dtypes,
    )
    new_query_compiler = cls.query_compiler_cls(new_frame)

    # The footer is dropped after assembly: workers were started with
    # `skipfooter=0`.
    if skipfooter:
        new_query_compiler = new_query_compiler.drop(
            new_query_compiler.index[-skipfooter:]
        )
    if kwargs.get("squeeze", False) and len(new_query_compiler.columns) == 1:
        return new_query_compiler[new_query_compiler.columns[0]]
    if index_col is None:
        new_query_compiler._modin_frame.synchronize_labels(axis=0)
    return new_query_compiler
def _read(cls, sql, con, index_col=None, **kwargs):
    """
    Read a SQL query or database table into a query compiler.

    Parameters
    ----------
    sql : str or SQLAlchemy Selectable (select or text object)
        SQL query to be executed or a table name.
    con : SQLAlchemy connectable, str, sqlite3 connection, or ModinDatabaseConnection
        Connection object to database.
    index_col : str or list of str, optional
        Column(s) to set as index(MultiIndex).
    **kwargs : dict
        Parameters to pass into `pandas.read_sql` function.

    Returns
    -------
    BaseQueryCompiler
        Query compiler with imported data for further processing.

    Notes
    -----
    A connection string is wrapped into a ``ModinDatabaseConnection``. Any
    other connection type that is not a ``ModinDatabaseConnection`` triggers a
    warning and the read is delegated to ``cls.single_worker_read``.
    """
    if isinstance(con, str):
        con = ModinDatabaseConnection("sqlalchemy", con)
    if not isinstance(con, ModinDatabaseConnection):
        warnings.warn(
            "To use parallel implementation of `read_sql`, pass either "
            + "the SQL connection string or a ModinDatabaseConnection "
            + "with the arguments required to make a connection, instead "
            + f"of {type(con)}. For documentation of ModinDatabaseConnection, see "
            + "https://modin.readthedocs.io/en/latest/supported_apis/io_supported.html#connecting-to-a-database-for-read-sql"
        )
        return cls.single_worker_read(
            sql,
            con=con,
            index_col=index_col,
            read_sql_engine=ReadSqlEngine.get(),
            **kwargs,
        )
    row_count_query = con.row_count_query(sql)
    connection_for_pandas = con.get_connection()
    column_names_query = con.column_names_query(sql)
    row_cnt = pandas.read_sql(row_count_query, connection_for_pandas).squeeze()
    cols_names_df = pandas.read_sql(
        column_names_query, connection_for_pandas, index_col=index_col
    )
    cols_names = cols_names_df.columns
    num_partitions = NPartitions.get()
    partition_ids = []
    index_ids = []
    dtype_ids = []
    # Rows per task, rounded up so that all `row_cnt` rows are covered.
    limit = math.ceil(row_cnt / num_partitions)
    for part in range(num_partitions):
        offset = part * limit
        query = con.partition_query(sql, limit, offset)
        # The task result is unpacked as: `num_partitions` data objects,
        # then one index object, then one dtypes object.
        partition_id = cls.deploy(
            cls.parse,
            num_returns=num_partitions + 2,
            num_splits=num_partitions,
            sql=query,
            con=con,
            index_col=index_col,
            read_sql_engine=ReadSqlEngine.get(),
            **kwargs,
        )
        partition_ids.append(
            [cls.frame_partition_cls(obj) for obj in partition_id[:-2]]
        )
        index_ids.append(partition_id[-2])
        # Take the dtypes reference of *this* task (`partition_id`), not the
        # list of wrapped partitions that was just appended to `partition_ids`.
        # NOTE: `dtype_ids` is collected but not consumed further in this method.
        dtype_ids.append(partition_id[-1])
    if index_col is None:
        # sum all lens returned from partitions
        index_lens = cls.materialize(index_ids)
        new_index = pandas.RangeIndex(sum(index_lens))
    else:
        # concat index returned from partitions
        index_lst = [
            x for part_index in cls.materialize(index_ids) for x in part_index
        ]
        new_index = pandas.Index(index_lst).set_names(index_col)
    new_frame = cls.frame_cls(np.array(partition_ids), new_index, cols_names)
    new_frame.synchronize_labels(axis=0)
    return cls.query_compiler_cls(new_frame)
def broadcast_axis_partitions(
    cls,
    axis,
    apply_func,
    left,
    right,
    keep_partitioning=False,
    apply_indices=None,
    enumerate_partitions=False,
    lengths=None,
):
    """
    Broadcast the right partitions to left and apply a function along full axis.

    Parameters
    ----------
    axis : {0, 1}
        The axis to apply and broadcast over.
    apply_func : callable
        The function to apply; it is passed through `cls.preprocess_func`
        before being handed to the axis partitions.
    left : np.ndarray
        The left partitions.
    right : np.ndarray or None
        The right partitions. When None, no other axis partition is passed.
    keep_partitioning : bool, default: False
        When True, the number of output splits is taken from the shape of
        `left` instead of from `lengths` or ``NPartitions``.
    apply_indices : list of ints, optional
        Indices of the axis partitions of `left` to apply the function over.
        Defaults to all of them.
    enumerate_partitions : bool, default: False
        Whether or not to pass the partition position into `apply_func` as
        the `partition_idx` keyword argument.
    lengths : list of ints, optional
        When truthy, forwarded to every axis partition as `_lengths` together
        with ``manual_partition=True``; its size is also used as the number
        of splits unless `keep_partitioning` is set.

    Returns
    -------
    np.ndarray
        NumPy array built from the results of every `apply` call, transposed
        when `axis` is falsy.
    """
    # Decide how many pieces every axis partition has to be split into.
    if keep_partitioning:
        split_count = len(left) if axis == 0 else len(left.T)
    elif lengths:
        split_count = len(lengths)
    else:
        split_count = NPartitions.get()

    mapper = cls.preprocess_func(apply_func)
    left_axis_parts = cls.axis_partition(left, axis)
    if right is None:
        right_axis_parts = None
    else:
        right_axis_parts = cls.axis_partition(right, axis)

    # Keyword arguments shared by every `apply` call below.
    shared_kwargs = {
        "num_splits": split_count,
        "other_axis_partition": right_axis_parts,
    }
    if lengths:
        shared_kwargs["_lengths"] = lengths
        shared_kwargs["manual_partition"] = True

    if apply_indices is None:
        apply_indices = np.arange(len(left_axis_parts))

    applied = []
    for position, part_index in enumerate(apply_indices):
        call_kwargs = dict(shared_kwargs)
        if enumerate_partitions:
            # `position` counts the selected partitions, not their index
            # inside `left_axis_parts`.
            call_kwargs["partition_idx"] = position
        applied.append(left_axis_parts[part_index].apply(mapper, **call_kwargs))
    blocks = np.array(applied)

    # Each entry of `blocks` is the output of one axis partition, so for a
    # falsy `axis` the array is transposed before it is returned.
    if axis:
        return blocks
    return blocks.T
def _read(cls, filepath_or_buffer: FilePathOrBuffer, **kwargs):
    """
    Read data from `filepath_or_buffer` according to `kwargs` parameters.

    Used in `read_csv` and `read_fwf` Modin implementations.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        `filepath_or_buffer` parameter of read functions.
    **kwargs : dict
        Parameters of read functions. The keys ``compression``, ``names``,
        ``index_col``, ``encoding``, ``skiprows``, ``header``, ``quoting``,
        ``skipfooter``, ``nrows`` and ``parse_dates`` are accessed directly
        and are expected to be filled in by the caller.

    Returns
    -------
    new_query_compiler : BaseQueryCompiler
        Query compiler with imported data for further processing.
    """
    if isinstance(filepath_or_buffer, str):
        source_md = cls.get_path(filepath_or_buffer)
    else:
        source_md = cls.get_path_or_buffer(filepath_or_buffer)
    compression = cls.infer_compression(filepath_or_buffer, kwargs["compression"])

    # Frequently used read parameters; the caller must have defined them.
    names = kwargs["names"]
    index_col = kwargs["index_col"]
    encoding = kwargs["encoding"]
    skiprows = kwargs["skiprows"]
    header = kwargs["header"]

    # The header size is needed to skip the header lines later on: header
    # information is taken from the locally read metadata frame, so the
    # workers do not have to handle it.
    header_size = cls._define_header_size(header, names)
    skiprows_md, pre_reading, skiprows_partitioning = cls._manage_skiprows_parameter(
        skiprows, header_size
    )
    should_handle_skiprows = not (
        skiprows_md is None or isinstance(skiprows_md, int)
    )

    if not cls.check_parameters_support(filepath_or_buffer, kwargs):
        # Unsupported parameter combination: read everything in one worker.
        return cls.single_worker_read(
            filepath_or_buffer, callback=cls.read_callback, **kwargs
        )

    is_quoting = kwargs["quoting"] != QUOTE_NONE
    # In these cases additional metadata has to be passed to the workers
    # to match pandas output.
    pass_names = names in (None, lib.no_default) and (
        skiprows is not None or kwargs["skipfooter"] != 0
    )

    # A one-row read gives the column labels and index name of the result.
    metadata_kwargs = dict(kwargs, nrows=1, skipfooter=0, index_col=index_col)
    pd_df_metadata = cls.read_callback(filepath_or_buffer, **metadata_kwargs)
    column_names = pd_df_metadata.columns
    column_widths, num_splits = cls._define_metadata(pd_df_metadata, column_names)

    # kwargs that will be passed to the workers
    worker_kwargs = {
        **kwargs,
        "fname": source_md,
        "num_splits": num_splits,
        "header_size": 0 if pass_names else header_size,
        "names": column_names if pass_names else names,
        "header": "infer" if pass_names else header,
        "skipfooter": 0,
        "skiprows": None,
        "nrows": None,
        "compression": compression,
    }

    with OpenFile(source_md, "rb", compression) as handle:
        start_pos = handle.tell()
        # NOTE: the wrapper is deliberately kept bound to a local name; a
        # discarded TextIOWrapper may close the underlying handle once it is
        # garbage collected.
        text_view = io.TextIOWrapper(handle, encoding=encoding, newline="")
        newline, quotechar = cls.compute_newline(
            text_view, encoding, kwargs.get("quotechar", '"')
        )
        # Rewind to where the handle was before the newline detection.
        handle.seek(start_pos)

        splits = cls.partitioned_file(
            handle,
            num_partitions=NPartitions.get(),
            nrows=None if should_handle_skiprows else kwargs["nrows"],
            skiprows=skiprows_partitioning,
            quotechar=quotechar,
            is_quoting=is_quoting,
            encoding=encoding,
            newline=newline,
            header_size=header_size,
            pre_reading=pre_reading,
        )

    partition_ids, index_ids, dtypes_ids = cls._launch_tasks(
        splits, callback=cls.read_callback, **worker_kwargs
    )

    # `skiprows_md` and `nrows` are forwarded here only when skiprows is
    # neither None nor an int; otherwise `nrows` was already passed to
    # `partitioned_file` above.
    return cls._get_new_qc(
        partition_ids=partition_ids,
        index_ids=index_ids,
        dtypes_ids=dtypes_ids,
        index_col=index_col,
        index_name=pd_df_metadata.index.name,
        column_widths=column_widths,
        column_names=column_names,
        skiprows_md=skiprows_md if should_handle_skiprows else None,
        header_size=header_size,
        skipfooter=kwargs["skipfooter"],
        parse_dates=kwargs["parse_dates"],
        nrows=kwargs["nrows"] if should_handle_skiprows else None,
    )