def read_json(
    cls,
    path_or_buf=None,
    orient=None,
    typ="frame",
    dtype=True,
    convert_axes=True,
    convert_dates=True,
    keep_default_dates=True,
    numpy=False,
    precise_float=False,
    date_unit=None,
    encoding=None,
    encoding_errors="strict",
    lines=False,
    chunksize=None,
    compression="infer",
    nrows: Optional[int] = None,
    storage_options=None,
):  # noqa: PR01
    """
    Read JSON input with pandas and convert the result with ``cls.from_pandas``.

    Every argument is forwarded unchanged, by keyword, to ``pandas.read_json``.

    Returns
    -------
    object
        Whatever ``cls.from_pandas`` returns for the pandas result.
    """
    ErrorMessage.default_to_pandas("`read_json`")
    pandas_result = pandas.read_json(
        path_or_buf=path_or_buf,
        orient=orient,
        typ=typ,
        dtype=dtype,
        convert_axes=convert_axes,
        convert_dates=convert_dates,
        keep_default_dates=keep_default_dates,
        numpy=numpy,
        precise_float=precise_float,
        date_unit=date_unit,
        encoding=encoding,
        encoding_errors=encoding_errors,
        lines=lines,
        chunksize=chunksize,
        compression=compression,
        nrows=nrows,
        storage_options=storage_options,
    )
    return cls.from_pandas(pandas_result)
def read_fwf(
    cls, filepath_or_buffer, colspecs="infer", widths=None, infer_nrows=100, **kwds
):  # noqa: PR01
    """
    Read a fixed-width file with pandas and convert the result.

    A ``pandas.DataFrame`` result is converted with ``cls.from_pandas``.
    A ``TextFileReader`` result is returned as-is after its ``read`` method
    has been wrapped so that every chunk it produces is converted too.
    Any other result is returned untouched.
    """
    ErrorMessage.default_to_pandas("`read_fwf`")
    parsed = pandas.read_fwf(
        filepath_or_buffer,
        colspecs=colspecs,
        widths=widths,
        infer_nrows=infer_nrows,
        **kwds,
    )
    if isinstance(parsed, pandas.DataFrame):
        return cls.from_pandas(parsed)
    if isinstance(parsed, pandas.io.parsers.TextFileReader):
        # Wrapping ``read`` makes calls to ``__next__`` and ``get_chunk``
        # hand back converted frames as well.
        original_read = parsed.read

        def read_and_convert(*args, **kwargs):
            return cls.from_pandas(original_read(*args, **kwargs))

        parsed.read = read_and_convert
    return parsed
def read_sql(
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    columns=None,
    chunksize=None,
):
    """
    Read a SQL query or database table into a DataFrame.

    All parameters are forwarded by keyword to the reader. When ``chunksize``
    is given the read is delegated to ``pandas.read_sql`` and a generator of
    DataFrames (one per chunk) is returned; otherwise the dispatcher's
    ``read_sql`` is used and a single DataFrame is returned.
    """
    # Collect the arguments explicitly rather than through
    # ``inspect.getargvalues(inspect.currentframe())``: that call hands back
    # ``frame.f_locals``, which on Python 3.13+ is a live view of the frame and
    # would also contain every local defined below (``kwargs``, the dispatcher,
    # ``df_gen``), breaking the ``**kwargs`` forwarding.
    kwargs = {
        "sql": sql,
        "con": con,
        "index_col": index_col,
        "coerce_float": coerce_float,
        "params": params,
        "parse_dates": parse_dates,
        "columns": columns,
        "chunksize": chunksize,
    }

    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    if chunksize is not None:
        ErrorMessage.default_to_pandas("Parameters provided [chunksize]")
        df_gen = pandas.read_sql(**kwargs)
        return (
            DataFrame(query_compiler=EngineDispatcher.from_pandas(df))
            for df in df_gen
        )
    return DataFrame(query_compiler=EngineDispatcher.read_sql(**kwargs))
def read_gbq(
    cls,
    query,
    project_id=None,
    index_col=None,
    col_order=None,
    reauth=False,
    verbose=None,
    private_key=None,
    dialect="legacy",
    **kwargs
):
    """
    Run a BigQuery query through ``pandas.read_gbq`` and convert the result.

    All arguments are passed straight to pandas; the returned frame is
    wrapped with ``cls.from_pandas``.
    """
    ErrorMessage.default_to_pandas()
    gbq_frame = pandas.read_gbq(
        query,
        project_id=project_id,
        index_col=index_col,
        col_order=col_order,
        reauth=reauth,
        verbose=verbose,
        private_key=private_key,
        dialect=dialect,
        **kwargs
    )
    return cls.from_pandas(gbq_frame)
def read_sql_query(
    cls,
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    chunksize=None,
):
    """
    Execute a SQL query with ``pandas.read_sql_query`` and convert the result.

    ``sql`` and ``con`` are passed positionally, everything else by keyword.
    The pandas result is handed to ``cls.from_pandas``.
    """
    ErrorMessage.default_to_pandas("`read_sql_query`")
    query_result = pandas.read_sql_query(
        sql,
        con,
        index_col=index_col,
        coerce_float=coerce_float,
        params=params,
        parse_dates=parse_dates,
        chunksize=chunksize,
    )
    return cls.from_pandas(query_result)
def merge_asof(
    left,
    right,
    on=None,
    left_on=None,
    right_on=None,
    left_index: bool = False,
    right_index: bool = False,
    by=None,
    left_by=None,
    right_by=None,
    suffixes=("_x", "_y"),
    tolerance=None,
    allow_exact_matches: bool = True,
    direction: str = "backward",
) -> DataFrame:
    """
    Perform an as-of merge by defaulting to ``pandas.merge_asof``.

    ``left`` must be a ``DataFrame``; it is converted with ``to_pandas``.
    ``right`` is converted the same way only when it is a ``DataFrame`` and
    is otherwise passed to pandas unchanged. All remaining arguments are
    forwarded by keyword.

    Returns
    -------
    DataFrame
        The pandas merge result wrapped in a ``DataFrame``.

    Raises
    ------
    ValueError
        If ``left`` is not a ``DataFrame``.
    """
    if not isinstance(left, DataFrame):
        # The offending object is ``left``, so that is the type to report
        # (the message previously printed the type of ``right``).
        raise ValueError(
            "can not merge DataFrame with instance of type {}".format(type(left))
        )
    ErrorMessage.default_to_pandas("`merge_asof`")
    if isinstance(right, DataFrame):
        right = to_pandas(right)
    return DataFrame(
        pandas.merge_asof(
            to_pandas(left),
            right,
            on=on,
            left_on=left_on,
            right_on=right_on,
            left_index=left_index,
            right_index=right_index,
            by=by,
            left_by=left_by,
            right_by=right_by,
            suffixes=suffixes,
            tolerance=tolerance,
            allow_exact_matches=allow_exact_matches,
            direction=direction,
        )
    )
def _read(cls, path_or_buf, **kwargs):
    """
    Read an HDF source and build a query compiler from it.

    Parameters
    ----------
    path_or_buf : str, buffer or path object
        Path to the file to open, or an open :class:`pandas.HDFStore` object.
    **kwargs : dict
        Reader options. ``columns``, ``key``, ``start`` and ``stop`` are
        handled here; the rest is passed along untouched.

    Returns
    -------
    BaseQueryCompiler
        Query compiler with imported data for further processing.
    """
    format_check = cls._validate_hdf_format(path_or_buf=path_or_buf)
    if format_check is None:
        ErrorMessage.default_to_pandas(
            "File format seems to be `fixed`. For better distribution consider "
            + "saving the file in `table` format. df.to_hdf(format=`table`)."
        )
        return cls.single_worker_read(path_or_buf, **kwargs)

    columns = kwargs.pop("columns", None)
    # Have to do this because of Dask's keyword arguments
    kwargs["_key"] = kwargs.pop("key", None)
    if columns:
        return cls.build_query_compiler(path_or_buf, columns, **kwargs)

    # No columns requested: read zero rows just to discover the column labels.
    # ``start``/``stop`` are taken out for that probe and restored afterwards
    # only when the caller actually supplied them.
    requested_bounds = {
        bound: kwargs.pop(bound, None) for bound in ("start", "stop")
    }
    empty_frame = pandas.read_hdf(path_or_buf, start=0, stop=0, **kwargs)
    for bound, value in requested_bounds.items():
        if value is not None:
            kwargs[bound] = value
    return cls.build_query_compiler(path_or_buf, empty_frame.columns, **kwargs)
def to_pickle(
    cls,
    obj: Any,
    filepath_or_buffer,
    compression: CompressionOptions = "infer",
    protocol: int = pickle.HIGHEST_PROTOCOL,
    storage_options: StorageOptions = None,
):  # noqa: PR01, D200
    """
    Pickle (serialize) object to file with ``pandas.to_pickle``.

    A ``BaseQueryCompiler`` is first materialised with ``to_pandas()``;
    any other object is pickled as given.
    """
    ErrorMessage.default_to_pandas("`to_pickle`")
    payload = obj.to_pandas() if isinstance(obj, BaseQueryCompiler) else obj
    return pandas.to_pickle(
        payload,
        filepath_or_buffer=filepath_or_buffer,
        compression=compression,
        protocol=protocol,
        storage_options=storage_options,
    )
def get_dummies(
    data,
    prefix=None,
    prefix_sep="_",
    dummy_na=False,
    columns=None,
    sparse=False,
    drop_first=False,
    dtype=None,
):
    """
    Convert categorical data into dummy/indicator columns.

    A ``DataFrame`` input is handled by its query compiler's ``get_dummies``.
    Any other input defaults to ``pandas.get_dummies`` (a ``Series`` is
    converted with ``_to_pandas()`` first) and the result is wrapped in a
    ``DataFrame``.

    Raises
    ------
    NotImplementedError
        If ``sparse`` is truthy.
    """
    if sparse:
        raise NotImplementedError(
            "SparseDataFrame is not implemented. "
            "To contribute to Modin, please visit "
            "github.com/modin-project/modin."
        )

    if isinstance(data, DataFrame):
        dummies_compiler = data._query_compiler.get_dummies(
            columns,
            prefix=prefix,
            prefix_sep=prefix_sep,
            dummy_na=dummy_na,
            drop_first=drop_first,
            dtype=dtype,
        )
        return DataFrame(query_compiler=dummies_compiler)

    ErrorMessage.default_to_pandas("`get_dummies` on non-DataFrame")
    if isinstance(data, Series):
        data = data._to_pandas()
    pandas_dummies = pandas.get_dummies(
        data,
        prefix=prefix,
        prefix_sep=prefix_sep,
        dummy_na=dummy_na,
        columns=columns,
        sparse=sparse,
        drop_first=drop_first,
        dtype=dtype,
    )
    return DataFrame(pandas_dummies)
def read_html(
    cls,
    io,
    match=".+",
    flavor=None,
    header=None,
    index_col=None,
    skiprows=None,
    attrs=None,
    parse_dates=False,
    tupleize_cols=None,
    thousands=",",
    encoding=None,
    decimal=".",
    converters=None,
    na_values=None,
    keep_default_na=True,
    displayed_only=True,
):
    """
    Parse HTML tables with ``pandas.read_html`` and convert the first one.

    All arguments are forwarded by keyword. Only element ``0`` of the list
    returned by pandas is passed to ``cls.from_pandas``.
    """
    ErrorMessage.default_to_pandas("`read_html`")
    tables = pandas.read_html(
        io=io,
        match=match,
        flavor=flavor,
        header=header,
        index_col=index_col,
        skiprows=skiprows,
        attrs=attrs,
        parse_dates=parse_dates,
        tupleize_cols=tupleize_cols,
        thousands=thousands,
        encoding=encoding,
        decimal=decimal,
        converters=converters,
        na_values=na_values,
        keep_default_na=keep_default_na,
        displayed_only=displayed_only,
    )
    first_table = tables[0]
    return cls.from_pandas(first_table)
def read_gbq(
    cls,
    query: str,
    project_id=None,
    index_col=None,
    col_order=None,
    reauth=False,
    auth_local_webserver=False,
    dialect=None,
    location=None,
    configuration=None,
    credentials=None,
    use_bqstorage_api=None,
    private_key=None,
    verbose=None,
    progress_bar_type=None,
    max_results=None,
):  # noqa: PR01
    """
    Run a BigQuery query through ``pandas.read_gbq`` and convert the result.

    ``query`` is passed positionally and every other argument by keyword;
    the pandas result is wrapped with ``cls.from_pandas``.
    """
    ErrorMessage.default_to_pandas("`read_gbq`")
    forwarded_options = {
        "project_id": project_id,
        "index_col": index_col,
        "col_order": col_order,
        "reauth": reauth,
        "auth_local_webserver": auth_local_webserver,
        "dialect": dialect,
        "location": location,
        "configuration": configuration,
        "credentials": credentials,
        "use_bqstorage_api": use_bqstorage_api,
        "private_key": private_key,
        "verbose": verbose,
        "progress_bar_type": progress_bar_type,
        "max_results": max_results,
    }
    gbq_frame = pandas.read_gbq(query, **forwarded_options)
    return cls.from_pandas(gbq_frame)
def _index_grouped(self):
    """
    Return the cached mapping of group key to labels, computing it on first use.

    With multiple ``by`` columns the selected columns are collected to pandas
    and grouped there; otherwise the index (``axis == 0``) or the columns are
    grouped directly by ``self._by``.
    """
    if self._index_grouped_cache is not None:
        return self._index_grouped_cache

    if self._is_multi_by:
        # Because we are doing a collect (to_pandas) here and then groupby, we
        # end up using pandas implementation. Add the warning so the user is
        # aware.
        ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
        ErrorMessage.default_to_pandas("Groupby with multiple columns")
        by_frame = self._df._query_compiler.getitem_column_array(
            self._by
        ).to_pandas()
        self._index_grouped_cache = {
            key: group.index for key, group in by_frame.groupby(by=self._by)
        }
        return self._index_grouped_cache

    if isinstance(self._by, type(self._query_compiler)):
        by = self._by.to_pandas().squeeze()
    else:
        by = self._by
    labels = self._index if self._axis == 0 else self._columns
    self._index_grouped_cache = labels.groupby(by)
    return self._index_grouped_cache
def to_pickle(cls, obj, path, compression="infer", protocol=4):  # noqa: PR01
    """
    Pickle (serialize) object to file using pandas.

    A ``protocol`` of ``4`` is replaced by ``-1`` before the call. A
    ``BaseQueryCompiler`` is converted with ``to_pandas()`` first; any other
    object is passed to ``pandas.to_pickle`` unchanged.
    """
    effective_protocol = -1 if protocol == 4 else protocol
    ErrorMessage.default_to_pandas("`to_pickle`")
    payload = obj.to_pandas() if isinstance(obj, BaseQueryCompiler) else obj
    return pandas.to_pickle(
        payload, path, compression=compression, protocol=effective_protocol
    )
def read_sql_table(
    cls,
    table_name,
    con,
    schema=None,
    index_col=None,
    coerce_float=True,
    parse_dates=None,
    columns=None,
    chunksize=None,
):
    """
    Read a SQL table with ``pandas.read_sql_table`` and convert the result.

    ``table_name`` and ``con`` are passed positionally, the remaining options
    by keyword. The pandas result is handed to ``cls.from_pandas``.
    """
    ErrorMessage.default_to_pandas("`read_sql_table`")
    table_frame = pandas.read_sql_table(
        table_name,
        con,
        schema=schema,
        index_col=index_col,
        coerce_float=coerce_float,
        parse_dates=parse_dates,
        columns=columns,
        chunksize=chunksize,
    )
    return cls.from_pandas(table_frame)
def return_handler(*args, **kwargs):
    """
    Call the wrapped ``HDFStore`` attribute with pandas-compatible arguments.

    Returns
    -------
    A Modin DataFrame in place of a pandas DataFrame (when the store's
    ``_return_modin_dataframe`` flag is set), or whatever the underlying
    method returned.

    Notes
    -----
    Every positional and keyword argument that is a Modin DataFrame is
    converted with ``to_pandas`` before the call, since the underlying store
    is handed pandas objects only. ``item`` and ``self`` come from the
    enclosing scope.
    """
    from modin.utils import to_pandas

    def _pandas_or_same(value):
        return to_pandas(value) if isinstance(value, DataFrame) else value

    # We don't want to constantly be giving this error message for
    # internal methods.
    if item[0] != "_":
        ErrorMessage.default_to_pandas("`{}`".format(item))

    converted_args = [_pandas_or_same(arg) for arg in args]
    converted_kwargs = {
        name: _pandas_or_same(value) for name, value in kwargs.items()
    }
    target = super(HDFStore, self).__getattribute__(item)
    result = target(*converted_args, **converted_kwargs)
    if self._return_modin_dataframe and isinstance(result, pandas.DataFrame):
        return DataFrame(result)
    return result
def read_hdf(
    cls,
    path_or_buf,
    key=None,
    mode: str = "r",
    errors: str = "strict",
    where=None,
    start=None,
    stop=None,
    columns=None,
    iterator=False,
    chunksize=None,
    **kwargs,
):  # noqa: PR01
    """
    Read an HDF source with ``pandas.read_hdf`` and convert the result.

    When ``path_or_buf`` is a Modin ``HDFStore`` its
    ``_return_modin_dataframe`` flag is switched off for the duration of the
    pandas call and switched back on afterwards, even if the read fails.

    Returns
    -------
    object
        Whatever ``cls.from_pandas`` returns for the pandas result.
    """
    from modin.pandas.io import HDFStore

    ErrorMessage.default_to_pandas("`read_hdf`")
    modin_store = isinstance(path_or_buf, HDFStore)
    if modin_store:
        path_or_buf._return_modin_dataframe = False
    try:
        df = pandas.read_hdf(
            path_or_buf,
            key=key,
            mode=mode,
            columns=columns,
            errors=errors,
            where=where,
            start=start,
            stop=stop,
            iterator=iterator,
            chunksize=chunksize,
            **kwargs,
        )
    finally:
        # Restore the flag on every exit path: without this an exception from
        # pandas would leave the caller's store stuck with the flag disabled.
        if modin_store:
            path_or_buf._return_modin_dataframe = True

    return cls.from_pandas(df)
def single_worker_read(cls, fname, **kwargs):
    """
    Perform reading by a single worker (default-to-pandas implementation).

    Parameters
    ----------
    fname : str, path object or file-like object
        Name of the file or file-like object to read.
    **kwargs : dict
        Keyword arguments passed to ``cls.parse``.

    Returns
    -------
    BaseQueryCompiler or dict or pandas.io.parsers.TextFileReader
        A ``TextFileReader`` is returned with its ``read`` wrapped so that it
        yields query compilers; a dict result is converted value by value;
        anything else is converted directly.
    """
    ErrorMessage.default_to_pandas("Parameters provided")
    # Use default args for everything
    parsed = cls.parse(fname, **kwargs)

    def _to_compiler(frame):
        return cls.query_compiler_cls.from_pandas(frame, cls.frame_cls)

    if isinstance(parsed, pandas.io.parsers.TextFileReader):
        raw_read = parsed.read

        def _read_as_compiler(*read_args, **read_kwargs):
            return _to_compiler(raw_read(*read_args, **read_kwargs))

        parsed.read = _read_as_compiler
        return parsed

    if isinstance(parsed, (OrderedDict, dict)):
        return {name: _to_compiler(frame) for name, frame in parsed.items()}

    return _to_compiler(parsed)
def read_sql(
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    columns=None,
    chunksize=None,
):  # noqa: PR01, RT01, D200
    """
    Read SQL query or database table into a DataFrame.

    When ``chunksize`` is given the read is delegated to ``pandas.read_sql``
    and a generator of DataFrames (one per chunk) is returned; otherwise the
    dispatcher's ``read_sql`` is used and a single DataFrame is returned.
    """
    # Collect the arguments explicitly rather than through
    # ``inspect.getargvalues(inspect.currentframe())``: that call hands back
    # ``frame.f_locals``, which on Python 3.13+ is a live view of the frame and
    # would also contain every local defined below (``kwargs``, the dispatcher,
    # ``df_gen``), breaking the ``**kwargs`` forwarding.
    kwargs = {
        "sql": sql,
        "con": con,
        "index_col": index_col,
        "coerce_float": coerce_float,
        "params": params,
        "parse_dates": parse_dates,
        "columns": columns,
        "chunksize": chunksize,
    }

    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import (
        FactoryDispatcher,
    )

    if chunksize is not None:
        ErrorMessage.default_to_pandas("Parameters provided [chunksize]")
        df_gen = pandas.read_sql(**kwargs)
        return (
            DataFrame(query_compiler=FactoryDispatcher.from_pandas(df))
            for df in df_gen
        )
    return DataFrame(query_compiler=FactoryDispatcher.read_sql(**kwargs))
def _read(cls, filepath_or_buffer, **kwargs):
    """Read csv file from local disk.

    Falls back to the pandas-based reader (``cls._read_csv_from_pandas``)
    when the path does not exist on disk, the input is a buffer, compression
    is detected, or ``chunksize``, a non-integer ``skiprows`` or ``nrows`` is
    requested. Otherwise ``cls._read_csv_from_file_pandas_on_ray`` is used.

    Args:
        filepath_or_buffer:
            The filepath of the csv file.
            We only support local files for now.
        kwargs: Keyword arguments in pandas.read_csv
    """
    # The intention of the inspection code is to reduce the amount of
    # communication we have to do between processes and nodes. We take a quick
    # pass over the arguments and remove those that are default values so we
    # don't have to serialize and send them to the workers. Because the
    # arguments list is so long, this does end up saving time based on the
    # number of nodes in the cluster.
    try:
        args, _, _, defaults, _, _, _ = inspect.getfullargspec(cls.read_csv)
        defaults = dict(zip(args[2:], defaults))
        # Keep an argument when it is not a known ``read_csv`` parameter, when
        # its type differs from the default's, or when its value differs.
        # Testing membership first avoids the ``KeyError`` the previous
        # ``A and B or C`` condition raised for names missing from ``defaults``.
        filtered_kwargs = {
            kw: value
            for kw, value in kwargs.items()
            if kw not in defaults
            or not isinstance(value, type(defaults[kw]))
            or value != defaults[kw]
        }
    # This happens on Python2, we will just default to serializing the entire dictionary
    except AttributeError:
        filtered_kwargs = kwargs

    if isinstance(filepath_or_buffer, str):
        if not os.path.exists(filepath_or_buffer):
            ErrorMessage.default_to_pandas("File not found on disk")
            return cls._read_csv_from_pandas(filepath_or_buffer, filtered_kwargs)
    elif not isinstance(filepath_or_buffer, py.path.local):
        read_from_pandas = True
        # Pandas read_csv supports pathlib.Path
        try:
            import pathlib

            if isinstance(filepath_or_buffer, pathlib.Path):
                read_from_pandas = False
        except ImportError:
            pass
        if read_from_pandas:
            ErrorMessage.default_to_pandas("Reading from buffer.")
            return cls._read_csv_from_pandas(filepath_or_buffer, kwargs)

    if _infer_compression(filepath_or_buffer, kwargs.get("compression")) is not None:
        ErrorMessage.default_to_pandas("Compression detected.")
        return cls._read_csv_from_pandas(filepath_or_buffer, filtered_kwargs)

    chunksize = kwargs.get("chunksize")
    if chunksize is not None:
        ErrorMessage.default_to_pandas("Reading chunks from a file.")
        return cls._read_csv_from_pandas(filepath_or_buffer, filtered_kwargs)

    skiprows = kwargs.get("skiprows")
    if skiprows is not None and not isinstance(skiprows, int):
        ErrorMessage.default_to_pandas("skiprows parameter not optimized yet.")
        return cls._read_csv_from_pandas(filepath_or_buffer, kwargs)

    # TODO: replace this by reading lines from file.
    if kwargs.get("nrows") is not None:
        ErrorMessage.default_to_pandas("`read_csv` with `nrows`")
        return cls._read_csv_from_pandas(filepath_or_buffer, filtered_kwargs)
    else:
        return cls._read_csv_from_file_pandas_on_ray(
            filepath_or_buffer, filtered_kwargs
        )
def read_csv(
    cls,
    filepath_or_buffer,
    sep=",",
    delimiter=None,
    header="infer",
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    prefix=None,
    mangle_dupe_cols=True,
    dtype=None,
    engine=None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace=False,
    skiprows=None,
    nrows=None,
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    verbose=False,
    skip_blank_lines=True,
    parse_dates=False,
    infer_datetime_format=False,
    keep_date_col=False,
    date_parser=None,
    dayfirst=False,
    iterator=False,
    chunksize=None,
    compression="infer",
    thousands=None,
    decimal=b".",
    lineterminator=None,
    quotechar='"',
    quoting=0,
    escapechar=None,
    comment=None,
    encoding=None,
    dialect=None,
    tupleize_cols=None,
    error_bad_lines=True,
    warn_bad_lines=True,
    skipfooter=0,
    doublequote=True,
    delim_whitespace=False,
    low_memory=True,
    memory_map=False,
    float_precision=None,
):
    """
    Read a CSV file by forwarding every argument, by keyword, to ``cls._read``.

    A default-to-pandas warning is emitted before the call.
    """
    ErrorMessage.default_to_pandas()
    return cls._read(
        filepath_or_buffer=filepath_or_buffer,
        sep=sep,
        delimiter=delimiter,
        header=header,
        names=names,
        index_col=index_col,
        usecols=usecols,
        squeeze=squeeze,
        prefix=prefix,
        mangle_dupe_cols=mangle_dupe_cols,
        dtype=dtype,
        engine=engine,
        converters=converters,
        true_values=true_values,
        false_values=false_values,
        skipinitialspace=skipinitialspace,
        skiprows=skiprows,
        nrows=nrows,
        na_values=na_values,
        keep_default_na=keep_default_na,
        na_filter=na_filter,
        verbose=verbose,
        skip_blank_lines=skip_blank_lines,
        parse_dates=parse_dates,
        infer_datetime_format=infer_datetime_format,
        keep_date_col=keep_date_col,
        date_parser=date_parser,
        dayfirst=dayfirst,
        iterator=iterator,
        chunksize=chunksize,
        compression=compression,
        thousands=thousands,
        decimal=decimal,
        lineterminator=lineterminator,
        quotechar=quotechar,
        quoting=quoting,
        escapechar=escapechar,
        comment=comment,
        encoding=encoding,
        dialect=dialect,
        tupleize_cols=tupleize_cols,
        error_bad_lines=error_bad_lines,
        warn_bad_lines=warn_bad_lines,
        skipfooter=skipfooter,
        doublequote=doublequote,
        delim_whitespace=delim_whitespace,
        low_memory=low_memory,
        memory_map=memory_map,
        float_precision=float_precision,
    )
def read_pickle(cls, path, compression="infer"):
    """
    Load a pickled object with ``pandas.read_pickle`` and convert it.

    The pandas result is passed to ``cls.from_pandas``.
    """
    ErrorMessage.default_to_pandas()
    unpickled = pandas.read_pickle(path, compression=compression)
    return cls.from_pandas(unpickled)
def read_feather(cls, path, nthreads=1):
    """
    Load a feather file with ``pandas.read_feather`` and convert it.

    ``path`` and ``nthreads`` are both passed positionally to pandas; the
    result is handed to ``cls.from_pandas``.
    """
    ErrorMessage.default_to_pandas()
    feather_frame = pandas.read_feather(path, nthreads)
    return cls.from_pandas(feather_frame)
def read_msgpack(cls, path_or_buf, encoding="utf-8", iterator=False):
    """
    Load msgpack data with ``pandas.read_msgpack`` and convert it.

    The pandas result is passed to ``cls.from_pandas``.
    """
    ErrorMessage.default_to_pandas()
    unpacked = pandas.read_msgpack(
        path_or_buf, encoding=encoding, iterator=iterator
    )
    return cls.from_pandas(unpacked)
def read_hdf(cls, path_or_buf, key=None, mode="r", columns=None):
    """
    Load an HDF source with ``pandas.read_hdf`` and convert it.

    ``key``, ``mode`` and ``columns`` are forwarded by keyword; the pandas
    result is passed to ``cls.from_pandas``.
    """
    ErrorMessage.default_to_pandas()
    hdf_frame = pandas.read_hdf(path_or_buf, key=key, mode=mode, columns=columns)
    return cls.from_pandas(hdf_frame)
def read_clipboard(cls, sep=r"\s+"):
    """
    Read the clipboard with ``pandas.read_clipboard`` and convert the result.

    The pandas result is passed to ``cls.from_pandas``.
    """
    ErrorMessage.default_to_pandas()
    clipboard_frame = pandas.read_clipboard(sep=sep)
    return cls.from_pandas(clipboard_frame)
def read_csv(
    cls,
    filepath_or_buffer,
    sep=",",
    delimiter=None,
    header="infer",
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    prefix=None,
    mangle_dupe_cols=True,
    dtype=None,
    engine=None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace=False,
    skiprows=None,
    nrows=None,
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    verbose=False,
    skip_blank_lines=True,
    parse_dates=False,
    infer_datetime_format=False,
    keep_date_col=False,
    date_parser=None,
    dayfirst=False,
    cache_dates=True,
    iterator=False,
    chunksize=None,
    compression="infer",
    thousands=None,
    decimal=b".",
    lineterminator=None,
    quotechar='"',
    quoting=0,
    escapechar=None,
    comment=None,
    encoding=None,
    dialect=None,
    error_bad_lines=True,
    warn_bad_lines=True,
    skipfooter=0,
    doublequote=True,
    delim_whitespace=False,
    low_memory=True,
    memory_map=False,
    float_precision=None,
    storage_options=None,
):
    """
    Read a CSV file, preferring an Arrow-based reader and falling back to ``cls._read``.

    With ``engine`` equal to ``"pandas"`` or ``"c"`` (case-insensitive) the call
    goes straight to ``cls._read``. Otherwise parse/convert/read option objects
    are built from the arguments, the module-level ``read_csv`` is called and
    its result is converted with ``cls.from_arrow``.

    If that path raises ``pa.ArrowNotImplementedError`` or
    ``NotImplementedError`` the error is re-raised when ``engine`` is
    ``"arrow"``; for any other engine a default-to-pandas warning is emitted
    and ``cls._read`` is used instead.

    Raises
    ------
    ValueError
        If ``delim_whitespace`` is set together with a delimiter other than ``","``.
    NotImplementedError
        For unsupported ``header`` values when ``engine`` is ``"arrow"``.
    """
    # Snapshot the arguments first, before any other local is created, so the
    # dict holds exactly the call parameters.
    items = locals().copy()
    # Keep only the arguments that the class lists in ``arg_keys``.
    mykwargs = {k: items[k] for k in items if k in cls.arg_keys}
    eng = str(engine).lower().strip()
    try:
        if eng in ["pandas", "c"]:
            return cls._read(**mykwargs)

        # Translate the requested dtype(s) with ``cls._dtype_to_arrow``.
        if isinstance(dtype, dict):
            column_types = {
                c: cls._dtype_to_arrow(t) for c, t in dtype.items()
            }
        else:
            column_types = cls._dtype_to_arrow(dtype)

        # Columns explicitly listed in ``parse_dates`` are typed as timestamps.
        if (type(parse_dates) is list) and type(column_types) is dict:
            for c in parse_dates:
                column_types[c] = pa.timestamp("s")

        if names:
            if header == 0:
                # Explicit names replace the header row, so skip one more row.
                skiprows = skiprows + 1 if skiprows is not None else 1
            elif header is None or header == "infer":
                pass
            else:
                raise NotImplementedError(
                    "read_csv with 'arrow' engine and provided 'names' parameter supports only 0, None and 'infer' header values"
                )
        else:
            if header == 0 or header == "infer":
                pass
            else:
                raise NotImplementedError(
                    "read_csv with 'arrow' engine without 'names' parameter provided supports only 0 and 'infer' header values"
                )

        if delimiter is None:
            delimiter = sep

        if delim_whitespace and delimiter != ",":
            raise ValueError(
                "Specified a delimiter and delim_whitespace=True; you can only specify one."
            )

        usecols_md = cls._prepare_pyarrow_usecols(mykwargs)

        po = ParseOptions(
            delimiter="\\s+" if delim_whitespace else delimiter,
            quote_char=quotechar,
            double_quote=doublequote,
            escape_char=escapechar,
            newlines_in_values=False,
            ignore_empty_lines=skip_blank_lines,
        )
        co = ConvertOptions(
            check_utf8=None,
            column_types=column_types,
            null_values=None,
            true_values=None,
            false_values=None,
            # Timestamp fields should be handled as strings unless parse_dates
            # was passed explicitly as an array or a dict.
            timestamp_parsers=[""] if isinstance(parse_dates, bool) else None,
            strings_can_be_null=None,
            include_columns=usecols_md,
            include_missing_columns=None,
            auto_dict_encode=None,
            auto_dict_max_cardinality=None,
        )
        ro = ReadOptions(
            use_threads=True,
            block_size=None,
            skip_rows=skiprows,
            column_names=names,
            autogenerate_column_names=None,
        )

        # NOTE(review): this bare ``read_csv`` is presumably the Arrow CSV
        # reader imported at module level, not this method - confirm.
        at = read_csv(
            filepath_or_buffer,
            read_options=ro,
            parse_options=po,
            convert_options=co,
        )

        return cls.from_arrow(at)
    except (pa.ArrowNotImplementedError, NotImplementedError):
        # An explicitly requested arrow engine must not silently fall back.
        if eng in ["arrow"]:
            raise

        ErrorMessage.default_to_pandas("`read_csv`")
        return cls._read(**mykwargs)
def read_json(
    cls,
    path_or_buf=None,
    orient=None,
    typ="frame",
    dtype=True,
    convert_axes=True,
    convert_dates=True,
    keep_default_dates=True,
    numpy=False,
    precise_float=False,
    date_unit=None,
    encoding=None,
    lines=False,
    chunksize=None,
    compression="infer",
):
    """
    Read a JSON file, splitting the work into remote tasks when ``lines=True``.

    When ``cls.read_json_remote_task`` is ``None`` or ``lines`` is falsy the
    parent implementation is used. Otherwise the column labels are taken from
    the first line of the file, the file is cut into byte ranges that end on
    line boundaries, one remote task is launched per range and the resulting
    partitions are assembled into a query compiler.

    Returns
    -------
    object
        The parent implementation's result, or a ``cls.query_compiler_cls``
        instance for the partitioned path.
    """
    kwargs = {
        "path_or_buf": path_or_buf,
        "orient": orient,
        "typ": typ,
        "dtype": dtype,
        "convert_axes": convert_axes,
        "convert_dates": convert_dates,
        "keep_default_dates": keep_default_dates,
        "numpy": numpy,
        "precise_float": precise_float,
        "date_unit": date_unit,
        "encoding": encoding,
        "lines": lines,
        "chunksize": chunksize,
        "compression": compression,
    }
    if cls.read_json_remote_task is None:
        return super(RayIO, cls).read_json(**kwargs)

    if not lines:
        ErrorMessage.default_to_pandas(
            "`read_json` only optimized with `lines=True`"
        )
        return super(RayIO, cls).read_json(**kwargs)
    else:
        # TODO: Pick up the columns in an optimized way from all data
        # All rows must be read because some rows may have missing data
        # Currently assumes all rows have the same columns
        from io import BytesIO

        # Read only the first line to discover the columns, and close the
        # handle right away (it was previously left open).
        with open(path_or_buf, "rb") as first_line_file:
            first_line = first_line_file.readline()
        columns = pandas.read_json(BytesIO(first_line), lines=True).columns
        kwargs["columns"] = columns
        empty_pd_df = pandas.DataFrame(columns=columns)

        # The path is passed to the remote task separately from ``kwargs``.
        path_or_buf = kwargs.pop("path_or_buf")

        with file_open(path_or_buf, "rb", kwargs.get("compression", "infer")) as f:
            total_bytes = file_size(f)
            num_partitions = cls.frame_mgr_cls._compute_num_partitions()
            num_splits = min(len(columns), num_partitions)
            chunk_size = max(1, (total_bytes - f.tell()) // num_partitions)

            partition_ids = []
            index_ids = []
            dtypes_ids = []

            column_chunksize = compute_chunksize(empty_pd_df, num_splits, axis=1)
            if column_chunksize > len(columns):
                column_widths = [len(columns)]
                num_splits = 1
            else:
                # Every split gets ``column_chunksize`` columns except the
                # last one, which takes whatever remains.
                column_widths = [
                    column_chunksize
                    if i != num_splits - 1
                    else len(columns) - (column_chunksize * (num_splits - 1))
                    for i in range(num_splits)
                ]

            while f.tell() < total_bytes:
                start = f.tell()
                # Jump ahead by one chunk, then finish the current line so the
                # byte range ends on a line boundary.
                f.seek(chunk_size, os.SEEK_CUR)
                f.readline()
                partition_id = cls.read_json_remote_task._remote(
                    args=(path_or_buf, num_splits, start, f.tell(), kwargs),
                    num_return_vals=num_splits + 3,
                )
                # Of the three trailing return values, [-3] is used as the row
                # count and [-2] as the dtypes; the last one is not used here.
                partition_ids.append(partition_id[:-3])
                index_ids.append(partition_id[-3])
                dtypes_ids.append(partition_id[-2])

        row_lengths = ray.get(index_ids)
        new_index = pandas.RangeIndex(sum(row_lengths))

        # Combine the per-chunk dtypes into one common dtype per column.
        dtypes = (
            pandas.concat(ray.get(dtypes_ids), axis=1)
            .apply(lambda row: find_common_type(row.values), axis=1)
            .squeeze(axis=0)
        )

        partition_ids = [
            [
                cls.frame_partition_cls(
                    partition_ids[i][j],
                    length=row_lengths[i],
                    width=column_widths[j],
                )
                for j in range(len(partition_ids[i]))
            ]
            for i in range(len(partition_ids))
        ]

        if isinstance(dtypes, pandas.Series):
            dtypes.index = columns
        else:
            dtypes = pandas.Series(dtypes, index=columns)

        new_query_compiler = cls.query_compiler_cls(
            cls.frame_mgr_cls(np.array(partition_ids)),
            new_index,
            columns,
            dtypes=dtypes,
        )
        return new_query_compiler
def _compute_index_grouped(self, numerical=False):
    """
    Construct an index of group IDs.

    Parameters
    ----------
    numerical : bool, default: False
        Whether the group indices should be positional (True) or label-based (False).

    Returns
    -------
    dict
        A dict of {group name -> group indices} values.

    See Also
    --------
    pandas.core.groupby.GroupBy.groups
    """
    # We end up using pure pandas to compute group indices, so raise a warning
    ErrorMessage.default_to_pandas("Group indices computation")

    # Split level-by and column-by since they are serialized in different ways
    by = None
    level = []
    if self._level is not None:
        level = self._level
        if not isinstance(level, list):
            level = [level]
    elif isinstance(self._by, list):
        by = []
        for o in self._by:
            # Hashable entries that match an index name are treated as levels;
            # everything else stays in ``by``.
            if hashable(o) and o in self._query_compiler.get_index_names(
                    self._axis):
                level.append(o)
            else:
                by.append(o)
    else:
        by = self._by

    is_multi_by = self._is_multi_by or (by is not None and len(level) > 0)
    # `dropna` param is the only one that matters for the group indices result
    dropna = self._kwargs.get("dropna", True)

    if hasattr(self._by, "columns") and is_multi_by:
        by = list(self._by.columns)

    if is_multi_by:
        # Because we are doing a collect (to_pandas) here and then groupby, we
        # end up using pandas implementation. Add the warning so the user is
        # aware.
        ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
        if isinstance(by, list) and all(
                is_label(self._df, o, self._axis) for o in by):
            # All keys are labels of the frame: collect only those columns.
            pandas_df = self._df._query_compiler.getitem_column_array(
                by).to_pandas()
        else:
            by = try_cast_to_pandas(by, squeeze=True)
            pandas_df = self._df._to_pandas()
        by = wrap_into_list(by, level)
        groupby_obj = pandas_df.groupby(by=by, dropna=dropna)
        return groupby_obj.indices if numerical else groupby_obj.groups
    else:
        if isinstance(self._by, type(self._query_compiler)):
            by = self._by.to_pandas().squeeze().values
        elif self._by is None:
            # Grouping by level(s) only: drop every index level that was not
            # requested, by name or by position.
            index = self._query_compiler.get_axis(self._axis)
            levels_to_drop = [
                i for i, name in enumerate(index.names)
                if name not in level and i not in level
            ]
            by = index.droplevel(levels_to_drop)
            if isinstance(by, pandas.MultiIndex):
                by = by.reorder_levels(level)
        else:
            by = self._by

        axis_labels = self._query_compiler.get_axis(self._axis)
        if numerical:
            # Since we want positional indices of the groups, we want to group
            # on a `RangeIndex`, not on the actual index labels
            axis_labels = pandas.RangeIndex(len(axis_labels))
        # `pandas.Index.groupby` doesn't take any parameters except `by`.
        # Have to convert an Index to a Series to be able to process `dropna=False`:
        if dropna:
            return axis_labels.groupby(by)
        else:
            groupby_obj = axis_labels.to_series().groupby(by, dropna=dropna)
            return groupby_obj.indices if numerical else groupby_obj.groups
def read_orc(
    path: FilePathOrBuffer, columns: Optional[List[str]] = None, **kwargs
) -> DataFrame:
    """
    Load an ORC object with ``pandas.read_orc`` and wrap it in a ``DataFrame``.

    ``path`` and ``columns`` are passed positionally, ``kwargs`` by keyword.
    """
    ErrorMessage.default_to_pandas("read_orc")
    orc_frame = pandas.read_orc(path, columns, **kwargs)
    return DataFrame(orc_frame)
def _index_grouped(self):
    """
    Return the mapping of group name to labels, computing and caching it on first use.

    The result is stored in ``self._index_grouped_cache`` and reused on
    subsequent calls.

    Returns
    -------
    dict
        The ``groups`` of a pandas groupby when grouping by multiple keys,
        otherwise the result of ``groupby`` on the index (``axis == 0``) or
        on the columns.
    """
    if self._index_grouped_cache is None:
        # Split level-by and column-by since they are serialized in different ways
        by = None
        level = []
        if self._level is not None:
            level = self._level
            if not isinstance(level, list):
                level = [level]
        elif isinstance(self._by, list):
            by = []
            for o in self._by:
                # Hashable entries that match an index name are treated as
                # levels; everything else stays in ``by``.
                if hashable(o) and o in self._query_compiler.get_index_names(
                    self._axis
                ):
                    level.append(o)
                else:
                    by.append(o)
        else:
            by = self._by

        is_multi_by = self._is_multi_by or (by is not None and len(level) > 0)

        if hasattr(self._by, "columns") and is_multi_by:
            by = list(self._by.columns)

        if is_multi_by:
            # Because we are doing a collect (to_pandas) here and then groupby, we
            # end up using pandas implementation. Add the warning so the user is
            # aware.
            ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
            ErrorMessage.default_to_pandas("Groupby with multiple columns")
            if isinstance(by, list) and all(
                is_label(self._df, o, self._axis) for o in by
            ):
                # All keys are labels of the frame: collect only those columns.
                pandas_df = self._df._query_compiler.getitem_column_array(
                    by
                ).to_pandas()
            else:
                by = try_cast_to_pandas(by, squeeze=True)
                pandas_df = self._df._to_pandas()
            by = wrap_into_list(by, level)
            self._index_grouped_cache = pandas_df.groupby(by=by).groups
        else:
            if isinstance(self._by, type(self._query_compiler)):
                by = self._by.to_pandas().squeeze().values
            elif self._by is None:
                # Grouping by level(s) only: drop every index level that was
                # not requested, by name or by position.
                index = self._query_compiler.get_axis(self._axis)
                levels_to_drop = [
                    i
                    for i, name in enumerate(index.names)
                    if name not in level and i not in level
                ]
                by = index.droplevel(levels_to_drop)
                if isinstance(by, pandas.MultiIndex):
                    by = by.reorder_levels(level)
            else:
                by = self._by
            if self._axis == 0:
                self._index_grouped_cache = self._index.groupby(by)
            else:
                self._index_grouped_cache = self._columns.groupby(by)
    return self._index_grouped_cache