def read(
    self, path, columns=None, storage_options: StorageOptions = None, **kwargs
):
    use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
    if use_nullable_dtypes:
        raise ValueError(
            "The 'use_nullable_dtypes' argument is not supported for the "
            "fastparquet engine"
        )
    path = stringify_path(path)
    parquet_kwargs = {}
    handles = None
    if is_fsspec_url(path):
        fsspec = import_optional_dependency("fsspec")
        parquet_kwargs["open_with"] = lambda path, _: fsspec.open(
            path, "rb", **(storage_options or {})
        ).open()
    elif isinstance(path, str) and not os.path.isdir(path):
        # use get_handle only when we are very certain that it is not a directory
        # fsspec resources can also point to directories
        # this branch is used for example when reading from non-fsspec URLs
        handles = get_handle(path, "rb", is_text=False)
        path = handles.handle
    parquet_file = self.api.ParquetFile(path, **parquet_kwargs)

    result = parquet_file.to_pandas(columns=columns, **kwargs)

    if handles is not None:
        handles.close()
    return result

def read(
    self, path, columns=None, storage_options: StorageOptions = None, **kwargs
):
    if is_fsspec_url(path) and "filesystem" not in kwargs:
        import_optional_dependency("fsspec")
        import fsspec.core

        fs, path = fsspec.core.url_to_fs(path, **(storage_options or {}))
        should_close = False
    else:
        if storage_options:
            raise ValueError(
                "storage_options passed with buffer or non-fsspec filepath"
            )
        fs = kwargs.pop("filesystem", None)
        should_close = False
        path = _expand_user(path)

    if not fs:
        path, _, _, should_close = get_filepath_or_buffer(path)

    kwargs["use_pandas_metadata"] = True
    result = self.api.parquet.read_table(
        path, columns=columns, filesystem=fs, **kwargs
    ).to_pandas()
    if should_close:
        path.close()

    return result

def _get_path_or_handle(
    path: FilePathOrBuffer,
    fs: Any,
    storage_options: StorageOptions = None,
    mode: str = "rb",
    is_dir: bool = False,
) -> tuple[FilePathOrBuffer, IOHandles | None, Any]:
    """File handling for PyArrow."""
    path_or_handle = stringify_path(path)
    if is_fsspec_url(path_or_handle) and fs is None:
        fsspec = import_optional_dependency("fsspec")
        fs, path_or_handle = fsspec.core.url_to_fs(
            path_or_handle, **(storage_options or {})
        )
    elif storage_options and (not is_url(path_or_handle) or mode != "rb"):
        # can't write to a remote url
        # without making use of fsspec at the moment
        raise ValueError("storage_options passed with buffer, or non-supported URL")

    handles = None
    if (
        not fs
        and not is_dir
        and isinstance(path_or_handle, str)
        and not os.path.isdir(path_or_handle)
    ):
        # use get_handle only when we are very certain that it is not a directory
        # fsspec resources can also point to directories
        # this branch is used for example when reading from non-fsspec URLs
        handles = get_handle(
            path_or_handle, mode, is_text=False, storage_options=storage_options
        )
        fs = None
        path_or_handle = handles.handle
    return path_or_handle, handles, fs

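# A minimal caller sketch (hypothetical, not the pandas implementation): whatever
# _get_path_or_handle returns in `handles` must be closed once pyarrow has
# consumed the data.
import pyarrow.parquet as pq


def _read_table_sketch(path, storage_options=None):
    path_or_handle, handles, filesystem = _get_path_or_handle(
        path, fs=None, storage_options=storage_options, mode="rb"
    )
    try:
        return pq.read_table(path_or_handle, filesystem=filesystem).to_pandas()
    finally:
        if handles is not None:
            handles.close()
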
def _get_data_from_filepath(self, filepath_or_buffer):
    """
    The function read_json accepts three input types:
        1. filepath (string-like)
        2. file-like object (e.g. open file object, StringIO)
        3. JSON string

    This method turns (1) into (2) to simplify the rest of the processing.
    It returns input types (2) and (3) unchanged.
    """
    # if it is a string but the file does not exist, it might be a JSON string
    filepath_or_buffer = stringify_path(filepath_or_buffer)
    if (
        not isinstance(filepath_or_buffer, str)
        or is_url(filepath_or_buffer)
        or is_fsspec_url(filepath_or_buffer)
        or file_exists(filepath_or_buffer)
    ):
        self.handles = get_handle(
            filepath_or_buffer,
            "r",
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
            errors=self.encoding_errors,
        )
        filepath_or_buffer = self.handles.handle

    return filepath_or_buffer

def test_is_fsspec_url():
    assert icom.is_fsspec_url("gcs://pandas/somethingelse.com")
    assert icom.is_fsspec_url("gs://pandas/somethingelse.com")
    # the following is the only remote URL that is handled without fsspec
    assert not icom.is_fsspec_url("http://pandas/somethingelse.com")
    assert not icom.is_fsspec_url("random:pandas/somethingelse.com")
    assert not icom.is_fsspec_url("/local/path")
    assert not icom.is_fsspec_url("relative/local/path")
    # fsspec URL in string should not be recognized
    assert not icom.is_fsspec_url("this is not fsspec://url")
    assert not icom.is_fsspec_url("{'url': 'gs://pandas/somethingelse.com'}")
    # accept everything that conforms to RFC 3986 schema
    assert icom.is_fsspec_url("RFC-3986+compliant.spec://something")

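# A rough sketch of the check the test above exercises.  The real function lives
# in pandas.io.common; the regex here is an assumption that mirrors the behaviour
# the test describes (any RFC 3986 scheme is accepted, except http/https).
import re

_RFC_3986_PATTERN_SKETCH = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*://")


def is_fsspec_url_sketch(url) -> bool:
    # URL-like string with a scheme, but not a plain http(s) URL
    return (
        isinstance(url, str)
        and bool(_RFC_3986_PATTERN_SKETCH.match(url))
        and not url.startswith(("http://", "https://"))
    )
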
def read(self, path, columns=None, **kwargs):
    if is_fsspec_url(path):
        fsspec = import_optional_dependency("fsspec")
        open_with = lambda path, _: fsspec.open(path, "rb").open()
        parquet_file = self.api.ParquetFile(path, open_with=open_with)
    else:
        path, _, _, _ = get_filepath_or_buffer(path)
        parquet_file = self.api.ParquetFile(path)

    return parquet_file.to_pandas(columns=columns, **kwargs)

def _check_source_format(self, src):
    src = self.source
    if is_url(src):
        fmt = 'url'
    elif is_file_like(src):
        fmt = 'filelike'
    elif is_fsspec_url(src):
        fmt = 's3'
    else:
        fmt = 'invalid'
    return fmt

def test_is_fsspec_url():
    assert icom.is_fsspec_url("gcs://pandas/somethingelse.com")
    assert icom.is_fsspec_url("gs://pandas/somethingelse.com")
    # the following is the only remote URL that is handled without fsspec
    assert not icom.is_fsspec_url("http://pandas/somethingelse.com")
    assert not icom.is_fsspec_url("random:pandas/somethingelse.com")
    assert not icom.is_fsspec_url("/local/path")
    assert not icom.is_fsspec_url("relative/local/path")

def write(
    self,
    df: DataFrame,
    path: FilePathOrBuffer[AnyStr],
    compression: Optional[str] = "snappy",
    index: Optional[bool] = None,
    storage_options: StorageOptions = None,
    partition_cols: Optional[List[str]] = None,
    **kwargs,
):
    self.validate_dataframe(df)

    from_pandas_kwargs: Dict[str, Any] = {"schema": kwargs.pop("schema", None)}
    if index is not None:
        from_pandas_kwargs["preserve_index"] = index

    table = self.api.Table.from_pandas(df, **from_pandas_kwargs)

    path = stringify_path(path)
    # get_handle could be used here (for write_table, not for write_to_dataset)
    # but it would complicate the code.
    if is_fsspec_url(path) and "filesystem" not in kwargs:
        # make fsspec instance, which pyarrow will use to open paths
        fsspec = import_optional_dependency("fsspec")
        fs, path = fsspec.core.url_to_fs(path, **(storage_options or {}))
        kwargs["filesystem"] = fs
    elif storage_options:
        raise ValueError(
            "storage_options passed with file object or non-fsspec file path"
        )

    if partition_cols is not None:
        # writes to multiple files under the given path
        self.api.parquet.write_to_dataset(
            table,
            path,
            compression=compression,
            partition_cols=partition_cols,
            **kwargs,
        )
    else:
        # write to single output file
        self.api.parquet.write_table(table, path, compression=compression, **kwargs)

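# Usage sketch: writing through the fsspec branch above.  The destination URL and
# credentials are hypothetical; storage_options are expanded into
# fsspec.core.url_to_fs() to build the filesystem pyarrow writes through.
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
df.to_parquet(
    "gs://my-bucket/out.parquet",        # hypothetical fsspec URL
    engine="pyarrow",
    storage_options={"token": "anon"},   # forwarded to fsspec / gcsfs
)
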
def write(
    self,
    df: DataFrame,
    path,
    compression="snappy",
    index=None,
    partition_cols=None,
    storage_options: StorageOptions = None,
    **kwargs,
):
    self.validate_dataframe(df)
    # thriftpy/protocol/compact.py:339:
    # DeprecationWarning: tostring() is deprecated.
    # Use tobytes() instead.

    if "partition_on" in kwargs and partition_cols is not None:
        raise ValueError(
            "Cannot use both partition_on and "
            "partition_cols. Use partition_cols for partitioning data"
        )
    elif "partition_on" in kwargs:
        partition_cols = kwargs.pop("partition_on")

    if partition_cols is not None:
        kwargs["file_scheme"] = "hive"

    # cannot use get_handle as write() does not accept file buffers
    path = stringify_path(path)
    if is_fsspec_url(path):
        fsspec = import_optional_dependency("fsspec")
        # if filesystem is provided by fsspec, file must be opened in 'wb' mode.
        kwargs["open_with"] = lambda path, _: fsspec.open(
            path, "wb", **(storage_options or {})
        ).open()
    elif storage_options:
        raise ValueError(
            "storage_options passed with file object or non-fsspec file path"
        )

    with catch_warnings(record=True):
        self.api.write(
            path,
            df,
            compression=compression,
            write_index=index,
            partition_on=partition_cols,
            **kwargs,
        )

def read(
    self, path, columns=None, storage_options: StorageOptions = None, **kwargs
):
    parquet_kwargs = {}
    use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
    # Technically works with 0.7.0, but was incorrect
    # so lets just require 0.7.1
    if Version(self.api.__version__) >= Version("0.7.1"):
        # Need to set even for use_nullable_dtypes = False,
        # since our defaults differ
        parquet_kwargs["pandas_nulls"] = use_nullable_dtypes
    else:
        if use_nullable_dtypes:
            raise ValueError(
                "The 'use_nullable_dtypes' argument is not supported for the "
                "fastparquet engine for fastparquet versions less than 0.7.1"
            )
    path = stringify_path(path)
    handles = None
    if is_fsspec_url(path):
        fsspec = import_optional_dependency("fsspec")

        if Version(self.api.__version__) > Version("0.6.1"):
            parquet_kwargs["fs"] = fsspec.open(
                path, "rb", **(storage_options or {})
            ).fs
        else:
            parquet_kwargs["open_with"] = lambda path, _: fsspec.open(
                path, "rb", **(storage_options or {})
            ).open()
    elif isinstance(path, str) and not os.path.isdir(path):
        # use get_handle only when we are very certain that it is not a directory
        # fsspec resources can also point to directories
        # this branch is used for example when reading from non-fsspec URLs
        handles = get_handle(
            path, "rb", is_text=False, storage_options=storage_options
        )
        path = handles.handle

    parquet_file = self.api.ParquetFile(path, **parquet_kwargs)

    result = parquet_file.to_pandas(columns=columns, **kwargs)

    if handles is not None:
        handles.close()
    return result

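# Usage sketch: a remote read that exercises the fsspec branch above.  The
# bucket/path and the anonymous-access option are hypothetical; storage_options
# are passed straight through to fsspec.open().
import pandas as pd

df = pd.read_parquet(
    "s3://my-bucket/data.parquet",    # hypothetical fsspec URL
    engine="fastparquet",
    storage_options={"anon": True},   # forwarded to fsspec / s3fs
)
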
def get_data_from_filepath(
    filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str],
    encoding,
    compression: CompressionOptions,
    storage_options: StorageOptions,
) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:
    """
    Extract raw XML data.

    The method accepts three input types:
        1. filepath (string-like)
        2. file-like object (e.g. open file object, StringIO)
        3. XML string or bytes

    This method turns (1) into (2) to simplify the rest of the processing.
    It returns input types (2) and (3) unchanged.
    """
    if not isinstance(filepath_or_buffer, bytes):
        filepath_or_buffer = stringify_path(filepath_or_buffer)

    if (
        isinstance(filepath_or_buffer, str)
        and not filepath_or_buffer.startswith(("<?xml", "<"))
    ) and (
        not isinstance(filepath_or_buffer, str)
        or is_url(filepath_or_buffer)
        or is_fsspec_url(filepath_or_buffer)
        or file_exists(filepath_or_buffer)
    ):
        with get_handle(
            filepath_or_buffer,
            "r",
            encoding=encoding,
            compression=compression,
            storage_options=storage_options,
        ) as handle_obj:
            filepath_or_buffer = (
                # error: Incompatible types in assignment (expression has type
                # "Union[str, IO[str]]", variable has type "Union[Union[str,
                # PathLike[str]], bytes, ReadBuffer[bytes], ReadBuffer[str]]")
                handle_obj.handle.read()  # type: ignore[assignment]
                if hasattr(handle_obj.handle, "read")
                else handle_obj.handle
            )

    return filepath_or_buffer

def write(
    self,
    df: DataFrame,
    path: FilePathOrBuffer[AnyStr],
    compression: Optional[str] = "snappy",
    index: Optional[bool] = None,
    partition_cols: Optional[List[str]] = None,
    **kwargs,
):
    self.validate_dataframe(df)

    from_pandas_kwargs: Dict[str, Any] = {"schema": kwargs.pop("schema", None)}
    if index is not None:
        from_pandas_kwargs["preserve_index"] = index

    table = self.api.Table.from_pandas(df, **from_pandas_kwargs)

    if is_fsspec_url(path) and "filesystem" not in kwargs:
        # make fsspec instance, which pyarrow will use to open paths
        import_optional_dependency("fsspec")
        import fsspec.core

        fs, path = fsspec.core.url_to_fs(path)
        kwargs["filesystem"] = fs
    else:
        path = _expand_user(path)

    if partition_cols is not None:
        # writes to multiple files under the given path
        self.api.parquet.write_to_dataset(
            table,
            path,
            compression=compression,
            partition_cols=partition_cols,
            **kwargs,
        )
    else:
        # write to single output file
        self.api.parquet.write_table(table, path, compression=compression, **kwargs)

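# Usage sketch: partition_cols routes the call through write_to_dataset above,
# producing a hive-style directory tree.  The output directory and column names
# are hypothetical.
import pandas as pd

df = pd.DataFrame({"year": [2020, 2021], "value": [1.0, 2.0]})
df.to_parquet("dataset_dir", engine="pyarrow", partition_cols=["year"])
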
def read(
    self, path, columns=None, storage_options: StorageOptions = None, **kwargs
) -> DataFrame:
    parquet_kwargs: dict[str, Any] = {}
    use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
    if Version(self.api.__version__) >= Version("0.7.1"):
        # We are disabling nullable dtypes for fastparquet pending discussion
        parquet_kwargs["pandas_nulls"] = False
    if use_nullable_dtypes:
        raise ValueError(
            "The 'use_nullable_dtypes' argument is not supported for the "
            "fastparquet engine"
        )
    path = stringify_path(path)
    handles = None
    if is_fsspec_url(path):
        fsspec = import_optional_dependency("fsspec")

        if Version(self.api.__version__) > Version("0.6.1"):
            parquet_kwargs["fs"] = fsspec.open(
                path, "rb", **(storage_options or {})
            ).fs
        else:
            parquet_kwargs["open_with"] = lambda path, _: fsspec.open(
                path, "rb", **(storage_options or {})
            ).open()
    elif isinstance(path, str) and not os.path.isdir(path):
        # use get_handle only when we are very certain that it is not a directory
        # fsspec resources can also point to directories
        # this branch is used for example when reading from non-fsspec URLs
        handles = get_handle(
            path, "rb", is_text=False, storage_options=storage_options
        )
        path = handles.handle

    try:
        parquet_file = self.api.ParquetFile(path, **parquet_kwargs)
        return parquet_file.to_pandas(columns=columns, **kwargs)
    finally:
        if handles is not None:
            handles.close()

def write(
    self,
    df: DataFrame,
    path,
    compression="snappy",
    index=None,
    partition_cols=None,
    **kwargs,
):
    self.validate_dataframe(df)
    # thriftpy/protocol/compact.py:339:
    # DeprecationWarning: tostring() is deprecated.
    # Use tobytes() instead.

    if "partition_on" in kwargs and partition_cols is not None:
        raise ValueError(
            "Cannot use both partition_on and "
            "partition_cols. Use partition_cols for partitioning data"
        )
    elif "partition_on" in kwargs:
        partition_cols = kwargs.pop("partition_on")

    if partition_cols is not None:
        kwargs["file_scheme"] = "hive"

    if is_fsspec_url(path):
        fsspec = import_optional_dependency("fsspec")
        # if filesystem is provided by fsspec, file must be opened in 'wb' mode.
        kwargs["open_with"] = lambda path, _: fsspec.open(path, "wb").open()
    else:
        path, _, _, _ = get_filepath_or_buffer(path)

    with catch_warnings(record=True):
        self.api.write(
            path,
            df,
            compression=compression,
            write_index=index,
            partition_on=partition_cols,
            **kwargs,
        )

def _get_data_from_filepath(self, filepath_or_buffer):
    """
    The function read_json accepts three input types:
        1. filepath (string-like)
        2. file-like object (e.g. open file object, StringIO)
        3. JSON string

    This method turns (1) into (2) to simplify the rest of the processing.
    It returns input types (2) and (3) unchanged.

    It raises FileNotFoundError if the input is a string ending in one of
    .json, .json.gz, .json.bz2, etc. but no such file exists.
    """
    # if it is a string but the file does not exist, it might be a JSON string
    filepath_or_buffer = stringify_path(filepath_or_buffer)
    if (
        not isinstance(filepath_or_buffer, str)
        or is_url(filepath_or_buffer)
        or is_fsspec_url(filepath_or_buffer)
        or file_exists(filepath_or_buffer)
    ):
        self.handles = get_handle(
            filepath_or_buffer,
            "r",
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
            errors=self.encoding_errors,
        )
        filepath_or_buffer = self.handles.handle
    elif (
        isinstance(filepath_or_buffer, str)
        and filepath_or_buffer.lower().endswith(
            (".json",) + tuple(f".json{c}" for c in _extension_to_compression)
        )
        and not file_exists(filepath_or_buffer)
    ):
        raise FileNotFoundError(f"File {filepath_or_buffer} does not exist")

    return filepath_or_buffer

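# Usage sketch of the input types handled above; "data.json" is a hypothetical
# file and would raise FileNotFoundError if it does not exist, exactly as the
# docstring describes.
from io import StringIO

import pandas as pd

df_from_buffer = pd.read_json(StringIO('[{"a": 1}, {"a": 2}]'))  # file-like object
# df_from_path = pd.read_json("data.json")                       # filepath
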
def get_data_from_filepath(
    filepath_or_buffer,
    encoding,
    compression,
    storage_options,
) -> Union[str, bytes, Buffer]:
    """
    Extract raw XML data.

    The method accepts three input types:
        1. filepath (string-like)
        2. file-like object (e.g. open file object, StringIO)
        3. XML string or bytes

    This method turns (1) into (2) to simplify the rest of the processing.
    It returns input types (2) and (3) unchanged.
    """
    filepath_or_buffer = stringify_path(filepath_or_buffer)

    if (
        isinstance(filepath_or_buffer, str)
        and not filepath_or_buffer.startswith(("<?xml", "<"))
    ) and (
        not isinstance(filepath_or_buffer, str)
        or is_url(filepath_or_buffer)
        or is_fsspec_url(filepath_or_buffer)
        or file_exists(filepath_or_buffer)
    ):
        with get_handle(
            filepath_or_buffer,
            "r",
            encoding=encoding,
            compression=compression,
            storage_options=storage_options,
        ) as handle_obj:
            filepath_or_buffer = (
                handle_obj.handle.read()
                if hasattr(handle_obj.handle, "read")
                else handle_obj.handle
            )

    return filepath_or_buffer

def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
    """
    Iterparse xml nodes.

    This method will read in local disk, decompressed XML files for elements
    and underlying descendants using iterparse, a method to iterate through
    an XML tree without holding entire XML tree in memory.

    Raises
    ------
    TypeError
        * If `iterparse` is not a dict or its dict value is not list-like.

    ParserError
        * If `path_or_buffer` is not a physical, decompressed file on disk.
        * If no data is returned from selected items in `iterparse`.

    Notes
    -----
    Namespace URIs will be removed from return node values. Also, elements
    with missing children or attributes in submitted list will have optional
    keys filled with None values.
    """
    dicts: list[dict[str, str | None]] = []
    row: dict[str, str | None] | None = None

    if not isinstance(self.iterparse, dict):
        raise TypeError(
            f"{type(self.iterparse).__name__} is not a valid type for iterparse"
        )

    row_node = next(iter(self.iterparse.keys())) if self.iterparse else ""
    if not is_list_like(self.iterparse[row_node]):
        raise TypeError(
            f"{type(self.iterparse[row_node])} is not a valid type "
            "for value in iterparse"
        )

    if (
        not isinstance(self.path_or_buffer, str)
        or is_url(self.path_or_buffer)
        or is_fsspec_url(self.path_or_buffer)
        or self.path_or_buffer.startswith(("<?xml", "<"))
        or infer_compression(self.path_or_buffer, "infer") is not None
    ):
        raise ParserError(
            "iterparse is designed for large XML files that are fully extracted on "
            "local disk and not as compressed files or online sources."
        )

    for event, elem in iterparse(self.path_or_buffer, events=("start", "end")):
        curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag

        if event == "start":
            if curr_elem == row_node:
                row = {}

        if row is not None:
            if self.names:
                for col, nm in zip(self.iterparse[row_node], self.names):
                    if curr_elem == col:
                        elem_val = elem.text.strip() if elem.text else None
                        if row.get(nm) != elem_val and nm not in row:
                            row[nm] = elem_val
                    if col in elem.attrib:
                        if elem.attrib[col] not in row.values() and nm not in row:
                            row[nm] = elem.attrib[col]
            else:
                for col in self.iterparse[row_node]:
                    if curr_elem == col:
                        row[col] = elem.text.strip() if elem.text else None
                    if col in elem.attrib:
                        row[col] = elem.attrib[col]

        if event == "end":
            if curr_elem == row_node and row is not None:
                dicts.append(row)
                row = None

            elem.clear()
            if hasattr(elem, "getprevious"):
                while (
                    elem.getprevious() is not None and elem.getparent() is not None
                ):
                    del elem.getparent()[0]

    if dicts == []:
        raise ParserError("No result from selected items in iterparse.")

    keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
    dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]

    if self.names:
        dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]

    return dicts

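# Usage sketch: iterparse as exposed through pandas.read_xml, which calls the
# method above.  The file name, row element, and field names are hypothetical;
# the file must be an uncompressed XML document on local disk, as enforced above.
import pandas as pd

df = pd.read_xml(
    "books.xml",                                      # hypothetical local file
    iterparse={"book": ["title", "year", "author"]},  # row element -> fields
)
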
def _read(cls, path, engine, columns, **kwargs):
    """
    Load a parquet object from the file path, returning a query compiler.

    Parameters
    ----------
    path : str, path object or file-like object
        The filepath of the parquet file in local filesystem or hdfs.
    engine : str
        Parquet library to use (only 'PyArrow' is supported for now).
    columns : list
        If not None, only these columns will be read from the file.
    **kwargs : dict
        Keyword arguments.

    Returns
    -------
    BaseQueryCompiler
        A new Query Compiler.

    Notes
    -----
    ParquetFile API is used. Please refer to the documentation here
    https://arrow.apache.org/docs/python/parquet.html
    """
    from pyarrow.parquet import ParquetDataset
    from modin.pandas.io import PQ_INDEX_REGEX

    if isinstance(path, str) and os.path.isdir(path):
        partitioned_columns = set()
        # We do a tree walk of the path directory because partitioned
        # parquet directories have a unique column at each directory level.
        # Thus, we can use os.walk(), which does a dfs search, to walk
        # through the different columns that the data is partitioned on
        for (root, dir_names, files) in os.walk(path):
            if dir_names:
                partitioned_columns.add(dir_names[0].split("=")[0])
            if files:
                # Metadata files, git files, .DSStore
                if files[0][0] == ".":
                    continue
                break
        partitioned_columns = list(partitioned_columns)
        if len(partitioned_columns):
            ErrorMessage.default_to_pandas("Mixed Partitioning Columns in Parquet")
            return cls.single_worker_read(
                path, engine=engine, columns=columns, **kwargs
            )

    if not columns:
        import fsspec.core
        from pandas.io.common import is_fsspec_url

        fs, path_ = (
            fsspec.core.url_to_fs(path, **(kwargs.get("storage_options") or {}))
            if is_fsspec_url(path)
            else (None, path)
        )
        dataset = ParquetDataset(path_, filesystem=fs, use_legacy_dataset=False)
        column_names = dataset.schema.names

        if dataset.schema.pandas_metadata is not None:
            index_columns = dataset.schema.pandas_metadata.get("index_columns", [])
            column_names = [c for c in column_names if c not in index_columns]
        columns = [name for name in column_names if not PQ_INDEX_REGEX.match(name)]
    return cls.build_query_compiler(path, columns, **kwargs)

def read(
    self, path, columns=None, storage_options: StorageOptions = None, **kwargs
):
    path = stringify_path(path)
    handles = None
    fs = kwargs.pop("filesystem", None)
    if is_fsspec_url(path) and fs is None:
        fsspec = import_optional_dependency("fsspec")
        fs, path = fsspec.core.url_to_fs(path, **(storage_options or {}))
    elif storage_options:
        raise ValueError("storage_options passed with buffer or non-fsspec filepath")
    if not fs and isinstance(path, str) and not os.path.isdir(path):
        # use get_handle only when we are very certain that it is not a directory
        # fsspec resources can also point to directories
        # this branch is used for example when reading from non-fsspec URLs
        handles = get_handle(path, "rb", is_text=False)
        path = handles.handle

    kwargs["use_pandas_metadata"] = True
    result = self.api.parquet.read_table(
        path, columns=columns, filesystem=fs, **kwargs
    ).to_pandas()

    if handles is not None:
        handles.close()

    return result