def ls(path: str, full_path: bool = False, recursive: bool = False, **kwargs) -> List[str]:
    """ List the contents of a local/s3 directory

    Parameters
    -----------
    path : str
        Local or S3 Path
    full_path : bool
        Include the full path, or just the path relative to `path`
    recursive : bool
        Recursively list within the given path
    kwargs : Dict
        If path is an s3 path, fs: s3fs.S3FileSystem can be specified

    Returns
    --------
    List[str]
    """
    if s3.is_s3path(path):
        return s3.ls(path, full_path, recursive, **kwargs)
    else:
        return local.ls(path, full_path, recursive)
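
# Illustrative sketch (not part of the original module): how ls might be called for a
# local directory versus an S3 prefix. The paths below are hypothetical, and passing
# fs= assumes the s3 branch accepts an s3fs.S3FileSystem via kwargs as documented above.
def _example_ls() -> None:
    # Local: entries relative to the directory
    print(ls("/tmp/some_dir"))
    # S3: full paths, walked recursively, reusing an existing filesystem object
    print(ls("s3://example-bucket/some/prefix", full_path=True, recursive=True,
             fs=s3fs.S3FileSystem()))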
def save_parquet_fp(df: pd.DataFrame, path: str, **kwargs) -> None:
    """ Helper function to save a DataFrame to a parquet DataSet

    See the [fastparquet Docs](https://fastparquet.readthedocs.io/en/latest/api.html)
    for more information

    Parameters
    -----------
    df : pd.DataFrame
        The DataFrame to export to parquet
    path : str
        The root path to save the DataFrame to; this can be either S3 or local

    Additional Parameters
    ----------------------
    The following parameters are optional and can tweak how the DataFrame gets
    converted to parquet.

    fs : s3fs.S3FileSystem
        This will be used to save the data to S3 if applicable
    file_scheme : "simple"|"hive" (default "hive")
        If simple: all goes in a single file
        If hive: each row group is in a separate file, and a separate file
        (called "_metadata") contains the metadata.
    write_index : bool
        Whether or not to write the index to a separate column. By default we
        write the index *if* it is not 0, 1, ..., n.
    partition_on : List[str]
        Passed to groupby in order to split data within each row-group,
        producing a structured directory tree.
        Note: as with pandas, null values will be dropped. Ignored if
        file_scheme is simple.

    See the [fastparquet.write](https://fastparquet.readthedocs.io/en/latest/api.html#fastparquet.write)
    documentation for full details.

    Returns
    --------
    None
    """
    import fastparquet as fp

    fs = kwargs.pop("fs", None)
    file_scheme = kwargs.pop("file_scheme", "hive")

    if s3.is_s3path(path):
        fs = fs or s3fs.S3FileSystem()
        myopen = fs.open
    else:
        myopen = open

    logger.info("Writing DataFrame to Parquet using fastparquet")
    fp.write(path, df, file_scheme=file_scheme, open_with=myopen, **kwargs)
    logger.info("Done.")
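
# Illustrative sketch (not part of the original module): writing a hive-partitioned
# parquet dataset with save_parquet_fp. The S3 prefix and column names are hypothetical;
# partition_on and write_index are forwarded to fastparquet.write as documented above.
def _example_save_parquet_fp() -> None:
    df = pd.DataFrame({"city": ["denver", "boulder", "denver"],
                       "price": [120.0, 95.0, 150.0]})
    save_parquet_fp(df,
                    "s3://example-bucket/datasets/listings",
                    file_scheme="hive",
                    partition_on=["city"],  # one directory per city value
                    write_index=False)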
def cp(from_path: str,
       to_path: str,
       overwrite: bool = True,
       include_folder_name: bool = True,
       **kwargs) -> None:
    """ Copy a file or directory of files from local/s3 to local/s3

    Parameters
    -----------
    from_path : str
        Directory/file path to copy
    to_path : str
        Path to copy file(s) to.
    overwrite : bool (default True)
        Should the to_path be overwritten if it already exists?
    include_folder_name : bool (default True)
        If copying a directory, add the directory name automatically to the
        to_path. i.e. if True, the entire folder will be copied to the to_path.
        If False, the *contents* of the directory will be copied to the to_path
    kwargs : Dict
        Extra arguments to pass to the appropriate cp (either _local.cp or _s3.cp)

    Returns
    --------
    None
    """
    if s3.is_s3path(from_path) or s3.is_s3path(to_path):
        s3.cp(from_path, to_path, overwrite, include_folder_name, **kwargs)
    else:
        local.cp(from_path, to_path, overwrite, include_folder_name)
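
# Illustrative sketch (not part of the original module): the effect of include_folder_name
# when copying a directory, and a mixed local-to-S3 copy. The paths are hypothetical.
def _example_cp() -> None:
    # Copies the folder itself: files land under /backup/reports/...
    cp("/data/reports", "/backup", include_folder_name=True)
    # Copies only the folder's contents: files land directly under /backup/...
    cp("/data/reports", "/backup", include_folder_name=False)
    # Any S3 path on either side routes the copy to the s3 implementation
    cp("/data/reports", "s3://example-bucket/backups", overwrite=False)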
def already_exists(path: str, **kwargs) -> bool:
    """ Check if a file/directory already exists

    Parameters
    -----------
    path : str
        File / Directory path
    kwargs : Dict
        If path is an s3 path, fs: s3fs.S3FileSystem can be optionally specified

    Returns
    --------
    bool
    """
    if s3.is_s3path(path):
        return s3.already_exists(path, **kwargs)
    else:
        return local.already_exists(path)
def load_parquet_fp(path: str, **kwargs) -> pd.DataFrame:
    """ Helper function to load a parquet Dataset as a Pandas DataFrame using fastparquet

    First creates a [ParquetFile](https://fastparquet.readthedocs.io/en/latest/api.html#fastparquet.ParquetFile)
    and then converts the ParquetFile to a DataFrame using .to_pandas.

    Refer to the fastparquet documentation for accepted arguments

    Parameters
    -----------
    path : str
        The root directory of the Parquet Dataset stored locally or in S3

    Returns
    --------
    pd.DataFrame
    """
    import fastparquet as fp

    logger.info(
        f"Reading in Parquet dataset to ParquetFile. kwargs passed {kwargs!r}")

    fs = kwargs.pop("fs", None)

    # Pull out arguments that should be directed to to_pandas
    to_pandas_args = parse_args(fp, ["ParquetFile", "to_pandas"], **kwargs)
    # Remove these args from kwargs
    kwargs = {
        k: v
        for k, v in kwargs.items() if k in set(kwargs) - set(to_pandas_args)
    }

    if s3.is_s3path(path):
        fs = fs or s3fs.S3FileSystem()
        myopen = fs.open
    else:
        myopen = open

    pf = fp.ParquetFile(path, open_with=myopen, **kwargs)
    df = pf.to_pandas(**to_pandas_args)
    return df
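
# Illustrative sketch (not part of the original module): loading a subset of columns with
# load_parquet_fp. This assumes parse_args routes "columns" to ParquetFile.to_pandas as
# described above; the dataset path and column names are hypothetical.
def _example_load_parquet_fp() -> None:
    df = load_parquet_fp("s3://example-bucket/datasets/listings",
                         columns=["city", "price"])
    print(df.head())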
def get_size(path: str, **kwargs) -> int:
    """ Return size of file/directory in bytes

    Parameters
    -----------
    path : str
        File / Directory path
    kwargs : Dict
        If path is an s3 path, fs: s3fs.S3FileSystem can be optionally specified

    Returns
    --------
    int
    """
    fs = kwargs.pop("fs", None)
    if s3.is_s3path(path):
        return s3.get_size(path, fs)
    else:
        return local.get_size(path)
def rm(path: str, dry_run: bool = False, **kwargs) -> None:
    """ Deletes a file or directory

    Parameters
    -----------
    path : str
        File path to delete
    dry_run : bool
        Print out the number of files to be deleted and exit. If False, the
        number of files to be deleted will be logged and the files will be removed
    kwargs : Dict
        If path is an s3 path, fs: s3fs.S3FileSystem can be specified

    Returns
    --------
    None
    """
    if s3.is_s3path(path):
        s3.rm(path, dry_run, **kwargs)
    else:
        local.rm(path, dry_run)
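
# Illustrative sketch (not part of the original module): using dry_run to preview a delete
# before actually removing anything. The path is hypothetical.
def _example_rm() -> None:
    rm("s3://example-bucket/scratch/tmp-output", dry_run=True)  # only reports the file count
    rm("s3://example-bucket/scratch/tmp-output")                # actually deletes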
def test_is_s3path(tmpdir):
    path = "s3://airdna-data/scratch/ewellinger"
    assert s3.is_s3path(path)
    assert not s3.is_s3path(tmpdir)
def save_parquet_pa(df: pd.DataFrame, path: str, **kwargs) -> None:
    """ Helper function to save a DataFrame to a parquet DataSet

    See the [PyArrow Docs](https://arrow.apache.org/docs/python/index.html)
    for more information

    Parameters
    -----------
    df : pd.DataFrame
        The DataFrame to export to parquet
    path : str
        The root path to save the DataFrame to; this can be either S3 or local

    Additional Parameters
    ----------------------
    The following parameters are optional and can tweak how the DataFrame gets
    converted to parquet.

    fs : s3fs.S3FileSystem
        This will be used to save the data to S3 if applicable
    schema : pyarrow.Schema
        Passed to pyarrow.Table.from_pandas()
        The expected schema of the Arrow Table. This can be used to indicate
        the type of columns if we cannot infer it automatically.
    preserve_index : bool (default False)
        Passed to pyarrow.Table.from_pandas()
        Whether to store the index as an additional column in the resulting Table
    nthreads : int
        Passed to pyarrow.Table.from_pandas()
        If greater than 1, convert columns to Arrow in parallel using the
        indicated number of threads
    columns : List[str]
        Passed to pyarrow.Table.from_pandas()
        List of columns to be converted. Uses all columns by default
    partition_cols : List[str]
        Passed to pyarrow.parquet.write_to_dataset()
        Column names by which to partition the dataset.
        Columns are partitioned in the order that they are given

    Returns
    --------
    None
    """
    import pyarrow as pa
    import pyarrow.parquet as pq

    logger.info(
        f"Converting dataframe to PyArrow Table. kwargs passed {kwargs!r}")

    fs = kwargs.pop("fs", None)
    schema = kwargs.pop("schema", None)
    preserve_index = kwargs.pop("preserve_index", False)
    nthreads = kwargs.pop("nthreads", None)
    columns = kwargs.pop("columns", None)
    partition_cols = kwargs.pop("partition_cols", None)

    # Convert the dataframe into a pyArrow Table object
    table = pa.Table.from_pandas(df,
                                 schema=schema,
                                 preserve_index=preserve_index,
                                 nthreads=nthreads,
                                 columns=columns)

    if not s3.is_s3path(path):
        fs = None
    elif fs is None:
        fs = s3fs.S3FileSystem()

    logger.info("Writing Arrow Table to Parquet Dataset")
    pq.write_to_dataset(table,
                        path,
                        partition_cols=partition_cols,
                        filesystem=fs,
                        preserve_index=preserve_index)
    logger.info("Done.")
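
# Illustrative sketch (not part of the original module): writing a partitioned dataset with
# the pyarrow-backed writer. The S3 prefix and column names are hypothetical; partition_cols
# is forwarded to pyarrow.parquet.write_to_dataset as documented above.
def _example_save_parquet_pa() -> None:
    df = pd.DataFrame({"year": [2018, 2018, 2019],
                       "month": [1, 2, 1],
                       "bookings": [10, 12, 9]})
    save_parquet_pa(df,
                    "s3://example-bucket/datasets/bookings",
                    partition_cols=["year", "month"],
                    preserve_index=False)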
def load_parquet_pa(path: str, **kwargs) -> pd.DataFrame:
    """ Helper function to load a parquet Dataset as a Pandas DataFrame

    Parameters
    -----------
    path : str
        The root directory of the Parquet Dataset stored locally or in S3

    Additional Parameters
    ----------------------
    The following parameters are optional and can tweak how the Dataset gets
    converted back to a DataFrame

    split_row_groups : bool (default False)
        Passed to pyarrow.parquet.ParquetDataset()
        Divide files into pieces for each row group in the file
    filters : List[Tuple]
        Passed to pyarrow.parquet.ParquetDataset()
        List of filters to apply, like `[('x', '=', 0), ...]`. This implements
        partition-level (hive) filtering only, i.e., to prevent the loading of
        some files of the dataset.
    columns : List[str]
        Passed to pyarrow.parquet.ParquetDataset().read()
        Names of columns to read from the dataset

    Any additional kwargs are passed to pyarrow.Table.to_pandas(). See
    [documentation](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html?highlight=table#pyarrow.Table.to_pandas)
    for more information

    Returns
    --------
    pd.DataFrame
    """
    import pyarrow.parquet as pq

    logger.info(
        f"Reading in Parquet dataset to PyArrow Table. kwargs passed {kwargs!r}"
    )

    fs = kwargs.pop("fs", None)
    split_row_groups = kwargs.pop("split_row_groups", False)
    filters = kwargs.pop("filters", None)
    columns = kwargs.pop("columns", None)

    if not s3.is_s3path(path):
        fs = None
    elif fs is None:
        fs = s3fs.S3FileSystem()

    dataset = pq.ParquetDataset(path,
                                filesystem=fs,
                                split_row_groups=split_row_groups,
                                filters=filters)
    table = dataset.read(columns=columns)

    logger.info(
        f"Converting PyArrow Table to Pandas DataFrame. kwargs passed {kwargs!r}"
    )
    return table.to_pandas(**kwargs)
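
# Illustrative sketch (not part of the original module): partition-level filtering on read.
# As noted in the docstring, filters only prevent whole partition directories (hive
# filtering) from being loaded. The path and column names are hypothetical.
def _example_load_parquet_pa() -> None:
    df = load_parquet_pa("s3://example-bucket/datasets/bookings",
                         filters=[("year", "=", 2019)],
                         columns=["month", "bookings"])
    print(df.head())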
def save_object(obj: object,
                path: str,
                file_type: Optional[str] = None,
                overwrite: bool = True,
                protocol: int = pickle.HIGHEST_PROTOCOL,
                **kwargs) -> None:
    """ Save an object in memory to a file

    Parameters
    -----------
    obj : object
        Python object in memory
    path : str
        Local or S3 path to save file. If file_type is not specified, an
        attempt will be made to infer the file_type based on the extension.
    file_type : str
        Type of file to save. Supported options are currently:
            "pickle"
                Additional kwargs are passed to pickle.dumps
            "raw"
            "csv"
                Save a pandas DataFrame as a CSV file.
                Additional kwargs are passed to obj.to_csv
                NOTE: A TypeError will be thrown if "csv" is specified and obj
                is not a pandas DataFrame
            "json"
                Additional kwargs are passed to json.dumps
            "parquet"
                Save a pandas DataFrame to a parquet dataset.
                Additional kwargs are passed to the _save_parquet helper
                function and are applied to either pa.Table.from_pandas() or
                pq.write_to_dataset() depending on the argument.
                NOTE: This functionality is still in beta and currently only
                works with a pandas dataframe as input.
    overwrite : bool
        Should the file be overwritten if it already exists?
    protocol : int
        Used when calling pickle
    kwargs : Dict
        The following extra parameters can be passed:
            fs : s3fs.S3FileSystem
                Used when the path is an s3 path
            acl : str
                Used to set the Access Control List settings when writing to S3

    Returns
    --------
    None
    """
    fs = kwargs.pop("fs", None)
    acl = kwargs.pop("acl", "bucket-owner-full-control")

    # Check to see if path already exists
    if not overwrite and already_exists(path, fs=fs):
        raise ValueError(f"overwrite set to False and {path!r} already exists")

    if file_type is None:
        file_type = _file_type_helper(path)

    if file_type == "pickle":
        logger.info(f"Saving obj as a pickle file. kwargs passed {kwargs!r}")
        obj = pickle.dumps(obj, protocol=protocol, **kwargs)
    elif file_type == "raw":
        logger.info("Saving obj as a raw file.")
        pass
    elif file_type == "csv":
        logger.info(f"Saving obj as a CSV file. kwargs passed {kwargs!r}")
        if not isinstance(obj, pd.DataFrame):
            raise TypeError(
                f"obj must be a pandas DataFrame when file_type='csv'. {type(obj)!r} passed"
            )
        obj = obj.to_csv(path_or_buf=None, **kwargs)
    elif file_type == "json":
        logger.info(f"Saving obj as a json file. kwargs passed {kwargs!r}")
        obj = json.dumps(obj, **kwargs)
    elif file_type == "parquet":
        if not isinstance(obj, pd.DataFrame):
            raise TypeError(
                f"Saving to parquet currently only supports a pandas DataFrame. {type(obj)!r} passed"
            )
        from ._parquet import save_parquet
        return save_parquet(obj, path, fs=fs, **kwargs)
    else:
        raise ValueError(f"file_type={file_type!r} is not supported")

    # Save file to appropriate system
    if s3.is_s3path(path):
        logger.info("Saving object to S3")
        s3.save_object(obj, path, overwrite, fs, acl)
    else:
        logger.info("Saving object to local")
        path = local._norm_path(path)
        if isinstance(obj, (bytes, bytearray)):
            mode = "wb"
        else:
            mode = "w"
        with open(path, mode) as f:
            f.write(obj)
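
# Illustrative sketch (not part of the original module): saving a few different file types.
# When file_type is omitted it is inferred from the extension (assuming _file_type_helper
# recognizes .json and .pkl); all paths below are hypothetical.
def _example_save_object() -> None:
    save_object({"a": 1, "b": 2}, "/tmp/params.json")                 # inferred as json
    save_object({"a": 1}, "s3://example-bucket/models/params.pkl",    # inferred as pickle
                overwrite=False)
    df = pd.DataFrame({"x": [1, 2, 3]})
    save_object(df, "/tmp/data.csv", file_type="csv", index=False)    # kwargs go to obj.to_csv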
def load_object(path: str, file_type: Optional[str] = None, **kwargs) -> Any:
    """ Load a file into memory

    Parameters
    -----------
    path : str
        Path to the file. If file_type is not specified, an attempt will be
        made to infer the file_type based on the extension.
    file_type : str
        Type of file to load. Supported options are currently:
            "pickle"
                kwargs are passed to pickle.loads
            "raw"
            "csv"
                Load a CSV file into a pandas DataFrame.
                Additional kwargs are passed to pd.read_csv
            "json"
                kwargs are passed to json.loads
            "parquet"
                Load a parquet dataset in as a pandas DataFrame.
                Additional kwargs are passed to _parquet.load_parquet(). See
                that function for more information
                NOTE: This functionality is still in beta
    kwargs : Dict
        fs : s3fs.S3FileSystem
            Will be passed to s3.load_object if path is an s3path

    Returns
    --------
    Any : Depends on the file_type specified
    """
    # Pop fs from kwargs
    fs = kwargs.pop("fs", None)

    if file_type is None:
        file_type = _file_type_helper(path)

    if file_type == "parquet":
        from ._parquet import load_parquet
        return load_parquet(path, fs=fs, **kwargs)

    if s3.is_s3path(path):
        logger.info(f"Loading {path!r} from S3")
        data_file = s3.load_object(path, fs)
    else:
        path = local._norm_path(path)
        logger.info(f"Loading {path!r} from local directory")
        data_file = open(path, "rb")

    if file_type == "pickle":
        logger.info(
            f"Loading file as a 'pickle' object. kwargs passed {kwargs!r}")
        data_read = data_file.read()
        obj = pickle.loads(data_read, **kwargs)
    elif file_type == "raw":
        logger.info("Loading file as a 'raw' object")
        obj = data_file.read()
    elif file_type == "csv":
        logger.info("Loading file as a 'csv' object")
        import pandas as pd
        obj = pd.read_csv(data_file, **kwargs)
    elif file_type == "json":
        logger.info(
            f"Loading file as a 'json' object. kwargs passed {kwargs!r}")
        obj = json.load(data_file, **kwargs)
    else:
        if hasattr(data_file, "close"):
            logger.info(f"Closing data_file {data_file!r}")
            data_file.close()
        raise ValueError(f"File type {file_type!r} is not supported")

    if hasattr(data_file, "close"):
        logger.info(f"Closing data_file {data_file!r}")
        data_file.close()

    return obj
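
# Illustrative sketch (not part of the original module): loading the objects written in the
# save_object sketch above. file_type is again inferred from the extension (assuming
# _file_type_helper recognizes it); kwargs for "csv" are handed to pd.read_csv. Paths are
# hypothetical.
def _example_load_object() -> None:
    params = load_object("s3://example-bucket/models/params.pkl")      # unpickled dict
    config = load_object("/tmp/params.json")                           # parsed json
    df = load_object("/tmp/data.csv", file_type="csv", usecols=["x"])  # pandas DataFrame
    print(params, config, df.shape)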