Example #1
def ls(path: str,
       full_path: bool = False,
       recursive: bool = False,
       **kwargs) -> List[str]:
    """ List the contents of a local/s3 directory

    Parameters
    -----------
    path : str
        Local or S3 Path

    full_path : bool
        Include the full path, or just the path relative to `path`

    recursive : bool
        Recursively list within the given path

    kwargs : Dict
        If path is an s3 path, fs: s3fs.S3FileSystem can be specified

    Returns
    --------
    List[str]
    """
    if s3.is_s3path(path):
        return s3.ls(path, full_path, recursive, **kwargs)
    else:
        return local.ls(path, full_path, recursive)
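
A minimal usage sketch (not part of the original source): it assumes `ls` is importable from the utility module these snippets come from, imported here under the hypothetical name `futils`, and that an existing `s3fs.S3FileSystem` can be reused via the `fs` keyword for S3 paths.

import s3fs
import futils  # hypothetical module name; the snippets do not name their package

# Local directory: names are returned relative to the given path
local_files = futils.ls("/tmp/data")

# S3 prefix: recurse and return full paths, reusing an existing filesystem object
fs = s3fs.S3FileSystem()
s3_files = futils.ls("s3://my-bucket/some/prefix", full_path=True, recursive=True, fs=fs)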
Example #2
def save_parquet_fp(df: pd.DataFrame, path: str, **kwargs) -> None:
    """ Helper function to save a DataFrame to a parquet DataSet

    See the [fastparquet Docs](https://fastparquet.readthedocs.io/en/latest/api.html) for more information

    Parameters
    -----------
    df : pd.DataFrame
        The DataFrame to export to parquet

    path : str
        The root path to save the DataFrame to; this can be either S3 or local

    Additional Parameters
    ----------------------
    The following parameters are optional and can tweak how the DataFrame gets
    converted to parquet.

    fs : s3fs.S3FileSystem
        This will be used to save the data to S3 if applicable

    file_scheme: "simple"|"hive" (default "hive")
        If simple: all goes in a single file
        If hive: each row group is in a separate file, and a separate file
        (called "_metadata") contains the metadata.

    write_index: bool
        Whether or not to write the index to a separate column.  By default we
        write the index *if* it is not 0, 1, ..., n.

    partition_on: List[str]
        Passed to groupby in order to split data within each row-group,
        producing a structured directory tree. Note: as with pandas, null
        values will be dropped. Ignored if file_scheme is simple.

    See [fastparquet.write](https://fastparquet.readthedocs.io/en/latest/api.html#fastparquet.write)
    documentation for full details.

    Returns
    --------
    None
    """
    import fastparquet as fp

    fs = kwargs.pop("fs", None)
    file_scheme = kwargs.pop("file_scheme", "hive")

    if s3.is_s3path(path):
        fs = fs or s3fs.S3FileSystem()
        myopen = fs.open
    else:
        myopen = open

    logger.info("Writing Dataframe to Parquet using fastparquet")

    fp.write(path, df, file_scheme=file_scheme, open_with=myopen, **kwargs)

    logger.info("Done.")
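
A hedged usage sketch, assuming a DataFrame already in memory and the same hypothetical `futils` module name; `partition_on` and any other extra keywords are forwarded to `fastparquet.write`.

import pandas as pd
import futils  # hypothetical module name

df = pd.DataFrame({"region": ["us", "eu", "us"], "value": [1, 2, 3]})

# Hive-style dataset with one sub-directory per distinct "region" value
futils.save_parquet_fp(df, "s3://my-bucket/datasets/example",
                       file_scheme="hive", partition_on=["region"])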
Example #3
def cp(from_path: str,
       to_path: str,
       overwrite: bool = True,
       include_folder_name: bool = True,
       **kwargs) -> None:
    """ Copy a file or directory of files from local/s3 to local/s3

    Parameters
    -----------
    from_path : str
        Directory/file path to copy

    to_path : str
        Path to copy file(s) to.

    overwrite : bool (default True)
        Should the to_path be overwritten if it already exists?

    include_folder_name : bool (default True)
        If copying a directory, add the directory name automatically to the
        to_path.  i.e. if True, the entire folder will be copied to the
        to_path. If False, the *contents* of the directory will be copied to
        the to_path

    kwargs : Dict
        Extra arguments to pass to the appropriate cp (either _local.cp or
        _s3.cp)

    Returns
    --------
    None
    """
    if s3.is_s3path(from_path) or s3.is_s3path(to_path):
        s3.cp(from_path, to_path, overwrite, include_folder_name, **kwargs)
    else:
        local.cp(from_path, to_path, overwrite, include_folder_name)
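
A short, hypothetical call (module name `futils` assumed): because `include_folder_name=False`, only the contents of the source directory land under the destination prefix.

import futils  # hypothetical module name

# Copy the contents of a local folder up to S3, replacing anything already there
futils.cp("/tmp/reports", "s3://my-bucket/backups/",
          overwrite=True, include_folder_name=False)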
Example #4
def already_exists(path: str, **kwargs) -> bool:
    """ Check if a file/directory already exists

    Parameters
    -----------
    path : str
        File / Directory path

    kwargs : Dict
        If path is an s3 path, fs: s3fs.S3FileSystem can be optionally specified

    Returns
    --------
    bool
    """
    if s3.is_s3path(path):
        return s3.already_exists(path, **kwargs)
    else:
        return local.already_exists(path)
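
A brief usage sketch under the same hypothetical `futils` import; the call works identically for local paths.

import futils  # hypothetical module name

if not futils.already_exists("s3://my-bucket/models/latest.pkl"):
    print("nothing saved yet")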
Example #5
def load_parquet_fp(path: str, **kwargs) -> pd.DataFrame:
    """ Helper function to load a parquet Dataset as a Pandas DataFrame using
        fastparquet

    First creates a [ParquetFile](https://fastparquet.readthedocs.io/en/latest/api.html#fastparquet.ParquetFile)
    and then converts the ParquetFile to a DataFrame using .to_pandas.
    Refer to the fastparquet documentation for accepted arguments

    Parameters
    -----------
    path : str
        The root directory of the Parquet Dataset stored locally or in S3

    Returns
    --------
    pd.DataFrame
    """
    import fastparquet as fp

    logger.info(
        f"Reading in Parquet dataset to ParquetFile. kwargs passed {kwargs!r}")

    fs = kwargs.pop("fs", None)

    # Pull out arguments that should be directed to to_pandas
    to_pandas_args = parse_args(fp, ["ParquetFile", "to_pandas"], **kwargs)
    # Remove these args from kwargs
    kwargs = {
        k: v
        for k, v in kwargs.items() if k in set(kwargs) - set(to_pandas_args)
    }

    if s3.is_s3path(path):
        fs = fs or s3fs.S3FileSystem()
        myopen = fs.open
    else:
        myopen = open

    pf = fp.ParquetFile(path, open_with=myopen, **kwargs)

    df = pf.to_pandas(**to_pandas_args)
    return df
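
A sketch of a typical call, again assuming the hypothetical `futils` module; per the implementation above, the `parse_args` helper is expected to route `columns` to `ParquetFile.to_pandas`, while any remaining keywords go to the `ParquetFile` constructor.

import futils  # hypothetical module name

# Load only two columns of the dataset into a DataFrame
df = futils.load_parquet_fp("s3://my-bucket/datasets/example",
                            columns=["region", "value"])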
Example #6
def get_size(path: str, **kwargs) -> int:
    """ Return size of file/directory in bytes

    Parameters
    -----------
    path : str
        File / Directory path

    kwargs : Dict
        If path is an s3 path, fs: s3fs.S3FileSystem can be optionally specified

    Returns
    --------
    int
    """
    fs = kwargs.pop("fs", None)
    if s3.is_s3path(path):
        return s3.get_size(path, fs)
    else:
        return local.get_size(path)
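
A minimal example (hypothetical `futils` import) converting the returned byte count to mebibytes.

import futils  # hypothetical module name

size_mib = futils.get_size("/var/log/app.log") / 1024 ** 2
print(f"{size_mib:.1f} MiB")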
Example #7
def rm(path: str, dry_run: bool = False, **kwargs) -> None:
    """ Deletes a file or directory

    Parameters
    -----------
    path : str
        File path to delete

    dry_run : bool
        Print out the number of files to be deleted and exit. If False, the
        number of files to be deleted will be logged and the files will be
        removed

    kwargs : Dict
        If path is an s3 path, fs: s3fs.S3FileSystem can be specified

    Returns
    --------
    None
    """
    if s3.is_s3path(path):
        s3.rm(path, dry_run, **kwargs)
    else:
        local.rm(path, dry_run)
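
A cautious usage pattern (hypothetical `futils` import): inspect the dry-run output first, then repeat the call with `dry_run=False` to actually delete.

import futils  # hypothetical module name

futils.rm("s3://my-bucket/tmp/old-run/", dry_run=True)   # report only
futils.rm("s3://my-bucket/tmp/old-run/", dry_run=False)  # log and delete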
Example #8
def test_is_s3path(tmpdir):
    path = "s3://airdna-data/scratch/ewellinger"
    assert s3.is_s3path(path)
    assert not s3.is_s3path(tmpdir)
Example #9
def save_parquet_pa(df: pd.DataFrame, path: str, **kwargs) -> None:
    """ Helper function to save a DataFrame to a parquet DataSet

    See the [PyArrow Docs](https://arrow.apache.org/docs/python/index.html) for more information

    Parameters
    -----------
    df : pd.DataFrame
        The DataFrame to export to parquet

    path : str
        The root path to save the DataFrame to; this can be either S3 or local

    Additional Parameters
    ----------------------
    The following parameters are optional and can tweak how the DataFrame gets
    converted to parquet.

    fs : s3fs.S3FileSystem
        This will be used to save the data to S3 if applicable

    schema : pyarrow.Schema
        Passed to pyarrow.Table.from_pandas()
        The expected schema of the Arrow Table. This can be used to indicate
        the type of columns if we cannot infer it automatically.

    preserve_index : bool (default False)
        Passed to pyarrow.Table.from_pandas()
        Whether to store the index as an additional column in the resulting Table

    nthreads : int
        Passed to pyarrow.Table.from_pandas()
        If greater than 1, convert columns to Arrow in parallel using indicated
        number of threads

    columns : List[str]
        Passed to pyarrow.Table.from_pandas()
        List of columns to be converted. Uses all columns by default

    partition_cols : List[str]
        Passed to pyarrow.parquet.write_to_dataset()
        Column names by which to partition the dataset
        Columns are partitioned in the order that they are given

    Returns
    --------
    None
    """
    import pyarrow as pa
    import pyarrow.parquet as pq

    logger.info(
        f"Converting dataframe to PyArrow Table. kwargs passed {kwargs!r}")

    fs = kwargs.pop("fs", None)
    schema = kwargs.pop("schema", None)
    preserve_index = kwargs.pop("preserve_index", False)
    nthreads = kwargs.pop("nthreads", None)
    columns = kwargs.pop("columns", None)
    partition_cols = kwargs.pop("partition_cols", None)

    # Convert the dataframe into a pyArrow Table object
    table = pa.Table.from_pandas(df,
                                 schema=schema,
                                 preserve_index=preserve_index,
                                 nthreads=nthreads,
                                 columns=columns)

    if not s3.is_s3path(path):
        fs = None
    elif fs is None:
        fs = s3fs.S3FileSystem()

    logger.info("Writing Arrow Table to Parquet Dataset")

    pq.write_to_dataset(table,
                        path,
                        partition_cols=partition_cols,
                        filesystem=fs,
                        preserve_index=preserve_index)

    logger.info("Done.")
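
A hedged sketch of a call (hypothetical `futils` module): `partition_cols` is forwarded to `pyarrow.parquet.write_to_dataset`, while `schema`, `preserve_index`, `nthreads` and `columns` would be applied to `pyarrow.Table.from_pandas`.

import pandas as pd
import futils  # hypothetical module name

df = pd.DataFrame({"year": [2019, 2020], "listings": [10, 12]})

# One sub-directory per distinct "year" value
futils.save_parquet_pa(df, "s3://my-bucket/datasets/listings",
                       partition_cols=["year"])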
Example #10
def load_parquet_pa(path: str, **kwargs) -> pd.DataFrame:
    """ Helper function to load a parquet Dataset as a Pandas DataFrame

    Parameters
    -----------
    path : str
        The root directory of the Parquet Dataset stored locally or in S3

    Additional Parameters
    ----------------------
    The following parameters are optional and can tweak how the Dataset gets
    converted back to a DataFrame

    split_row_groups : bool (default False)
        Passed to pyarrow.parquet.ParquetDataset()
        Divide files into pieces for each row group in the file

    filters : List[Tuple]
        Passed to pyarrow.parquet.ParquetDataset()
        List of filters to apply, like `[('x', '=', 0), ...]`. This implements
        partition-level (hive) filtering only, i.e., to prevent the loading of
        some files of the dataset.

    columns : List[str]
        Passed to pyarrow.parquet.ParquetDataset().read()
        Names of columns to read from the dataset

    Any additional kwargs are passed to pyarrow.Table.to_pandas().
    See [documentation](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html?highlight=table#pyarrow.Table.to_pandas) for more information

    Returns
    --------
    pd.DataFrame
    """
    import pyarrow.parquet as pq

    logger.info(
        f"Reading in Parquet dataset to PyArrow Table. kwargs passed {kwargs!r}"
    )

    fs = kwargs.pop("fs", None)
    split_row_groups = kwargs.pop("split_row_groups", False)
    filters = kwargs.pop("filters", None)
    columns = kwargs.pop("columns", None)

    if not s3.is_s3path(path):
        fs = None
    elif fs is None:
        fs = s3fs.S3FileSystem()

    dataset = pq.ParquetDataset(path,
                                filesystem=fs,
                                split_row_groups=split_row_groups,
                                filters=filters)

    table = dataset.read(columns=columns)

    logger.info(
        f"Converting PyArrow Table to Pandas DataFrame. kwargs passed {kwargs!r}"
    )

    return table.to_pandas(**kwargs)
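
A usage sketch with the hypothetical `futils` import: `filters` prunes hive partitions before any file is read, `columns` limits what `dataset.read` materialises, and any leftover keywords are passed through to `Table.to_pandas`.

import futils  # hypothetical module name

df = futils.load_parquet_pa(
    "s3://my-bucket/datasets/listings",
    filters=[("year", "=", 2020)],
    columns=["listings"],
)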
Example #11
def save_object(obj: object,
                path: str,
                file_type: Optional[str] = None,
                overwrite: bool = True,
                protocol: int = pickle.HIGHEST_PROTOCOL,
                **kwargs) -> None:
    """ Save an object in memory to a file

    Parameters
    -----------
    obj : object
        Python object in memory

    path : str
        Local or S3 path to save file. If file_type is not specified, an
        attempt will be made to infer the file_type based on the extension.

    file_type : str
        Type of file to save.
        Supported options are currently:
            "pickle"
                Additional kwargs are passed to pickle.dumps
            "raw"
            "csv"
                Save a pandas DataFrame as a CSV file.  Additional kwargs are
                passed to obj.to_csv
                NOTE: A TypeError will be thrown if "csv" is specified and obj
                is not a pandas DataFrame
            "json"
                Additional kwargs are passed to json.dumps
            "parquet"
                Save a pandas DataFrame to a parquet dataset. Additional kwargs
                are passed to the _save_parquet helper function and are applied
                to either pa.Table.from_pandas() or pq.write_to_dataset()
                depending on the argument.
                NOTE: This functionality is still in beta and currently only works with a pandas dataframe as input.

    overwrite : bool
        Should the file be overwritten if it already exists?

    protocol : int
        Used when calling pickle

    kwargs : Dict
        The following extra parameters can be passed:
            fs : s3fs.S3FileSystem
                Used when the path is an s3 path
            acl : str
                Used to set the Access Control List settings when writing to S3

    Returns
    --------
    None
    """
    fs = kwargs.pop("fs", None)
    acl = kwargs.pop("acl", "bucket-owner-full-control")

    # Check to see if path already exists
    if not overwrite and already_exists(path, fs=fs):
        raise ValueError(f"overwrite set to False and {path!r} already exists")

    if file_type is None:
        file_type = _file_type_helper(path)

    if file_type == "pickle":
        logger.info(f"Saving obj as a pickle file. kwargs passed {kwargs!r}")
        obj = pickle.dumps(obj, protocol=protocol, **kwargs)
    elif file_type == "raw":
        logger.info(f"Saving obj as a raw file.")
        pass
    elif file_type == "csv":
        logger.info(f"Saving obj as a CSV file. kwargs passed {kwargs!r}")
        if not isinstance(obj, pd.DataFrame):
            raise TypeError(
                f"obj must be a pandas DataFrame when file_type='csv'. {type(obj)!r} passed"
            )
        obj = obj.to_csv(path_or_buf=None, **kwargs)
    elif file_type == "json":
        logger.info(f"Saving obj as a json file. kwargs passed {kwargs!r}")
        obj = json.dumps(obj, **kwargs)
    elif file_type == "parquet":
        if not isinstance(obj, pd.DataFrame):
            raise TypeError(
                f"Saving to parquet currently only supports a pandas DataFrame. {type(obj)!r} passed"
            )
        from ._parquet import save_parquet
        return save_parquet(obj, path, fs=fs, **kwargs)
    else:
        raise ValueError(f"file_type={file_type!r} is not supported")

    # Save file to appropriate system
    if s3.is_s3path(path):
        logger.info("Saving object to S3")
        s3.save_object(obj, path, overwrite, fs, acl)
    else:
        logger.info("Saving object to local")
        path = local._norm_path(path)
        if isinstance(obj, (bytes, bytearray)):
            mode = "wb"
        else:
            mode = "w"

        with open(path, mode) as f:
            f.write(obj)
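
A small illustrative call (hypothetical `futils` module): with no explicit `file_type`, the ".json" extension is inferred and the extra `indent` keyword is forwarded to `json.dumps`.

import futils  # hypothetical module name

config = {"model": "xgboost", "max_depth": 6}
futils.save_object(config, "s3://my-bucket/configs/run.json", indent=2)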
Example #12
def load_object(path: str, file_type: Optional[str] = None, **kwargs) -> Any:
    """ Load a file into memory

    Parameters
    -----------
    path : str
        Path to the file. If file_type is not specified, an attempt will be
        made to infer the file_type based on the extension.

    file_type : str
        Type of file to load.  Supported options are currently:
            "pickle"
                kwargs are passed to pickle.loads
            "raw"
            "csv"
                Load a CSV file into a pandas DataFrame. Additional kwargs are
                passed to pd.read_csv
            "json"
                kwargs are passed to json.loads
            "parquet"
                Load a parquet dataset in as a pandas DataFrame. Additional
                kwargs are passed to _parquet.load_parquet(). See that function
                for more information
                NOTE: This functionality is still in beta

    kwargs : Dict
        fs : s3fs.S3FileSystem
            Will be passed to s3.load_object if path is an s3path

    Returns
    --------
    Any : Depends on the file_type specified
    """
    # Pop fs from kwargs
    fs = kwargs.pop("fs", None)

    if file_type is None:
        file_type = _file_type_helper(path)

    if file_type == "parquet":
        from ._parquet import load_parquet
        return load_parquet(path, fs=fs, **kwargs)

    if s3.is_s3path(path):
        logger.info(f"Loading {path!r} from S3")
        data_file = s3.load_object(path, fs)
    else:
        path = local._norm_path(path)
        logger.info(f"Loading {path!r} from local directory")
        data_file = open(path, "rb")

    if file_type == "pickle":
        logger.info(
            f"Loading file as a 'pickle' object. kwargs passed {kwargs!r}")
        data_read = data_file.read()
        obj = pickle.loads(data_read, **kwargs)
    elif file_type == "raw":
        logger.info("Loading file as a 'raw' object")
        obj = data_file.read()
    elif file_type == "csv":
        logger.info("Loading file as a 'csv' object")
        import pandas as pd
        obj = pd.read_csv(data_file, **kwargs)
    elif file_type == "json":
        logger.info(
            f"loading file as a 'json' object. kwargs passed {kwargs!r}")
        obj = json.load(data_file, **kwargs)
    else:
        if hasattr(data_file, "close"):
            logger.info(f"Closing data_file {data_file!r}")
            data_file.close()
        raise ValueError(f"File type {file_type!r} is not supported")

    if hasattr(data_file, "close"):
        logger.info(f"Closing data_file {data_file!r}")
        data_file.close()

    return obj
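
The matching load side, again under the hypothetical `futils` import: the ".json" extension selects the json branch, so the object saved above comes back as a dict.

import futils  # hypothetical module name

config = futils.load_object("s3://my-bucket/configs/run.json")
assert config["model"] == "xgboost"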