Example #1
# Imports needed to run these excerpts. (s3_path_utils, get_storage_fn,
# get_s3_client_kwargs, and inform are internal helpers from the same
# library and are assumed to be importable alongside these functions.)
import logging
from tempfile import NamedTemporaryFile

import boto3
def read(path, bucket=None, show_progressbar=True, *args, **kwargs):
    """
    Downloads an object from S3 and reads it into the Python session.
    Storage format is determined by file extension, to prevent
    extension-less files in S3.

    Args:
        path (str): The path of the file to read from in S3
        bucket (str, optional): The S3 bucket to search for the object in
        show_progressbar (bool, default True): Whether to show a progress bar
    Returns:
        object: The object downloaded from S3
    """
    path = s3_path_utils.clean_path(path)
    bucket = bucket or s3_path_utils.get_default_bucket()
    bucket = s3_path_utils.clean_bucket(bucket)

    filetype = s3_path_utils.get_filetype(path)
    read_fn = get_storage_fn(filetype, 'read')

    s3 = boto3.client('s3')
    s3_kwargs = get_s3_client_kwargs(path,
                                     bucket,
                                     operation='read',
                                     show_progressbar=show_progressbar)

    with NamedTemporaryFile(suffix='.' + filetype) as tmpfile:
        inform('Downloading from s3://{}/{}...'.format(bucket, path))
        s3.download_file(bucket, path, tmpfile.name, **s3_kwargs)
        inform('Reading from tempfile...')
        obj = read_fn(tmpfile, *args, **kwargs)
    return obj
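A minimal usage sketch (hypothetical key and bucket names, not from the library source). It assumes AWS credentials and a default bucket are configured; the '.csv' extension selects the matching storage reader, and extra arguments are forwarded to it.

df = read('analytics/users.csv')                      # uses the default bucket
df = read('analytics/users.csv', bucket='my-bucket')  # explicit bucket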
Example #2
def write(obj, path, bucket=None, show_progressbar=True, *args, **kwargs):
    """
    Writes an object to a specified file format and uploads it to S3.
    Storage format is determined by file extension, to prevent
    extension-less files in S3.

    Args:
        obj (object): The object to be uploaded to S3
        path (str): The path to save obj to
        bucket (str, optional): The S3 bucket to save 'obj' in
        show_progressbar (bool, default True): Whether to show a progress bar
    Returns:
        str: The full path to the object in S3, without the 's3://' prefix
    """
    path = s3_path_utils.clean_path(path)
    bucket = bucket or s3_path_utils.get_default_bucket()
    bucket = s3_path_utils.clean_bucket(bucket)

    filetype = s3_path_utils.get_filetype(path)
    write_fn = get_storage_fn(filetype, 'write')

    s3 = boto3.client('s3')

    with NamedTemporaryFile(suffix='.' + filetype) as tmpfile:
        inform('Writing object to tempfile...')
        write_fn(obj, tmpfile, *args, **kwargs)
        s3_kwargs = get_s3_client_kwargs(tmpfile.name,
                                         bucket,
                                         operation='write',
                                         show_progressbar=show_progressbar)
        inform('Uploading to s3://{}/{}...'.format(bucket, path))
        s3.upload_file(tmpfile.name, bucket, path, **s3_kwargs)

    return '/'.join([bucket, path])
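A companion sketch for 'write' (hypothetical names, assuming the '.csv' storage function serializes a pandas DataFrame): the returned value is the bucket-qualified key.

import pandas as pd

df = pd.DataFrame({'id': [1, 2], 'name': ['a', 'b']})
s3_path = write(df, 'analytics/users.csv', bucket='my-bucket')
# s3_path == 'my-bucket/analytics/users.csv' (no 's3://' prefix)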
Example #3
def read_badpractice(path,
                     bucket=None,
                     filetype=None,
                     show_progressbar=True,
                     *args,
                     **kwargs):
    """
    Downloads an object from S3 and reads it into the Python session,
    without following the rules of the normal reading function.
    Storage format is determined by file extension, or as specified if the
    object is missing one.

    Although this tool aims to enforce good practice, it is sometimes
    necessary to work with other parties who do not follow the same
    conventions, and this function allows users to still read data from
    those parties. Use of this function in production-level code is
    strongly discouraged.

    Args:
        path (str): The path of the file to read from in S3
        bucket (str, optional): The S3 bucket to search for the object in
        filetype (str, optional):
            The filetype of the file being read. Can be used if a file was
            saved without a proper extension.
        show_progressbar (bool, default True): Whether to show a progress bar
    Returns:
        object: The object downloaded from S3
    """
    logging.warning('You are using river\'s read function that allows for '
                    'files stored with inadvisable S3 paths. It is highly '
                    'recommended that you use the standard \'read\' '
                    'function to ensure that good naming practices are '
                    'followed.')

    bucket = bucket or s3_path_utils.get_default_bucket()

    if filetype is None:
        filetype = s3_path_utils.get_filetype(path)

    read_fn = get_storage_fn(filetype, 'read')

    s3 = boto3.client('s3')
    s3_kwargs = get_s3_client_kwargs(path,
                                     bucket,
                                     operation='read',
                                     show_progressbar=show_progressbar)

    with NamedTemporaryFile(suffix='.' + filetype) as tmpfile:
        inform('Downloading from s3://{}/{}...'.format(bucket, path))
        s3.download_file(bucket, path, tmpfile.name, **s3_kwargs)
        inform('Reading object from tempfile...')
        obj = read_fn(tmpfile, *args, **kwargs)
    return obj
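A sketch of this escape hatch (hypothetical key): an object stored without an extension can still be read by naming its format explicitly.

# 'legacy/data_dump' stands in for a third-party key with no extension;
# 'filetype' overrides extension-based format detection.
obj = read_badpractice('legacy/data_dump', filetype='csv')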
Example #4
def upload_file(local_file_path, path, bucket=None, show_progressbar=True):
    """
    Uploads a file from local storage directly to S3

    Args:
        local_file_path (str): Location of the file to upload
        path (str): The key the file is to be stored under in S3
        bucket (str, optional): The S3 bucket to store the object in
        show_progressbar (bool, default True): Whether to show a progress bar
    """
    bucket = bucket or s3_path_utils.get_default_bucket()
    if local_file_path is None:
        raise ValueError('A local file location must be provided.')

    s3 = boto3.client('s3')
    s3_kwargs = get_s3_client_kwargs(local_file_path,
                                     bucket,
                                     operation='write',
                                     show_progressbar=show_progressbar)

    s3.upload_file(local_file_path, bucket, path, **s3_kwargs)
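A sketch (hypothetical paths): uploading an existing local file verbatim, with no serialization step.

# Both paths are placeholders for illustration only.
upload_file('/tmp/report.pdf', 'reports/2020/report.pdf', bucket='my-bucket')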
Example #5
def copy(source_path,
         dest_path,
         source_bucket=None,
         dest_bucket=None,
         show_progressbar=True,
         clean_source_path=True):
    """
    Copy an object from one S3 location into another.

    Args:
        source_path (str): Path of file to copy
        dest_path (str): Path to copy file to
        source_bucket (str, optional): Bucket of the file to copy
        dest_bucket (str, optional): Bucket to copy the file to
        show_progressbar (bool, default True): Whether to show a progress bar
        clean_source_path (bool, default True):
            Whether to clean 'source_path' before copying
    """
    source_bucket = source_bucket or s3_path_utils.get_default_bucket()
    dest_bucket = dest_bucket or s3_path_utils.get_default_bucket()

    if clean_source_path:
        source_path = s3_path_utils.clean_path(source_path)
    dest_path = s3_path_utils.clean_path(dest_path)

    s3 = boto3.client('s3')
    s3_kwargs = get_s3_client_kwargs(source_path,
                                     source_bucket,
                                     operation='copy',
                                     show_progressbar=show_progressbar)

    copy_source = {'Bucket': source_bucket, 'Key': source_path}

    inform("Copying object from s3://{}/{} to s3://{}/{}".format(
        source_bucket, source_path, dest_bucket, dest_path))
    s3.copy(CopySource=copy_source,
            Bucket=dest_bucket,
            Key=dest_path,
            **s3_kwargs)
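A sketch (hypothetical buckets and keys) of a copy between buckets, performed by S3 itself rather than by downloading and re-uploading locally.

# All bucket and key names here are placeholders.
copy('raw/users.csv', 'archive/users.csv',
     source_bucket='ingest-bucket', dest_bucket='archive-bucket')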
Example #6
def download_file(path,
                  bucket=None,
                  local_file_path=None,
                  show_progressbar=True):
    """
    Downloads a file from S3 directly to local storage

    Args:
        path (str): The key the file is under in S3
        bucket (str, optional): The S3 bucket to search for the object in
        local_file_path (str): Where to download the file to locally
        show_progressbar (bool, default True): Whether to show a progress bar
    """
    bucket = bucket or s3_path_utils.get_default_bucket()
    if local_file_path is None:
        raise ValueError('A local file path must be provided.')

    s3 = boto3.client('s3')
    s3_kwargs = get_s3_client_kwargs(path,
                                     bucket,
                                     operation='read',
                                     show_progressbar=show_progressbar)

    s3.download_file(bucket, path, local_file_path, **s3_kwargs)
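A sketch mirroring 'upload_file' (hypothetical paths): fetching an object straight to local disk without deserializing it.

# Both paths are placeholders for illustration only.
download_file('reports/2020/report.pdf',
              bucket='my-bucket',
              local_file_path='/tmp/report.pdf')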
Example #7
def read_df_in_chunks(path,
                      bucket=None,
                      chunk_size=10000,
                      names=None,
                      show_progressbar=True,
                      *args,
                      **kwargs):
    """
    Downloads a DataFrame from S3 and reads it into the Python session in
    chunks of a specified number of rows.
    Because chunking is accomplished by splitting on rows, only text files
    (CSVs/PSVs) and compressed text files are currently supported.
    Support for reading Avro files in chunks may be added at a later date.
    Storage format is determined by file extension, to prevent
    extension-less files in S3.

    Args:
        path (str): The path of the file to read from in S3
        bucket (str, optional): The S3 bucket to search for the object in
        chunk_size (int, default 10000):
            The number of rows to read for each chunk
        names (list<str>, optional):
            Column names to apply to the chunk DataFrames. If left 'None',
            column names will be inferred by pandas.
        show_progressbar (bool, default True): Whether to show a progress bar
    Yields:
        df (pd.DataFrame):
            The chunk DataFrames read from the file downloaded from S3
    """
    row_chunkable_filetypes = [
        'csv', 'csv.gz', 'csv.zip', 'csv.bz2', 'csv.xz', 'psv', 'psv.gz',
        'psv.zip', 'psv.bz2', 'psv.xz'
    ]

    path = s3_path_utils.clean_path(path)
    bucket = bucket or s3_path_utils.get_default_bucket()
    bucket = s3_path_utils.clean_bucket(bucket)

    filetype = s3_path_utils.get_filetype(path)
    if filetype not in row_chunkable_filetypes:
        raise IOError(
            'Reading files in chunks is only supported with the following '
            'formats: ' + ', '.join(row_chunkable_filetypes))
    read_fn = get_storage_fn(filetype, 'read')

    s3 = boto3.client('s3')
    s3_kwargs = get_s3_client_kwargs(path,
                                     bucket,
                                     operation='read',
                                     show_progressbar=show_progressbar)

    with NamedTemporaryFile(suffix='.' + filetype) as tmpfile:
        inform('Downloading from s3://{}/{}...'.format(bucket, path))
        s3.download_file(bucket, path, tmpfile.name, **s3_kwargs)
        df_chunker = read_fn(tmpfile,
                             *args,
                             names=names,
                             chunksize=chunk_size,
                             **kwargs)

        chunk_number = 0
        for chunk in df_chunker:
            chunk_number += 1
            inform('Reading from tempfile (chunk #{})...'.format(chunk_number))
            yield chunk
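A sketch (hypothetical key): because the function is a generator, chunks can be consumed lazily and aggregated without holding the whole file in memory.

# 'logs/events.csv' is a placeholder key in the default bucket.
total_rows = 0
for chunk in read_df_in_chunks('logs/events.csv', chunk_size=50000):
    total_rows += len(chunk)  # any per-chunk processing goes here
print(total_rows)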