Example #1
    def test_alias_max_io_queue(self):
        ref_value = 10
        config = TransferConfig(max_io_queue=ref_value)
        self.assert_value_of_actual_and_alias(
            config, 'max_io_queue_size', 'max_io_queue', ref_value)

        # Set a new value using the alias
        new_value = 15
        config.max_io_queue = new_value
        # Make sure it sets the value for both the alias and the actual
        # value that will be used in the TransferManager
        self.assert_value_of_actual_and_alias(
            config, 'max_io_queue_size', 'max_io_queue', new_value)
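
A standalone sketch of the same aliasing behaviour (assuming a boto3 version where max_io_queue is an alias for max_io_queue_size, which is exactly what the test above asserts):

from boto3.s3.transfer import TransferConfig

config = TransferConfig(max_io_queue=10)
# The alias and the attribute actually used by the TransferManager agree.
assert config.max_io_queue == 10
assert config.max_io_queue_size == 10

config.max_io_queue = 15               # setting through the alias...
assert config.max_io_queue_size == 15  # ...updates the underlying value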
Example #2
    return True

def check_head_object_works_for_client(s3_client, params):
    try:
        s3_client.head_object(**params)
    except ClientError as e:
        if e.response["Error"]["Code"] == "403":
            # This can also happen if you have full get_object access, but not list_objects_v2, and the object does not
            # exist. Instead of returning a 404, S3 will return a 403.
            return False
    return True



s3_transfer_config = TransferConfig()

# When uploading files at least this size, compare the ETags first and skip the upload if they're equal;
# copy the remote file onto itself if the metadata changes.
UPLOAD_ETAG_OPTIMIZATION_THRESHOLD = 1024


def _copy_local_file(ctx, size, src_path, dest_path):
    pathlib.Path(dest_path).parent.mkdir(parents=True, exist_ok=True)

    # TODO(dima): More detailed progress.
    shutil.copyfile(src_path, dest_path)
    ctx.progress(size)
    shutil.copymode(src_path, dest_path)

    ctx.done(PhysicalKey.from_path(dest_path))
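
A rough sketch of the ETag optimization described by the comment above (an assumption-laden illustration: it only holds for single-part, non-KMS uploads, where the S3 ETag equals the object's MD5; the bucket and key names are placeholders):

import hashlib
import boto3

def etags_match(s3_client, bucket, key, local_path):
    # Return True when the local file's MD5 equals the remote single-part ETag,
    # meaning the upload can be skipped.
    with open(local_path, "rb") as f:
        local_md5 = hashlib.md5(f.read()).hexdigest()
    try:
        head = s3_client.head_object(Bucket=bucket, Key=key)
    except s3_client.exceptions.ClientError:
        return False  # object missing or inaccessible; upload it
    return head["ETag"].strip('"') == local_md5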
Example #3
def s3_upload(bucket,
              local_filepath,
              s3_filepath,
              profile_name='default',
              region_name='us-west-2',
              multipart_threshold=8388608,
              multipart_chunksize=8388608):
    """ Uploads a file or collection of files to S3

    Parameters
    ----------
    bucket : str
        name of S3 bucket
    local_filepath : str or list
        path and filename(s) to be uploaded
    s3_filepath : str or list
        path and filename(s) within the bucket for the file to be uploaded
    region_name : str
        name of AWS region (default value 'us-west-2')
    profile_name : str
        profile name for credentials (default 'default' or organization-specific)
    multipart_threshold : int
        minimum file size to initiate multipart upload
    multipart_chunksize : int
        chunksize for multipart upload

    Returns
    -------
    None

    Example use
    -----------

    # Uploading a single file to S3:
    s3_upload(
        bucket='my_bucket',
        local_filepath='../data/my_file.csv',
        s3_filepath='tmp/my_file.csv')

    # Uploading with a profile name:
    s3_upload(
        bucket='my_bucket',
        profile_name='my-profile-name',
        local_filepath='../data/my_file.csv',
        s3_filepath='tmp/my_file.csv')

    # Uploading a list of files to S3 (will not upload contents of subdirectories):
    s3_upload(
        bucket='my_bucket',
        local_filepath=['../data/my_file1.csv', '../data/my_file2.csv', '../img.png'],
        s3_filepath=['tmp/my_file1.csv', 'tmp/my_file2.csv', 'img.png'])

    # Uploading files matching a pattern to S3 (will not upload contents of subdirectories):
    s3_upload(
        bucket='my_bucket',
        local_filepath='../data/*.csv',
        s3_filepath='tmp/')

    # Uploading all files in a directory to S3 (will not upload contents of subdirectories):
    s3_upload(
        bucket='my_bucket',
        local_filepath='../data/*',
        s3_filepath='tmp/')
    """
    _download_upload_filepath_validator(s3_filepath=s3_filepath,
                                        local_filepath=local_filepath)
    my_bucket = s3_get_bucket(bucket=bucket,
                              profile_name=profile_name,
                              region_name=region_name)
    # multipart_threshold and multipart_chunksize defaults match the Amazon (boto3) defaults
    config = TransferConfig(multipart_threshold=multipart_threshold,
                            multipart_chunksize=multipart_chunksize)
    if isinstance(local_filepath, str):
        if '*' in local_filepath:
            items = glob.glob(local_filepath)
            # filter out directories
            local_filepath = [item for item in items if os.path.isfile(item)]
            tmp_s3_filepath = [
                s3_filepath + f.split('/')[-1] for f in local_filepath
            ]
            s3_filepath = tmp_s3_filepath
        else:
            local_filepath = [local_filepath]
            s3_filepath = [s3_filepath]
    # upload all files to S3
    for local_file, s3_key in zip(local_filepath, s3_filepath):
        try:
            my_bucket.upload_file(local_file, s3_key, Config=config)
        except boto3.exceptions.S3UploadFailedError as e:
            raise S3UploadFailedError(str(e))
    return
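
A hedged call sketch using non-default multipart settings (the bucket and paths are placeholders):

s3_upload(
    bucket='my_bucket',
    local_filepath='../data/big_file.csv',
    s3_filepath='tmp/big_file.csv',
    multipart_threshold=64 * 1024 * 1024,  # only switch to multipart above 64 MB
    multipart_chunksize=16 * 1024 * 1024)  # upload in 16 MB parts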
Example #4
def remote_uri(uri, filename, action):
    """
    :param uri: uri of the container of the file

    :param filename: filename to act on

    :param action: must be one of [`up`, `down`, `list`, `del`]
    """
    if not re.match(r'\w+://\w+.*', uri):
        return uri

    tmp = uri.split('://')
    system = tmp[0]
    location = '://'.join(tmp[1:])

    if action not in ['down', 'up', 'list', 'del']:
        raise AttributeError(
            'remote_uri action attribute must be one of [`up`, `down`, `list`, `del`]'
        )

    if system == 's3':
        import boto3
        from boto3.s3.transfer import TransferConfig
        s3bucket = location.split('/')[0]
        s3connection = boto3.resource('s3')
        s3filename = '/'.join(location.split('/')[1:])

        if action == 'list':
            printd('Listing %s' % (uri), topic='s3')
            files = list(
                map(lambda x: x.key,
                    s3connection.Bucket(s3bucket).objects.all()))
            s3filename = s3filename.strip('/')
            if s3filename:
                files = filter(lambda x: x.startswith(s3filename), files)
            return files

        if action == 'del':
            if filename is None:
                filename = s3filename.split('/')[-1]
            printd('Deleting %s' % uri, topic='s3')
            s3connection.Object(s3bucket, s3filename).delete()

        elif action == 'down':
            if filename is None:
                filename = s3filename.split('/')[-1]
            printd('Downloading %s to %s' % (uri, filename), topic='s3')
            obj = s3connection.Object(s3bucket, s3filename)
            if not os.path.exists(os.path.abspath(os.path.split(filename)[0])):
                os.makedirs(os.path.abspath(os.path.split(filename)[0]))
            obj.download_file(filename,
                              Config=TransferConfig(use_threads=False))

        elif action == 'up':
            printd('Uploading %s to %s' % (filename, uri), topic='s3')
            from botocore.exceptions import ClientError
            if s3filename.endswith('/'):
                s3filename += filename.split('/')[-1]
            try:
                s3connection.meta.client.head_bucket(Bucket=s3bucket)
            except ClientError as _excp:
                # If a client error is thrown, then check that it was a 404 error.
                # If it was a 404 error, then the bucket does not exist.
                error_code = int(_excp.response['Error']['Code'])
                if error_code == 404:
                    s3connection.create_bucket(Bucket=s3bucket)
                else:
                    raise
            bucket = s3connection.Bucket(s3bucket)
            with open(filename, 'rb') as data:
                bucket.put_object(Key=s3filename,
                                  Body=data)  # , Metadata=meta)
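
A usage sketch for remote_uri (the bucket and key names are placeholders):

# Download s3://my-bucket/some/key.txt to ./key.txt (single-threaded transfer)
remote_uri('s3://my-bucket/some/key.txt', 'key.txt', 'down')

# List keys under a prefix
print(list(remote_uri('s3://my-bucket/some/', None, 'list')))

# Upload ./key.txt back under the same prefix
remote_uri('s3://my-bucket/some/', 'key.txt', 'up')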
Example #5
#!/usr/bin/env python

import argparse
import boto3
from boto3.s3.transfer import TransferConfig

parser = argparse.ArgumentParser()
parser.add_argument('-chunksize',
                    type=int,
                    default=1000,
                    help='size of each part in MB')
parser.add_argument('-sourcekey', help='source object location')
parser.add_argument('-targetkey', help='location to copy to')
args = parser.parse_args()

# Convert chunksize from MB to bytes
chunksize = args.chunksize * 1000000
transferConfig = TransferConfig(multipart_threshold=chunksize,
                                multipart_chunksize=chunksize)

s3 = boto3.resource('s3')
copy_source = {'Bucket': 'web-language-models', 'Key': args.sourcekey}
s3.meta.client.copy(copy_source,
                    'web-language-models',
                    args.targetkey,
                    Config=transferConfig)
Example #6
def download_fileobj(self,
                     Bucket,
                     Key,
                     Fileobj,
                     ExtraArgs=None,
                     Callback=None,
                     Config=None):
    """Download an object from S3 to a file-like object.

    The file-like object must be in binary mode.

    This is a managed transfer which will perform a multipart download in
    multiple threads if necessary.

    Usage::

        import boto3
        s3 = boto3.client('s3')

        with open('filename', 'wb') as data:
            s3.download_fileobj('mybucket', 'mykey', data)

    :type Fileobj: a file-like object
    :param Fileobj: A file-like object to download into. At a minimum, it must
        implement the `write` method and must accept bytes.

    :type Bucket: str
    :param Bucket: The name of the bucket to download from.

    :type Key: str
    :param Key: The name of the key to download from.

    :type ExtraArgs: dict
    :param ExtraArgs: Extra arguments that may be passed to the
        client operation.

    :type Callback: function
    :param Callback: A method which takes a number of bytes transferred to
        be periodically called during the download.

    :type Config: boto3.s3.transfer.TransferConfig
    :param Config: The transfer configuration to be used when performing the
        download.
    """
    if not hasattr(Fileobj, 'write'):
        raise ValueError('Fileobj must implement write')

    subscribers = None
    if Callback is not None:
        subscribers = [ProgressCallbackInvoker(Callback)]

    config = Config
    if config is None:
        config = TransferConfig()

    with create_transfer_manager(self, config) as manager:
        future = manager.download(bucket=Bucket,
                                  key=Key,
                                  fileobj=Fileobj,
                                  extra_args=ExtraArgs,
                                  subscribers=subscribers)
        return future.result()
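
A usage sketch combining the optional Callback and Config arguments (bucket and key names are placeholders):

import boto3
from boto3.s3.transfer import TransferConfig

s3 = boto3.client('s3')
config = TransferConfig(multipart_threshold=16 * 1024 * 1024, max_concurrency=4)

def report(bytes_transferred):
    # Called repeatedly with the number of bytes transferred in each chunk.
    print(f'{bytes_transferred} bytes transferred')

with open('big-object.bin', 'wb') as data:
    s3.download_fileobj('mybucket', 'mykey', data, Callback=report, Config=config)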
Example #7
from ..debug import debug

try:
    # Python 2
    from urlparse import urlparse
except ImportError:
    # Python 3
    from urllib.parse import urlparse

from .s3util import get_s3_client, read_in_chunks, get_timestamp

try:
    import boto3
    from boto3.s3.transfer import TransferConfig

    DOWNLOAD_FILE_THRESHOLD = 2 * TransferConfig().multipart_threshold
    DOWNLOAD_MAX_CHUNK = 2 * 1024 * 1024 * 1024 - 1
    boto_found = True
except ImportError:
    boto_found = False


def ensure_unicode(x):
    return None if x is None else to_unicode(x)


S3GetObject = namedtuple_with_defaults("S3GetObject", "key offset length")

S3PutObject = namedtuple_with_defaults(
    "S3PutObject",
    "key value path content_type metadata",
Example #8
 def __init__(self, client, resource, config=None):
     self.client = client
     self.resource = resource
     if not config:
         config = TransferConfig(max_concurrency=1, use_threads=False)
     self.config = config
Example #9
import os
from urllib import parse
import boto3
from botocore.client import Config
from botocore.exceptions import ClientError as S3ClientError
from boto3.s3.transfer import TransferConfig
import logging

# Define Environmental Variables
target_bucket = os.environ['destination_bucket']
my_max_pool_connections = int(os.environ['max_pool_connections'])
my_max_concurrency = int(os.environ['max_concurrency'])
my_multipart_chunksize = int(os.environ['multipart_chunksize'])
my_max_attempts = int(os.environ['max_attempts'])

# Set and declare configuration parameters
transfer_config = TransferConfig(max_concurrency=my_max_concurrency,
                                 multipart_chunksize=my_multipart_chunksize)
config = Config(max_pool_connections=my_max_pool_connections,
                retries={'max_attempts': my_max_attempts})

# Instantiate the S3 service resource
s3Client = boto3.resource('s3', config=config)

# # Set up logging
# logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(asctime)s: %(message)s')
logger = logging.getLogger(__name__)
logger.setLevel('INFO')

# Enable Verbose logging for Troubleshooting
# boto3.set_stream_logger("")
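
A sketch of how these objects might be combined in the rest of the function to copy an incoming object into the destination bucket (the handler name and the S3-notification event shape are assumptions, not part of the snippet above):

def lambda_handler(event, context):
    # Copy each object referenced in the event into the destination bucket,
    # reusing the tuned botocore Config and TransferConfig defined above.
    for record in event['Records']:
        source_bucket = record['s3']['bucket']['name']
        key = parse.unquote_plus(record['s3']['object']['key'])
        logger.info('Copying %s/%s to %s', source_bucket, key, target_bucket)
        s3Client.meta.client.copy({'Bucket': source_bucket, 'Key': key},
                                  target_bucket, key,
                                  Config=transfer_config)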

Example #10
        def run(self):
            while not self.shutdown_flag.is_set():
                if self.upload and self.upload_tries < 5:
                    self.upload_tries += 1
                else:
                    upload = self.upload_queue.get()

                self.upload = upload[0]
                self.expired = upload[1]
                if self.upload == "STOP":
                    if self.upload_queue.qsize() == 0:
                        self.upload_queue.put(["STOP", False])
                        self.shutdown_flag.set()
                        break
                    else:
                        self.progress_queue.put(None)
                        self.upload_tries = 0
                        self.upload = None
                        self.upload_queue.task_done()
                        upload = self.upload_queue.get()
                        self.upload = upload[0]
                        self.expired = upload[1]

                file_id, credentials, bucket, key, full_path, file_size = self.upload_config()
                prefix = 'submission_{}'.format(self.submission_id)
                """
                Methods for  file transfer: 

                If the source file is local, use the credentials supplied by the submission API to upload from local 
                file to remote S3 location.

                If the source file is from S3, use a specific AWS Profile to retrieve the source file, and uses
                credentials supplied by the submission API to upload to remote S3 location.
                
                If the file was uploaded using multi-part, it will first complete the multi part uploads.
                """

                expired_error = False
                mpu_exist = False
                for upload in self.all_mpus:
                    if upload['Key'] == key:
                        mpu_exist = True
                        mpu_to_complete = upload
                        break

                if full_path.startswith('s3'):
                    """
                    Assumes you are uploading from external s3 bucket. SOURCE_BUCKET and SOURCE_PREFIX are hard-coded 
                    values, which specify where the object should be copied from (i.e., 100206 subject directory can be
                    located in s3://hcp-openaccess-temp, with a prefix of HCP_1200).

                    Creates source and destination clients for the S3 transfer. Use permanent credentials for
                    accessing both buckets and accounts. This will require permission from NDA to write to NDA buckets.

                    The transfer streams the body of the file into memory and uploads the stream in chunks using the
                    AWS S3Transfer client, which automatically switches to multipart uploads when necessary. To
                    maximize efficiency, only files larger than 8e6 bytes are uploaded using multipart; smaller files
                    are uploaded in one part.

                    After each successful transfer, the script will change the status of the file to complete in NDA's 
                    submission webservice. 

                    NOTE: For best results and to be cost effective, it is best to perform this file transfer in an AWS 
                    EC2 instance.
                    """

                    tqdm.monitor_interval = 0

                    source_session = boto3.Session(
                        aws_access_key_id=self.aws_access_key,
                        aws_secret_access_key=self.aws_secret_key)

                    config = Config(connect_timeout=240, read_timeout=240)
                    self.source_s3 = source_session.resource('s3',
                                                             config=config)

                    source_key = key.split('/')[1:]
                    source_key = '/'.join(source_key)
                    self.source_key = '/'.join(
                        [self.source_prefix, source_key])
                    self.fileobj = self.source_s3.Object(
                        self.source_bucket,
                        self.source_key).get()['Body']  # file stream
                    # self.bytes = self.source_s3.Object(self.source_bucket, self.source_key).get()['ContentLength']

                    if mpu_exist:
                        u = UploadMultiParts(mpu_to_complete,
                                             self.full_file_path, bucket,
                                             prefix, self.config, credentials)
                        u.get_parts_information()
                        if not self.expired:
                            self.progress_queue.put(u.completed_bytes)
                        seq = 1

                        for buffer in self.fileobj.iter_chunks(
                                chunk_size=u.chunk_size):
                            if seq in u.parts_completed:
                                part = u.parts[seq - 1]
                                u.check_md5(part, buffer)
                            else:
                                try:
                                    u.upload_part(buffer, seq)
                                    self.progress_queue.put(len(buffer))
                                    # upload missing part
                                except Exception as error:
                                    e = str(error)
                                    if "ExpiredToken" in e:
                                        self.add_back_to_queue(bucket, prefix)
                                        expired_error = True
                                    else:
                                        raise error
                            seq += 1
                        if not expired_error:
                            u.complete()
                        self.progress_queue.put(None)

                    else:

                        dest_session = boto3.Session(
                            aws_access_key_id=credentials['access_key'],
                            aws_secret_access_key=credentials['secret_key'],
                            aws_session_token=credentials['session_token'],
                            region_name='us-east-1')

                        #GB = 1024 ** 3
                        config = TransferConfig(multipart_threshold=8 * 1024 *
                                                1024)
                        self.dest = dest_session.client('s3')
                        self.dest_bucket = bucket
                        self.dest_key = key
                        self.temp_key = self.dest_key + '.temp'

                        try:
                            self.dest.upload_fileobj(
                                self.fileobj,
                                self.dest_bucket,
                                self.dest_key,
                                Callback=self.UpdateProgress(
                                    self.progress_queue),
                                Config=config  # ,
                                # ExtraArgs={"Metadata": {"ContentLength": self.bytes}}
                            )
                        except boto3.exceptions.S3UploadFailedError as error:
                            e = str(error)
                            if "ExpiredToken" in e:
                                self.add_back_to_queue(bucket, prefix)
                            else:
                                raise error
                    self.progress_queue.put(None)

                else:
                    """
                    Assumes the file is being uploaded from local file system
                    """

                    if mpu_exist:
                        u = UploadMultiParts(mpu_to_complete,
                                             self.full_file_path, bucket,
                                             prefix, self.config, credentials)
                        u.get_parts_information()
                        if not self.expired:
                            self.progress_queue.put(u.completed_bytes)
                        seq = 1

                        with open(full_path, 'rb') as f:
                            while True:
                                buffer_start = u.chunk_size * (seq - 1)
                                f.seek(buffer_start)
                                buffer = f.read(u.chunk_size)
                                if len(buffer) == 0:  # EOF
                                    break
                                if seq in u.parts_completed:
                                    part = u.parts[seq - 1]
                                    u.check_md5(part, buffer)
                                else:
                                    try:
                                        u.upload_part(buffer, seq)
                                        self.progress_queue.put(len(buffer))
                                    except Exception as error:
                                        e = str(error)
                                        if "ExpiredToken" in e:
                                            self.add_back_to_queue(
                                                bucket, prefix)
                                            expired_error = True
                                            break
                                        else:
                                            raise error

                                seq += 1
                        if not expired_error:
                            u.complete()
                        self.progress_queue.put(None)
                    else:
                        if credentials:
                            session = boto3.session.Session(
                                aws_access_key_id=credentials['access_key'],
                                aws_secret_access_key=credentials[
                                    'secret_key'],
                                aws_session_token=credentials['session_token'],
                                region_name='us-east-1')
                            s3 = session.client('s3')
                            config = TransferConfig(multipart_threshold=8 *
                                                    1024 * 1024,
                                                    max_concurrency=2,
                                                    num_download_attempts=10)

                            s3_transfer = S3Transfer(s3, config)
                            tqdm.monitor_interval = 0
                            try:
                                s3_transfer.upload_file(
                                    full_path,
                                    bucket,
                                    key,
                                    callback=self.UpdateProgress(
                                        self.progress_queue))
                            except boto3.exceptions.S3UploadFailedError as error:
                                e = str(error)
                                if "ExpiredToken" in e:
                                    self.add_back_to_queue(bucket, prefix)
                                else:
                                    raise error

                            self.progress_queue.put(None)

                        else:
                            print(
                                'There was an error uploading {} after {} retry attempts'
                                .format(full_path, self.upload_tries))
                            continue

                self.upload_tries = 0
                self.upload = None
                self.upload_queue.task_done()
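
Stripped of the queueing and retry machinery, the S3-to-S3 streaming transfer described in the docstring reduces to roughly this sketch (profile, bucket and key names are placeholders):

import boto3
from boto3.s3.transfer import TransferConfig

source_s3 = boto3.Session(profile_name='source-profile').resource('s3')
dest_s3 = boto3.Session(profile_name='dest-profile').client('s3')

# Stream the source object's body and let upload_fileobj switch to multipart
# automatically above the 8 MB threshold.
body = source_s3.Object('source-bucket', 'prefix/key').get()['Body']
config = TransferConfig(multipart_threshold=8 * 1024 * 1024)
dest_s3.upload_fileobj(body, 'dest-bucket', 'prefix/key', Config=config)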
Example #11
import argparse
import boto3
from botocore.exceptions import ClientError
from boto3.s3.transfer import TransferConfig
import zipfile
import shutil
import threading
from queue import Queue
import time

s3 = boto3.client('s3')
s3_resource = boto3.resource('s3')
data_dir = '/tmp/'

config = TransferConfig(multipart_threshold=1048576,
                        multipart_chunksize=1048576,
                        max_concurrency=10,
                        num_download_attempts=10,
                        use_threads=True)

print_lock = threading.Lock()


def parse_args():
    parser = argparse.ArgumentParser(description="S3 compressor")
    parser.add_argument('--arch_bucket',
                        help='Destination Archive bucket',
                        required=True)
    parser.add_argument('--bucket', help='Source bucket', required=True)
    parser.add_argument('--prefix', default='sources', help='prefix')
    parser.add_argument('--years',
                        help='years. ex: --years 2016,2017',
Example #12
import json
#import shutil
import glob
import logging
import platform
import threading
from inspect import signature
#import traceback
import subprocess
import pandas as pd
#import numpy as np      # imported only for np.nan
import boto3
from botocore.exceptions import ClientError
from boto3.s3.transfer import TransferConfig

s3_config = TransferConfig(max_concurrency=20, use_threads=True)
logging.basicConfig(
    level=logging.INFO, format='%(asctime)-12s %(levelname)-8s %(message)s')


#from models.BIF import BIF
from utilities import utils, logs

# some basic utilities for use with s3 storage and interaction with lambdas.

SimulateAWS = False
# If this is True, we do not start an EC2 instance; instead we use local storage to pass messages
# to a second Python process, all running locally, to debug the communication code.
# Run -op update_archives with this set, and the code will do the following:
#   1. Uploading has been tested separately, so we can use a sample zip file already in the resources/s3sim folder.
#   2. ec2sim is set up to match the situation that would exist in a Linux environment.
Example #13
    "clips-video",  # for youtube-dl etc.
    "exports",
    "films",  # for films referenced
    "gallery",  # for DaVinci
    "live-audio",
    "live-video",
    "music",
    "stills",
    "table-reads",
]

MULTIPART_THRESHOLD = 1024 * 1024 * 100  # 100mb
MULTIPART_CHUNKSIZE = 1024 * 1024 * 100  # 100mb
TRANSFER_CONFIG = TransferConfig(
    multipart_threshold=MULTIPART_THRESHOLD,
    max_concurrency=10,
    multipart_chunksize=MULTIPART_CHUNKSIZE,
    use_threads=True,
)

SKIP_NONE = None
SKIP_ETAG = "etag"
SKIP_SIZE = "size"
SKIP_LMOD = "lmod"
SKIP_REGX = "regx"


def _md5(filepath, blocksize=2**20):
    m = hashlib.md5()
    with open(filepath, "rb") as f:
        while True:
            buf = f.read(blocksize)
Example #14
import boto3
from boto3.s3.transfer import TransferConfig

GB = 1024 ** 3
config = TransferConfig(multipart_threshold=5*GB)

s3 = boto3.client('s3')
s3.upload_file('/home/aakash/Videos/DontBreatheHD.mp4', 'multipart-aakash', 'DontBreatheHD.mp4', Config=config)
buckets = s3.list_buckets()
print(buckets)
Example #15
def main():
    parser = argparse.ArgumentParser(
        prog='cta-data-relay',
        description='',
        formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(
            prog, max_help_position=27, width=90))
    actions_grp = parser.add_argument_group(
        title='Actions', description='(exactly one must be specified)')
    actions_mxgrp = actions_grp.add_mutually_exclusive_group(required=True)
    actions_mxgrp.add_argument('--local-to-s3',
                               action='store_true',
                               help='Upload local files to S3 storage')
    actions_mxgrp.add_argument('--s3-to-gridftp',
                               action='store_true',
                               help='Move files from S3 to gridftp storage')
    actions_mxgrp.add_argument('--meta-show',
                               action='store_true',
                               help='Show S3 metadata')
    actions_mxgrp.add_argument('--meta-vs-gridftp',
                               action='store_true',
                               help='Compare S3 metadata vs gridftp storage')
    actions_mxgrp.add_argument('--meta-vs-local',
                               action='store_true',
                               help='Compare S3 metadata vs local storage')
    actions_mxgrp.add_argument('--meta-set-gridftp',
                               action='store_true',
                               help='Set S3 metadata to match gridftp storage')
    actions_mxgrp.add_argument(
        '--meta-prune-to-gridftp',
        action='store_true',
        help='Prune from S3 metadata files not in gridftp')

    misc_grp = parser.add_argument_group('Miscellaneous options')
    misc_grp.add_argument('--local-path',
                          metavar='PATH',
                          help='local source file or directory')
    misc_grp.add_argument('--timeout',
                          metavar='SECONDS',
                          type=int,
                          help='terminate after this amount of time')
    misc_grp.add_argument('--tempdir',
                          metavar='PATH',
                          default='/tmp',
                          help='directory for (de)compression')
    misc_grp.add_argument('--dry-run',
                          default=False,
                          action='store_true',
                          help='dry run')

    s3_grp = parser.add_argument_group('S3 options')
    s3_grp.add_argument('--s3-url',
                        metavar='URL',
                        default='https://rgw.icecube.wisc.edu',
                        help='S3 endpoint URL')
    s3_grp.add_argument('-b',
                        '--bucket',
                        metavar='NAME',
                        required=True,
                        help='S3 bucket name')
    s3_grp.add_argument('-i', dest='access_key_id', help='S3 access key id')
    s3_grp.add_argument('-k',
                        dest='secret_access_key',
                        help='S3 secret access key')
    s3_grp.add_argument('--s3-threads',
                        metavar='NUM',
                        type=int,
                        default=80,
                        help='maximum number of S3 transfer threads')
    s3_grp.add_argument('--object',
                        metavar='KEY',
                        help='operate on specific S3 object only')
    s3_grp.add_argument('--s3-stats-freq',
                        metavar='SEC',
                        default=20,
                        type=int,
                        help='frequency of S3 upload progress updates')

    grid_grp = parser.add_argument_group('GridFTP options')
    grid_grp.add_argument('--gridftp-url',
                          metavar='URL',
                          default='gsiftp://gridftp.icecube.wisc.edu',
                          help='GridFTP endpoint URL')
    grid_grp.add_argument('--gridftp-path',
                          metavar='PATH',
                          help='GridFTP path')
    grid_grp.add_argument('--gridftp-threads',
                          metavar='NUM',
                          type=int,
                          default=45,
                          help='gridftp worker pool size')

    args = parser.parse_args()
    if args.timeout:
        signal.alarm(args.timeout)
    if not os.path.isdir(args.tempdir):
        parser.exit(f'Invalid argument: {args.tempdir} is not a directory')

    s3 = boto3.resource('s3',
                        'us-east-1',
                        endpoint_url=args.s3_url,
                        aws_access_key_id=args.access_key_id,
                        aws_secret_access_key=args.secret_access_key)
    bucket = s3.Bucket(args.bucket)
    bucket.create()

    compr_threads = max(1, int(os.cpu_count() / 2))
    multipart_size = 2**20

    if args.local_to_s3:
        import cta_data_relay.s3zstd
        tx_config = TransferConfig(max_concurrency=args.s3_threads,
                                   multipart_threshold=multipart_size,
                                   multipart_chunksize=multipart_size)
        if args.local_path is None:
            parser.exit(f'Missing required argument --local-path')
        if os.path.isfile(args.local_path):
            file_info = [(args.local_path, os.path.getsize(args.local_path))]
        else:
            file_info = [(de.path, de.stat().st_size)
                         for de in os.scandir(args.local_path) if de.is_file()]
            # Sort to send small files first. This avoids the situation where
            # a file that is too big to be transferred within the allowed time
            # permanently blocks files that follow it in the list
            file_info.sort(key=itemgetter(1))
        cta_data_relay.s3zstd.zupload(bucket, file_info, args.tempdir,
                                      compr_threads, tx_config,
                                      args.s3_stats_freq, args.dry_run)
    elif args.s3_to_gridftp:
        if args.gridftp_path is None:
            parser.exit(f'Missing required argument --gridftp-path')
        s3_to_gridftp(bucket, args.gridftp_url, args.gridftp_path,
                      args.tempdir, args.object, args.dry_run)
    elif args.meta_set_gridftp:
        import cta_data_relay.meta
        if args.gridftp_path is None:
            parser.exit(f'Missing required argument --gridftp-path')
        cta_data_relay.meta.set_gridftp(bucket, args.gridftp_url,
                                        args.gridftp_path,
                                        args.gridftp_threads, args.dry_run)
    elif args.meta_show:
        import cta_data_relay.meta
        cta_data_relay.meta.show(bucket, args.object)
    elif args.meta_vs_gridftp:
        import cta_data_relay.meta
        if args.gridftp_path is None:
            parser.exit(f'Missing required argument --gridftp-path')
        cta_data_relay.meta.diff_gridftp(bucket, args.gridftp_url,
                                         args.gridftp_path,
                                         args.gridftp_threads, args.dry_run)
    elif args.meta_vs_local:
        import cta_data_relay.meta
        if args.local_path is None:
            parser.exit(f'Missing required argument --local-path')
        cta_data_relay.meta.diff_local(bucket, args.local_path)
    elif args.meta_prune_to_gridftp:
        import cta_data_relay.meta
        if args.gridftp_path is None:
            parser.exit(f'Missing required argument --gridftp-path')
        cta_data_relay.meta.prune_not_in_gridftp(bucket, args.gridftp_url,
                                                 args.gridftp_path,
                                                 args.dry_run)
    else:
        parser.exit('Usage error. Unexpected command.')
Example #16
def download_process_data_local(start_date,
                                bands,
                                end_date,
                                aws_key,
                                aws_secret,
                                aws_bucket_name,
                                delta,
                                zip_grib=False,
                                chunksize=5,
                                retries=5,
                                max_workers=10):
    """
    Download individual month directory of .grd files to local directory.

    This function will download using the ftplib all the .grd files between the
    start_date and the end_date. All dates in the NOAA NARR FTP server are
    stored following this order:
        data
        ├── year/month
            ├── year/month/day01
            ├── year/month/day02

    Here we download the monthly directory covering the user-defined start and
    end dates.

    Params:
        - start_date str: date to start download.
        - end_date str: date to stop download.
    """
    logger = logging.getLogger('luigi-interface')
    GB = 1024**3

    session = boto3.Session(profile_name='default')
    s3 = session.client('s3')
    config = TransferConfig(multipart_threshold=5 * GB)

    base_url = 'https://nomads.ncdc.noaa.gov/data/narr'
    time = ['0000', '0300', '0600', '0900', '1200', '1500', '1800', '2100']

    if end_date is None:
        end_date = start_date + relativedelta(**{'months': 1})

    if isinstance(start_date, str):
        start_date = datetime.strptime(start_date, '%Y-%m-%d')
    elif not isinstance(start_date, datetime):
        raise ValueError(
            f'{start_date} is not in the correct format or not a valid type')

    if delta is None:
        dates = datetime_range(start_date, end_date, {'days': 1})
    else:
        dates = datetime_range(start_date, end_date + delta)

    urls_time_range = []
    for day, time in product(dates, time):
        file_name = f'narr-a_221_{day.strftime("%Y%m%d")}_{time}_000.grb'
        url = URL(base_url, day.strftime('%Y%m'), day.strftime('%Y%m%d'))
        urls_time_range.append(str(URL(url, file_name)))

    with multiprocessing.Pool(max_workers) as p:
        results = p.map(partial(requests_to_s3, retries=retries),
                        urls_time_range,
                        chunksize=chunksize)

        if zip_grib:
            logger.info(
                f'Finish download for start_date {start_date.strftime("%Y-%m")}'
            )
            temp_dir_grb = mkdtemp()
            temp_file_grb = NamedTemporaryFile()
            path_to_temp_file = os.path.join(temp_dir_grb,
                                             f'{temp_file_grb.name}.zip')
            with zipfile.ZipFile(path_to_temp_file,
                                 mode='w',
                                 compression=zipfile.ZIP_DEFLATED,
                                 compresslevel=1) as zf:
                for content_file_name, content_file_result in results:
                    try:
                        zf.writestr(content_file_name, content_file_result)
                    except Exception as exc:
                        logger.info(exc)

        else:
            path_to_temp_file = mkdtemp()
            for content_file_name, content_file_result in results:
                with open(os.path.join(path_to_temp_file, content_file_name),
                          'wb') as grb_file:
                    grb_file.write(content_file_result)

        temp_dir_geo = mkdtemp()
        logger.info(
            f'Transforming GRIB to GeoTIFF using GDAL [{start_date.strftime("%Y-%m")}]'
        )
        gdal_transform_tempfile(temp_file_path=path_to_temp_file,
                                out_dir=temp_dir_geo,
                                bands=bands,
                                zip_grib=zip_grib)

        try:
            logger.info(
                f'Zipping GEOTiffs files and packing to upload [{start_date.strftime("%Y-%m")}]'
            )
            temp_file_geo = NamedTemporaryFile()
            path_geotiffs = Path(temp_dir_geo).rglob('*.tif')
            with zipfile.ZipFile(f'{temp_file_geo.name}.zip',
                                 mode='w',
                                 compression=zipfile.ZIP_DEFLATED,
                                 compresslevel=1) as zip_geo:
                for geo_file in path_geotiffs:
                    zip_geo.write(geo_file, geo_file.name)

            logger.info(
                f'Finish zipping  - Starting upload to S3 [{start_date.strftime("%Y-%m")}]'
            )
            key = f"processed_geotiff_wind/narr_data_{start_date.strftime('%Y_%m')}.zip"
            s3.upload_file(f'{temp_file_geo.name}.zip',
                           aws_bucket_name,
                           key,
                           Config=config)

        except Exception as exc:
            logger.info(exc)

        shutil.rmtree(temp_dir_geo)
        shutil.rmtree(path_to_temp_file)
        os.remove(f'{temp_file_geo.name}.zip')
Example #17
def upload_site(directory, config):
    if isinstance(directory, str):
        directory = Path(directory)
    if not config.get("name"):
        try:
            repo = _find_git_repo(directory)
        except NoGitDirectory:
            raise NoGitDirectory(
                f"From {directory} can't find its git root directory "
                "which is needed to supply a default branchname.")
        active_branch = repo.active_branch
        config["name"] = DEFAULT_NAME_PATTERN.format(
            username=getpass.getuser(),
            branchname=active_branch.name,
            date=datetime.datetime.utcnow().strftime("%Y%m%d"),
        )
    info(f"About to upload {ppath(directory)} to {config['name']}")

    session = boto3.Session(profile_name=AWS_PROFILE)
    s3 = session.client("s3")

    # First make sure the bucket exists
    try:
        s3.head_bucket(Bucket=config["name"])
    except ClientError as error:
        # If a client error is thrown, then check that it was a 404 error.
        # If it was a 404 error, then the bucket does not exist.
        if error.response["Error"]["Code"] != "404":
            raise

        # Needs to be created.
        bucket_config = {}
        if S3_DEFAULT_BUCKET_LOCATION:
            bucket_config["LocationConstraint"] = S3_DEFAULT_BUCKET_LOCATION
        s3.create_bucket(
            Bucket=config["name"],
            ACL="public-read",
            CreateBucketConfiguration=bucket_config,
        )

    try:
        website_bucket = s3.get_bucket_website(Bucket=config["name"])
    except ClientError as error:
        if error.response["Error"]["Code"] != "NoSuchWebsiteConfiguration":
            raise
        # Define the website configuration
        website_configuration = {
            "ErrorDocument": {
                "Key": "error.html"
            },
            "IndexDocument": {
                "Suffix": "index.html"
            },
        }
        website_bucket = s3.put_bucket_website(
            Bucket=config["name"],
            WebsiteConfiguration=website_configuration,
            # XXX Would be nice to set expiration here
        )
        info(f"Created website bucket called {config['name']}")

    if config["debug"]:
        info(f"Website bucket: {website_bucket!r}")

    uploaded_already = {}

    if config["refresh"]:
        info("Refresh, so ignoring what was previously uploaded.")
    else:
        continuation_token = None
        while True:
            # Have to do this so that 'ContinuationToken' can be omitted if falsy
            list_kwargs = dict(Bucket=config["name"])
            if continuation_token:
                list_kwargs["ContinuationToken"] = continuation_token
            response = s3.list_objects_v2(**list_kwargs)
            for obj in response.get("Contents", []):
                uploaded_already[obj["Key"]] = obj
            if response["IsTruncated"]:
                continuation_token = response["NextContinuationToken"]
            else:
                break

        warning(f"{len(uploaded_already):,} files already uploaded.")

    transfer_config = TransferConfig()
    skipped = []

    to_upload_maybe = []
    to_upload_definitely = []
    for fp in directory.glob("**/*.*"):
        key = str(fp.relative_to(directory))
        # name = str(fp)
        size = os.stat(fp).st_size
        with open(fp, "rb") as f:
            file_hash = hashlib.md5(f.read()).hexdigest()
        task = UploadTask(key, str(fp), size, file_hash)
        if is_junk_file(fp):
            skipped.append(task)
            continue

        if key not in uploaded_already or uploaded_already[key]["Size"] != size:
            # No doubt! We definitely didn't have this before or it's definitely
            # different.
            to_upload_definitely.append(task)
        else:
            # At this point, the key exists and the size hasn't changed.
            # However, for some files, that's not conclusive.
            # Imagine an 'index.html' file that has this as its diff:
            #
            #    - <script src=/foo.a9bef19a0.js></script>
            #    + <script src=/foo.3e98ca01d.js></script>
            #
            # ...which means it definitely has changed but the file size is
            # exactly the same as before.
            # If this is the case, we're going to *maybe* upload it.
            # However, for files that are already digest hashed, we don't need
            # to bother checking.
            if _has_hashed_filename(key):
                skipped.append(task)
            else:
                to_upload_maybe.append(task)

    T0 = time.time()
    futures = {}
    total_threadpool_time = []
    uploaded = {}
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=MAX_WORKERS_PARALLEL_UPLOADS) as executor:

        if to_upload_maybe:
            info("About to consider " f"{len(to_upload_maybe):,} files")
        if to_upload_definitely:
            info("About to upload " f"{len(to_upload_definitely):,} files")

        bucket_name = config["name"]
        for list_, check_hash_first in (
            (to_upload_definitely, False),
            (to_upload_maybe, True),
        ):
            for task in list_:
                futures[executor.submit(
                    _upload_file_maybe,
                    s3,
                    task,
                    bucket_name,
                    transfer_config,
                    check_hash_first,
                )] = task

        for future in concurrent.futures.as_completed(futures):
            was_uploaded, took = future.result()
            task = futures[future]
            uploaded[task] = (was_uploaded, took)
            total_threadpool_time.append(took)

    T1 = time.time()

    actually_uploaded = [k for k, v in uploaded.items() if v[0]]
    actually_skipped = [k for k, v in uploaded.items() if not v[0]]

    if skipped or actually_skipped:
        warning(
            f"Skipped uploading {len(skipped) + len(actually_skipped):,} files"
        )

    if uploaded:
        if actually_uploaded:
            total_uploaded_size = sum([x.size for x in actually_uploaded])
            success(f"Uploaded {len(actually_uploaded):,} "
                    f"{'file' if len(actually_uploaded) == 1 else 'files'} "
                    f"(totalling {fmt_size(total_uploaded_size)}) "
                    f"(~{fmt_size(total_uploaded_size / 60)}/s)")

        if total_threadpool_time:
            info("Sum of time to upload in thread pool "
                 f"{fmt_seconds(sum(total_threadpool_time))}")

    success(f"Done in {fmt_seconds(T1 - T0)}")

    return {"uploaded": uploaded, "skipped": skipped, "took": T1 - T0}
Example #18
# Threads are used by default in the managed transfer methods. To ensure no threads are used in the transfer process,
# set use_threads to False. Note that in setting use_threads to False, the value for max_concurrency is ignored as the
# main thread will only ever be used:

import boto3
from boto3.s3.transfer import TransferConfig
s3_bucket = "test-uala"
s3_file="tmp.txt.large"

# Get the service client
s3 = boto3.client('s3')

# Ensure that no threads are used.
config = TransferConfig(use_threads=False)

# Download object at s3_bucket with key-name to tmp.txt with the
# set configuration
s3.download_file(s3_bucket, s3_file, s3_file, Config=config)
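
For contrast, a sketch of the threaded default, where max_concurrency does take effect (same placeholder bucket and key):

# Threads enabled (the default); up to 20 concurrent ranged GETs per download.
threaded_config = TransferConfig(use_threads=True, max_concurrency=20)
s3.download_file(s3_bucket, s3_file, s3_file + '.threaded', Config=threaded_config)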
Example #19
    def execute(self):
        """Executes the fetch operation. This is different to the DB API as it returns an iterable. Of course we could
        model that API more precisely in future.

        :return: An iterable of the records fetched
        """

        # print("Executing select_object_content")

        self.timer.start()

        if not self.need_s3select:
            proj_dir = os.environ['PYTHONPATH'].split(":")[0]
            table_loc = os.path.join(proj_dir, TABLE_STORAGE_LOC)
            if not os.path.exists(table_loc):
                os.makedirs(table_loc)

            self.table_local_file_path = os.path.join(table_loc, self.s3key)

            if not os.path.exists(
                    self.table_local_file_path) or not USE_CACHED_TABLES:
                config = TransferConfig(multipart_chunksize=8 * MB,
                                        multipart_threshold=8 * MB)

                self.table_data = io.BytesIO()

                self.s3.download_fileobj(Bucket=S3_BUCKET_NAME,
                                         Key=self.s3key,
                                         Fileobj=self.table_data,
                                         Config=config)

                self.num_http_get_requests = Cursor.calculate_num_http_requests(
                    self.table_data, config)

            return self.parse_file()
        else:
            # Note:
            #
            # CSV files use | as a delimiter and have a trailing delimiter so record delimiter is |\n
            #
            # NOTE: As responses are chunked the file headers are only returned in the first chunk.
            # We ignore them for now just because its simpler. It does mean the records are returned as a list
            #  instead of a dict though (can change in future).
            #
            response = self.s3.select_object_content(
                Bucket=S3_BUCKET_NAME,
                Key=self.s3key,
                ExpressionType='SQL',
                Expression=self.s3sql,
                InputSerialization={
                    'CSV': {
                        'FileHeaderInfo': 'Use',
                        'RecordDelimiter': '|\n',
                        'FieldDelimiter': '|'
                    }
                },
                OutputSerialization={'CSV': {}})

            self.event_stream = response['Payload']

            self.num_http_get_requests = 1

            return self.parse_event_stream()
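
Cursor.calculate_num_http_requests is not shown here; for a managed ranged download it presumably reduces to dividing the object size by the configured chunk size, along these lines (a sketch, not the project's actual implementation):

import math

def estimate_num_http_get_requests(object_size_bytes, config):
    # Rough count of GET requests a managed download issues for one object.
    if object_size_bytes <= config.multipart_threshold:
        return 1
    return math.ceil(object_size_bytes / config.multipart_chunksize)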
Example #20
def upload_file(file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :return: True if file was uploaded, else False
    """

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    credentials = {'aws_access_key_id': os.getenv('AWS_SERVER_PUBLIC_KEY'),
                   'aws_secret_access_key': os.getenv('AWS_SERVER_SECRET_KEY'),
                   'region_name': os.getenv('REGION_NAME')
                   }

    # Upload the file
    s3_client = boto3.client('s3', **credentials, config=Config(signature_version='s3v4'))

    transfer_config = TransferConfig(multipart_threshold=1024 * 25,
                                     max_concurrency=10,
                                     multipart_chunksize=1024 * 25,
                                     use_threads=True)

    try:
        # print('aaa', file_name)
        print(bcolors.WARNING + 'Start upload: ' + bcolors.ENDC + os.path.split(file_name)[1])
        print(bcolors.WARNING + 'Renamed to:   ' + bcolors.ENDC + os.path.split(object_name)[1])
        response = s3_client.upload_file(file_name, bucket, object_name,
                                         ExtraArgs={'ACL': 'public-read'},
                                         Config=transfer_config,
                                         Callback=ProgressPercentage(file_name)
                                         )
    except ClientError as e:
        print('Error!!!')
        print(e)
        return False

    # https://stackoverflow.com/questions/33809592/upload-to-amazon-s3-using-boto3-and-return-public-url

    # generate link 1
    # link = s3_client.generate_presigned_url('get_object', ExpiresIn=7776000, Params={'Bucket': S3_Bucket, 'Key': object_name})

    # generate link 2
    # import boto3
    # s3_client = boto3.client
    # bucket_location = s3_client.get_bucket_location(Bucket='my_bucket_name')
    # url = "https://s3.{0}.amazonaws.com/{1}/{2}".format(bucket_location['LocationConstraint'], 'my_bucket_name',
    #                                                     quote_plus('2018-11-26 16:34:48.351890+09:00.jpg')
    # print(url)

    # generate link 3
    link = '%s/%s/%s' % (s3_client.meta.endpoint_url, bucket, object_name)

    print('\n' + bcolors.WARNING + 'File link:    ' + link + bcolors.ENDC)

    pyperclip.copy(link)
    spam = pyperclip.paste()
    print('\n' + bcolors.OKGREEN + 'Link copied to clipboard.')

    return True
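
ProgressPercentage is referenced but not defined in this snippet; a minimal thread-safe callback in the style of the boto3 documentation could look like:

import os
import sys
import threading

class ProgressPercentage:
    def __init__(self, filename):
        self._filename = filename
        self._size = float(os.path.getsize(filename))
        self._seen_so_far = 0
        self._lock = threading.Lock()

    def __call__(self, bytes_amount):
        # Invoked from the transfer threads with the bytes uploaded since the last call.
        with self._lock:
            self._seen_so_far += bytes_amount
            percentage = (self._seen_so_far / self._size) * 100
            sys.stdout.write(f'\r{self._filename}  {self._seen_so_far} / '
                             f'{self._size:.0f}  ({percentage:.2f}%)')
            sys.stdout.flush()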
Example #21
 def __init__(self, s3_client: boto3.session.Session.client) -> None:
     megabyte = 1024**2
     gigabyte = 1024**3
     self.s3_client = s3_client
     self.s3_config = TransferConfig(io_chunksize=4 * megabyte,
                                     multipart_threshold=4 * gigabyte)
Example #22
def copy(self, CopySource, Bucket, Key, ExtraArgs=None, Callback=None,
         SourceClient=None, Config=None):
    """Copy an object from one S3 location to another.

    This is a managed transfer which will perform a multipart copy in
    multiple threads if necessary.

    Usage::

        import boto3
        s3 = boto3.resource('s3')
        copy_source = {
            'Bucket': 'mybucket',
            'Key': 'mykey'
        }
        s3.meta.client.copy(copy_source, 'otherbucket', 'otherkey')

    :type CopySource: dict
    :param CopySource: The name of the source bucket, key name of the
        source object, and optional version ID of the source object. The
        dictionary format is:
        ``{'Bucket': 'bucket', 'Key': 'key', 'VersionId': 'id'}``. Note
        that the ``VersionId`` key is optional and may be omitted.

    :type Bucket: str
    :param Bucket: The name of the bucket to copy to

    :type Key: str
    :param Key: The name of the key to copy to

    :type ExtraArgs: dict
    :param ExtraArgs: Extra arguments that may be passed to the
        client operation

    :type Callback: method
    :param Callback: A method which takes a number of bytes transferred to
        be periodically called during the copy.

    :type SourceClient: botocore or boto3 Client
    :param SourceClient: The client to be used for operation that
        may happen at the source object. For example, this client is
        used for the head_object that determines the size of the copy.
        If no client is provided, the current client is used as the client
        for the source object.

    :type Config: boto3.s3.transfer.TransferConfig
    :param Config: The transfer configuration to be used when performing the
        copy.
    """
    subscribers = None
    if Callback is not None:
        subscribers = [ProgressCallbackInvoker(Callback)]

    config = Config
    if config is None:
        config = TransferConfig()

    with create_transfer_manager(self, config) as manager:
        future = manager.copy(
            copy_source=CopySource, bucket=Bucket, key=Key,
            extra_args=ExtraArgs, subscribers=subscribers,
            source_client=SourceClient)
        return future.result()
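
A usage sketch combining the optional Callback, SourceClient and Config arguments (regions, bucket and key names are placeholders):

import boto3
from boto3.s3.transfer import TransferConfig

src = boto3.client('s3', region_name='us-east-1')
dst = boto3.client('s3', region_name='eu-west-1')

def on_progress(bytes_transferred):
    print(f'{bytes_transferred} bytes copied')

dst.copy({'Bucket': 'source-bucket', 'Key': 'big/object'},
         'dest-bucket', 'big/object',
         SourceClient=src,
         Callback=on_progress,
         Config=TransferConfig(multipart_threshold=64 * 1024 * 1024))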
Example #23
    def zip_campaign_files():  # pylint: disable=too-many-locals
        """Archive and publish all test campaign data to the S3 repository.

        It allows collecting all the artifacts from the S3 repository.

        It could be overridden if the common implementation is not
        suitable.

        The credentials must be configured before publishing the artifacts:

            * fill ~/.aws/credentials or ~/.boto,
            * set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY in env.

        The next vars must be set in env:

            * S3_ENDPOINT_URL (http://127.0.0.1:9000),
            * S3_DST_URL (s3://xtesting/prefix),

        Returns:
            Campaign.EX_OK if artifacts were published to repository.
            Campaign.EX_DUMP_ARTIFACTS_ERROR otherwise.
        """
        try:
            build_tag = env.get('BUILD_TAG')
            assert Campaign.dump_db() == Campaign.EX_OK
            assert Campaign.dump_artifacts() == Campaign.EX_OK
            with zipfile.ZipFile(f'{build_tag}.zip', 'w',
                                 zipfile.ZIP_DEFLATED) as zfile:
                zfile.write(f"{build_tag}.json")
                for root, _, files in os.walk(build_tag):
                    for filename in files:
                        zfile.write(os.path.join(root, filename))
            b3resource = boto3.resource(
                's3', endpoint_url=os.environ["S3_ENDPOINT_URL"])
            dst_s3_url = os.environ["S3_DST_URL"]
            # A very large threshold effectively disables multipart uploads,
            # which the Google Cloud Storage S3-compatible API does not support.
            multipart_threshold = 5 * 1024**5 if "google" in os.environ[
                "S3_ENDPOINT_URL"] else 8 * 1024 * 1024
            tconfig = TransferConfig(multipart_threshold=multipart_threshold)
            bucket_name = urllib.parse.urlparse(dst_s3_url).netloc
            mime_type = mimetypes.guess_type(f'{build_tag}.zip')
            path = urllib.parse.urlparse(dst_s3_url).path.strip("/")
            # pylint: disable=no-member
            b3resource.Bucket(bucket_name).upload_file(
                f'{build_tag}.zip',
                os.path.join(path, f'{build_tag}.zip'),
                Config=tconfig,
                ExtraArgs={
                    'ContentType': mime_type[0] or 'application/octet-stream'
                })
            dst_http_url = os.environ["HTTP_DST_URL"]
            link = os.path.join(dst_http_url, f'{build_tag}.zip')
            Campaign.__logger.info(
                "All data were successfully published:\n\n%s", link)
            return Campaign.EX_OK
        except KeyError as ex:
            Campaign.__logger.error("Please check env var: %s", str(ex))
            return Campaign.EX_ZIP_CAMPAIGN_FILES_ERROR
        except botocore.exceptions.NoCredentialsError:
            Campaign.__logger.error(
                "Please fill ~/.aws/credentials, ~/.boto or set "
                "AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY in env")
            return Campaign.EX_ZIP_CAMPAIGN_FILES_ERROR
        except Exception:  # pylint: disable=broad-except
            Campaign.__logger.exception("Cannot publish the artifacts")
            return Campaign.EX_ZIP_CAMPAIGN_FILES_ERROR
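For reference, a minimal sketch of the environment this method expects before being called (values are placeholders, taken from the docstring where available):

import os

os.environ["S3_ENDPOINT_URL"] = "http://127.0.0.1:9000"
os.environ["S3_DST_URL"] = "s3://xtesting/prefix"
os.environ["HTTP_DST_URL"] = "http://127.0.0.1/prefix"   # assumed HTTP mirror of the S3 prefix
os.environ["BUILD_TAG"] = "xtesting-daily-42"            # placeholder build tag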
Example No. 24
from itertools import groupby
from operator import attrgetter

from . import _util
from .exceptions import BucketStorageUnavailableException
from .storage import Storage

# Max size in bytes before uploading in parts.
AWS_UPLOAD_MAX_SIZE = 8 * 1024 * 1024
# Size of parts when uploading in parts.
AWS_UPLOAD_PART_SIZE = 8 * 1024 * 1024

s3_multipart_config = TransferConfig(
    multipart_threshold=AWS_UPLOAD_MAX_SIZE,
    multipart_chunksize=AWS_UPLOAD_PART_SIZE,
    max_concurrency=10,
    num_download_attempts=10,
)


class Bucket(Storage):
    """Represents a resource/result bucket.

    This class is the interface to manage resources or results from a
    :class:`qarnot.bucket.Bucket`.

    .. note::
       A :class:`Bucket` must be created with
       :meth:`qarnot.connection.Connection.create_bucket`
       or retrieved with :meth:`qarnot.connection.Connection.buckets`, :meth:`qarnot.connection.Connection.retrieve_bucket`,
       or :meth:`qarnot.connection.Connection.retrieve_or_create_bucket`.
Example No. 25
import logging
import pathlib
import threading

import boto3
from boto3.s3.transfer import TransferConfig
MB = 1024**2
GB = 1024**3
logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                    level=logging.INFO)
formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('S3upload')
logger.propagate = False
ch = logging.StreamHandler()
ch.setFormatter(formatter)
ch.setLevel(logging.ERROR)
logger.addHandler(ch)
CONFIG = TransferConfig(multipart_threshold=GB)


class ProgressTracker(object):
    def __init__(self):
        self._size = 0
        self._numfiles = 0
        self._seen_so_far = 0
        self.completed = 0
        self.failed = 0
        self._lock = threading.Lock()

    def trackfile(self, fpath):
        fpath = pathlib.Path(fpath)
        with self._lock:
            self._size += fpath.stat().st_size
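The class above is truncated; a rough usage sketch, assuming the tracker also exposes a per-chunk callback (the update_progress name below is hypothetical and not shown in the snippet), might be:

import boto3

s3 = boto3.client('s3')
tracker = ProgressTracker()
tracker.trackfile('big_file.bin')
s3.upload_file(
    'big_file.bin', 'my-bucket', 'big_file.bin',
    Config=CONFIG,
    # update_progress is a hypothetical method receiving the number of bytes
    # transferred so far; the truncated class only shows trackfile().
    Callback=tracker.update_progress)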
Example No. 26
    def upload_file(
        data_file: str,
        meta: 'SnowflakeFileMeta',
        encryption_metadata: 'EncryptionMetadata',
        max_concurrency: int,
        multipart_threshold: int,
    ):
        """Uploads the local file to S3.

        Args:
            data_file: File path on local system.
            meta: The File meta object (contains credentials and remote location).
            encryption_metadata: Encryption metadata to be set on object.
            max_concurrency: The maximum number of threads to use for the upload.
            multipart_threshold: The file size in bytes above which the file is uploaded concurrently in chunks.

        Raises:
            HTTPError if an HTTP error occurs.

        Returns:
            None.
        """
        try:
            s3_metadata = {
                HTTP_HEADER_CONTENT_TYPE: HTTP_HEADER_VALUE_OCTET_STREAM,
                SFC_DIGEST: meta.sha256_digest,
            }
            if encryption_metadata:
                s3_metadata.update({
                    AMZ_IV: encryption_metadata.iv,
                    AMZ_KEY: encryption_metadata.key,
                    AMZ_MATDESC: encryption_metadata.matdesc,
                })
            s3location = SnowflakeS3Util.extract_bucket_name_and_path(
                meta.client_meta.stage_info['location'])
            s3path = s3location.s3path + meta.dst_file_name.lstrip('/')

            akey = meta.client_meta.cloud_client.Object(
                s3location.bucket_name, s3path)
            extra_args = {'Metadata': s3_metadata}
            config = TransferConfig(
                multipart_threshold=multipart_threshold,
                max_concurrency=max_concurrency,
                num_download_attempts=10,
            )

            if meta.src_stream is None:
                akey.upload_file(
                    data_file,
                    Callback=meta.put_callback(
                        data_file,
                        os.path.getsize(data_file),
                        output_stream=meta.put_callback_output_stream,
                        show_progress_bar=meta.show_progress_bar)
                    if meta.put_callback else None,
                    ExtraArgs=extra_args,
                    Config=config)
            else:
                upload_stream = meta.real_src_stream or meta.src_stream
                upload_size = upload_stream.seek(0, os.SEEK_END)
                upload_stream.seek(0)

                akey.upload_fileobj(
                    upload_stream,
                    Callback=meta.put_callback(
                        data_file,
                        upload_size,
                        output_stream=meta.put_callback_output_stream,
                        show_progress_bar=meta.show_progress_bar)
                    if meta.put_callback else None,
                    ExtraArgs=extra_args,
                    Config=config,
                )

            logger.debug('DONE putting a file')
            meta.dst_file_size = meta.upload_size
            meta.result_status = ResultStatus.UPLOADED
        except botocore.exceptions.ClientError as err:
            if err.response['Error']['Code'] == EXPIRED_TOKEN:
                logger.debug("AWS Token expired. Renew and retry")
                meta.result_status = ResultStatus.RENEW_TOKEN
                return
            logger.debug(f"Failed to upload a file: {data_file}, err: {err}",
                         exc_info=True)
            raise err
        except S3UploadFailedError as err:
            if EXPIRED_TOKEN in str(err):
                # Since AWS token expiration error can be encapsulated in
                # S3UploadFailedError, the text match is required to
                # identify the case.
                logger.debug(
                    f'Failed to upload a file: {data_file}, err: {err}. Renewing AWS Token and Retrying'
                )
                meta.result_status = ResultStatus.RENEW_TOKEN
                return

            meta.last_error = err
            meta.result_status = ResultStatus.NEED_RETRY
        except OpenSSL.SSL.SysCallError as err:
            meta.last_error = err
            if err.args[0] == ERRORNO_WSAECONNABORTED:
                # connection was disconnected by S3
                # because of too many connections. retry with
                # less concurrency to mitigate it
                meta.result_status = ResultStatus.NEED_RETRY_WITH_LOWER_CONCURRENCY
            else:
                meta.result_status = ResultStatus.NEED_RETRY
Example No. 27
    def upload_file(data_file, meta, encryption_metadata, max_concurrency):
        logger = getLogger(__name__)
        try:
            s3_metadata = {
                HTTP_HEADER_CONTENT_TYPE: HTTP_HEADER_VALUE_OCTET_STREAM,
                SFC_DIGEST: meta[SHA256_DIGEST],
            }
            if (encryption_metadata):
                s3_metadata.update({
                    AMZ_IV: encryption_metadata.iv,
                    AMZ_KEY: encryption_metadata.key,
                    AMZ_MATDESC: encryption_metadata.matdesc,
                })
            s3location = SnowflakeS3Util.extract_bucket_name_and_path(
                meta[u'stage_info'][u'location'])
            s3path = s3location.s3path + meta[u'dst_file_name'].lstrip('/')

            akey = meta[u'client'].Object(s3location.bucket_name, s3path)
            akey.upload_file(
                data_file,
                Callback=meta[u'put_callback'](
                    data_file,
                    os.path.getsize(data_file),
                    output_stream=meta[u'put_callback_output_stream'],
                    show_progress_bar=meta[u'show_progress_bar'])
                if meta[u'put_callback'] else None,
                ExtraArgs={
                    u'Metadata': s3_metadata,
                },
                Config=TransferConfig(
                    multipart_threshold=SnowflakeS3Util.DATA_SIZE_THRESHOLD,
                    max_concurrency=max_concurrency,
                    num_download_attempts=10,
                ))

            logger.debug(u'DONE putting a file')
            meta[u'dst_file_size'] = meta[u'upload_size']
            meta[u'result_status'] = ResultStatus.UPLOADED
        except botocore.exceptions.ClientError as err:
            if err.response[u'Error'][u'Code'] == EXPIRED_TOKEN:
                logger.debug(u"AWS Token expired. Renew and retry")
                meta[u'result_status'] = ResultStatus.RENEW_TOKEN
                return
            logger.debug(u"Failed to upload a file: %s, err: %s",
                         data_file,
                         err,
                         exc_info=True)
            raise err
        except S3UploadFailedError as err:
            if EXPIRED_TOKEN in TO_UNICODE(err):
                # Since AWS token expiration error can be encapsulated in
                # S3UploadFailedError, the text match is required to
                # identify the case.
                logger.debug(
                    'Failed to upload a file: %s, err: %s. Renewing '
                    'AWS Token and Retrying', data_file, err)
                meta[u'result_status'] = ResultStatus.RENEW_TOKEN
                return

            meta[u'last_error'] = err
            meta[u'result_status'] = ResultStatus.NEED_RETRY
        except OpenSSL.SSL.SysCallError as err:
            meta[u'last_error'] = err
            if err.args[0] == ERRORNO_WSAECONNABORTED:
                # connection was disconnected by S3
                # because of too many connections. retry with
                # less concurrency to mitigate it
                meta[
                    u'result_status'] = ResultStatus.NEED_RETRY_WITH_LOWER_CONCURRENCY
            else:
                meta[u'result_status'] = ResultStatus.NEED_RETRY
Example No. 28
def s3_download(bucket,
                s3_filepath,
                local_filepath,
                profile_name='default',
                region_name='us-west-2',
                multipart_threshold=8388608,
                multipart_chunksize=8388608):
    """ Downloads a file or collection of files from S3

    Parameters
    ----------
    bucket : str
        name of S3 bucket
    s3_filepath : str or list
        path and filename within bucket to file(s) you would like to download
    local_filepath : str or list
        path and filename for file(s) to be saved locally
    profile_name : str
        profile name for credentials (default 'default' or organization-specific)
    region_name : str
        name of AWS region (default value 'us-west-2')
    multipart_threshold : int
        minimum file size to initiate multipart download
    multipart_chunksize : int
        chunksize for multipart download

    Returns
    -------
    None

    Example use
    -----------
    # Downloading a single file from S3:
    s3_download(
        bucket='my_bucket',
        s3_filepath='tmp/my_file.csv',
        local_filepath='../data/my_file.csv')

    # Downloading with a profile name:
    s3_download(
        bucket='my_bucket',
        profile_name='my-profile-name',
        s3_filepath='tmp/my_file.csv',
        local_filepath='../data/my_file.csv')

    # Downloading a list of files from S3 (will not download contents of subdirectories):
    s3_download(
        bucket='my_bucket',
        s3_filepath=['tmp/my_file1.csv', 'tmp/my_file2.csv', 'img.png'],
        local_filepath=['../data/my_file1.csv', '../data/my_file2.csv', '../img.png'])

    # Downloading files matching a pattern from S3 (will not download contents of subdirectories):
    s3_download(
        bucket='my_bucket',
        s3_filepath='tmp/*.csv',
        local_filepath='../data/')

    # Downloading all files in a directory from S3 (will not download contents of subdirectories):
    s3_download(
        bucket='my_bucket',
        s3_filepath='tmp/*',
        local_filepath='../data/')
    """
    # validate s3_filepath and local_filepath arguments
    _download_upload_filepath_validator(s3_filepath=s3_filepath,
                                        local_filepath=local_filepath)
    # create bucket object
    my_bucket = s3_get_bucket(bucket=bucket,
                              profile_name=profile_name,
                              region_name=region_name)
    # multipart_threshold and multipart_chunksize, defaults = Amazon defaults
    config = TransferConfig(multipart_threshold=multipart_threshold,
                            multipart_chunksize=multipart_chunksize)
    if isinstance(s3_filepath, str):
        # find keys matching wildcard
        if '*' in s3_filepath:
            s3_filepath = _s3_glob(s3_filepath=s3_filepath,
                                   my_bucket=my_bucket)
            local_filepath = [
                os.path.join(local_filepath,
                             key.split('/')[-1]) for key in s3_filepath
            ]
        # insert into list so same looping structure can be used
        else:
            s3_filepath = [s3_filepath]
            local_filepath = [local_filepath]
    # download all files from S3
    for s3_key, local_file in zip(s3_filepath, local_filepath):
        try:
            my_bucket.download_file(s3_key, local_file, Config=config)
        except ClientError as e:
            error_code = int(e.response['Error']['Code'])
            if error_code == 400:
                raise NameError('The credentials are expired or not valid. ' +
                                str(e))
            else:
                raise e
    return
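Both multipart knobs default to boto3's 8 MB values; for very large objects they can be raised, e.g. (the sizes below are illustrative only):

s3_download(
    bucket='my_bucket',
    s3_filepath='tmp/big_file.bin',
    local_filepath='../data/big_file.bin',
    multipart_threshold=100 * 1024**2,   # only use multipart above 100 MB
    multipart_chunksize=100 * 1024**2)   # 100 MB parts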
Example No. 29
def multi_part_upload_with_s3(filename=None,
                              key_path=None,
                              bucket=None,
                              upload_type="single"):
    start_time = default_timer()
    bucket_name_prefix = "prep-logs"
    key, sec = catchMeIfYouCan(aawwss_text)
    aaa_env, sss_env = catchMeIfYouCan(aawwss_env)
    os.environ[aaa_env] = key
    os.environ[sss_env] = sec
    if bucket is None or bucket == "":
        BUCKET_NAME = f"{bucket_name_prefix}-kr"
    else:
        BUCKET_NAME = f"{bucket_name_prefix}{bucket}"
    cprint(f"\t bucket {bucket} -> {BUCKET_NAME}") if args.verbose else False
    if bucket == "-hk":
        s3 = boto3.resource('s3', region_name="ap-east-1")
    else:
        s3 = boto3.resource('s3', )
    # single-part upload
    if upload_type == "single":
        s3.meta.client.meta.events.register('choose-signer.s3.*',
                                            disable_signing)
        # config = TransferConfig(use_threads=True, multipart_threshold=1024*1024*8, multipart_chunksize=1024*1024*8)
        config = TransferConfig(multipart_threshold=838860800,
                                max_concurrency=10,
                                multipart_chunksize=8388608,
                                num_download_attempts=5,
                                max_io_queue=100,
                                io_chunksize=262144,
                                use_threads=True)
    # multiparts mode -> AWS S3 CLI: Anonymous users cannot initiate multipart uploads
    elif upload_type == "multi":
        pass
        config = TransferConfig(multipart_threshold=1024 * 25,
                                max_concurrency=10,
                                multipart_chunksize=1024 * 25,
                                use_threads=True)
    else:
        cprint(f"Unknown upload_type-> {upload_type}", "red")
    if filename is None:
        cprint(f"[ERROR] filename is None", "red")
        raise SystemExit()
    if key_path is None:
        key_path = filename
    try:
        s3.meta.client.upload_file(
            filename,
            BUCKET_NAME,
            key_path,
            # ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/pdf'},
            Config=config,
            Callback=ProgressPercentage(filename))
    except Exception as e:
        e = str(e).replace(":", ":\n")
        cprint(f"\n[ERROR] File upload fail / cause->{e}\n", "red")
        raise SystemExit()

    elapsed = default_timer() - start_time
    time_completed_at = "{:5.3f}s".format(elapsed)

    cprint(f"\n\t time_completed_at = {time_completed_at}")
Example No. 30
def stream_time_range_s3(start_date,
                         end_date,
                         aws_key,
                         aws_secret,
                         aws_bucket_name,
                         key,
                         delta,
                         max_workers=10):
    """
    Download individual month directory of .grd files to local directory.

    This function will download using the ftplib all the .grd files between the
    start_date and the end_date. All dates in the NOAA NARR FTP server are
    stored following this order:
        data
        ├── year/month
            ├── year/month/day01
            ├── year/month/day02

    Here we download the monthly directory with the user-defined dates in the
    start and end dates. 

    Params:
        - start_year str: year to start download.
        - end_year str: year to stop download.
    """

    GB = 1024**3

    session = boto3.Session(profile_name='default')
    s3 = session.client('s3')
    config = TransferConfig(multipart_threshold=5 * GB)

    base_url = 'https://nomads.ncdc.noaa.gov/data/narr'
    times = ['0000', '0300', '0600', '0900', '1200', '1500', '1800', '2100']

    if isinstance(start_date, str):
        start_date = datetime.strptime(start_date, '%Y-%m-%d')
    elif not isinstance(start_date, datetime):
        raise ValueError(
            f'{start_date} is not in the correct format or not a valid type')

    if delta is None:
        dates = datetime_range(start_date, end_date, {'days': 1})
    else:
        dates = datetime_range(start_date, end_date + delta)

    urls_time_range = []
    for day, time in product(dates, times):
        file_name = f'narr-a_221_{day.strftime("%Y%m%d")}_{time}_000.grb'
        url = URL(base_url, day.strftime('%Y%m'), day.strftime('%Y%m%d'))
        urls_time_range.append(str(URL(url, file_name)))

    with multiprocessing.Pool(max_workers) as p:
        results = p.map(requests_to_s3, urls_time_range, chunksize=1)

        logger.info('Finish download')
        temp_dir = mkdtemp()
        temp_file = NamedTemporaryFile()
        path_to_temp_file = os.path.join(temp_dir, temp_file.name)
        with zipfile.ZipFile(path_to_temp_file,
                             mode='w',
                             compression=zipfile.ZIP_DEFLATED,
                             compresslevel=1) as zf:
            for content_file_name, content_file_result in results:
                try:
                    zf.writestr(content_file_name, content_file_result)
                except Exception as exc:
                    print(exc)

        logger.info('Finish zipping  - Upload Start')
        s3.upload_file(path_to_temp_file, aws_bucket_name, key, Config=config)

    return path_to_temp_file
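The requests_to_s3 helper mapped over the URLs above is not shown; judging from how its results are used, a minimal sketch (name and return shape inferred from the code, not the original implementation) could be:

import os
from urllib.parse import urlparse

import requests


def requests_to_s3(url):
    """Fetch one NARR file and return (file_name, raw_bytes) for zipping."""
    file_name = os.path.basename(urlparse(url).path)
    response = requests.get(url, timeout=120)
    response.raise_for_status()
    return file_name, response.content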
def source_dataset():
    source_dataset_url = get_dataset_url()
    response = None
    retries = 5
    for attempt in range(retries):
        try:
            response = urlopen(source_dataset_url)
        except HTTPError as e:
            if attempt == retries - 1:
                raise Exception('HTTPError: ', e.code)
            time.sleep(0.2 * attempt)
        except URLError as e:
            if attempt == retries - 1:
                raise Exception('URLError: ', e.reason)
            time.sleep(0.2 * attempt)
        else:
            break

    if response is None:
        raise Exception('There was an issue downloading the dataset')

    data_set_name = os.environ['DATA_SET_NAME']

    data_dir = '/tmp'
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)

    file_location = os.path.join(data_dir, data_set_name+'.csv')

    s3_bucket = os.environ['S3_BUCKET']
    s3 = boto3.client('s3')
    s3_resource = boto3.resource('s3')
    config = TransferConfig(multipart_threshold=1024*25, max_concurrency=10,
                            multipart_chunksize=1024*25, use_threads=True)

    s3_uploads = []
    asset_list = []

    obj_name = file_location.split('/', 3).pop().replace(' ', '_').lower()
    file_location = os.path.join(data_dir, obj_name)
    new_s3_key = data_set_name + '/dataset/' + obj_name
    filedata = response.read()

    has_changes = md5_compare(s3, s3_bucket, new_s3_key, BytesIO(filedata))
    if has_changes:
        s3_resource.Object(s3_bucket, new_s3_key).put(Body=filedata)
        print('Uploaded: ' + file_location)
    else:
        print('No changes in: ' + file_location)

    asset_source = {'Bucket': s3_bucket, 'Key': new_s3_key}
    s3_uploads.append({'has_changes': has_changes,
                       'asset_source': asset_source})

    count_updated_data = sum(
        1 for upload in s3_uploads if upload['has_changes'])
    if count_updated_data > 0:
        asset_list = list(
            map(lambda upload: upload['asset_source'], s3_uploads))
        if len(asset_list) == 0:
            raise Exception('Something went wrong when uploading files to s3')

    # asset_list is returned to be used in the lambda_handler function;
    # if it is empty, lambda_handler will not republish
    return asset_list
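The md5_compare helper used above is defined elsewhere; a minimal sketch consistent with how it is called (assumptions: a missing object counts as changed, and the ETag is compared against the MD5 of the new data) might be:

import hashlib

from botocore.exceptions import ClientError


def md5_compare(s3_client, bucket, key, data_stream):
    new_md5 = hashlib.md5(data_stream.read()).hexdigest()
    data_stream.seek(0)
    try:
        head = s3_client.head_object(Bucket=bucket, Key=key)
    except ClientError:
        # Object does not exist yet (or is inaccessible), so treat it as changed.
        return True
    # Note: the ETag equals the plain MD5 only for non-multipart uploads.
    return head['ETag'].strip('"') != new_md5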