예제 #1
0
def list_files_in_bucket(config, bucket, search_prefix=None):
    """List the objects in an S3 bucket, following pagination to the end.

    Args:
        config: Not used by this function; kept so existing callers that
            pass it keep working.
        bucket: Name of the bucket, sent to S3 as ``Bucket``.
        search_prefix: Optional key prefix. When not ``None`` it is sent to
            S3 as ``Prefix``.

    Returns:
        A list holding the ``Contents`` entries of every
        ``list_objects_v2`` response page, in the order they were returned.
        The list is empty when no page carries any entries.
    """
    s3_client = boto3.client('s3')

    s3_objects = []

    max_results = 1000
    args = {
        'Bucket': bucket,
        'MaxKeys': max_results,
    }

    if search_prefix is not None:
        args['Prefix'] = search_prefix

    next_continuation_token = None

    while True:
        page_args = args.copy()

        if next_continuation_token is not None:
            logger.debug('Continuing pagination with token "{}".'.format(
                next_continuation_token))
            page_args['ContinuationToken'] = next_continuation_token

        result = s3_client.list_objects_v2(**page_args)

        # A response page with no keys (empty bucket, or a prefix that
        # matches nothing) has no 'Contents' entry at all, so indexing it
        # directly would raise KeyError instead of yielding an empty list.
        s3_objects += result.get('Contents', [])

        next_continuation_token = result.get('NextContinuationToken')
        if next_continuation_token is None:
            break

    logger.info("Found {} files.".format(len(s3_objects)))

    return s3_objects
예제 #2
0
파일: s3.py 프로젝트: stvhanna/tap-s3-csv-1
def get_input_files_for_table(config, table_spec, modified_since=None):
    """Select the bucket objects that belong to a table, oldest first.

    Args:
        config: Mapping that must contain ``'bucket'``; it is also passed
            through to ``list_files_in_bucket``.
        table_spec: Mapping that must contain ``'pattern'`` (a regular
            expression searched for within each object key) and may
            contain ``'search_prefix'``.
        modified_since: Optional cutoff. When not ``None``, only objects
            whose ``LastModified`` is strictly greater than it are kept.

    Returns:
        A list of ``{'key': ..., 'last_modified': ...}`` dicts, sorted by
        ``last_modified`` in ascending order.
    """
    bucket = config['bucket']

    pattern = table_spec['pattern']
    key_regex = re.compile(pattern)

    logger.debug('Checking bucket "{}" for keys matching "{}"'.format(
        bucket, pattern))

    selected = []
    listing = list_files_in_bucket(
        config, bucket, table_spec.get('search_prefix'))

    for entry in listing:
        key = entry['Key']
        last_modified = entry['LastModified']

        logger.debug('Last modified: {}'.format(last_modified))

        # Skip keys that do not match the pattern, and keys that are not
        # newer than the cutoff when a cutoff was supplied.
        if not key_regex.search(key) or (
                modified_since is not None
                and not (modified_since < last_modified)):
            logger.debug('Will not download key "{}"'.format(key))
            continue

        logger.debug('Will download key "{}"'.format(key))
        selected.append({'key': key, 'last_modified': last_modified})

    # list.sort is stable, so entries with equal timestamps keep the order
    # in which they were listed.
    selected.sort(key=lambda candidate: candidate['last_modified'])

    return selected
예제 #3
0
def convert_row(row, schema):
    """Build a new dict with every value of ``row`` passed through ``convert``.

    Args:
        row: Mapping of field name to raw value.
        schema: Mapping whose ``'properties'`` entry holds one sub-mapping
            per field name in ``row``. Each sub-mapping may carry a
            ``'_conversion_type'``; ``'string'`` is used when it is absent.

    Returns:
        A dict with the same keys as ``row``. Each value is the first
        element of the pair returned by ``convert``; the second element is
        discarded.

    Raises:
        KeyError: If a field in ``row`` has no entry under
            ``schema['properties']``.
    """
    result = {}

    for field_name, raw_value in row.items():
        field_spec = schema['properties'][field_name]
        target_type = field_spec.get('_conversion_type', 'string')

        message = 'Converting {} value {} to {}'.format(
            field_name, raw_value, target_type)
        logger.debug(message)

        new_value, _ = convert(raw_value, target_type)
        result[field_name] = new_value

    return result