import logging
import re

import boto3

logger = logging.getLogger(__name__)


def list_files_in_bucket(config, bucket, search_prefix=None):
    s3_client = boto3.client('s3')

    s3_objects = []
    max_results = 1000
    args = {
        'Bucket': bucket,
        'MaxKeys': max_results,
    }

    if search_prefix is not None:
        args['Prefix'] = search_prefix

    result = s3_client.list_objects_v2(**args)
    # 'Contents' is omitted from the response when no keys match,
    # so fall back to an empty list instead of raising a KeyError.
    s3_objects += result.get('Contents', [])
    next_continuation_token = result.get('NextContinuationToken')

    # list_objects_v2 returns at most 1,000 keys per call; keep following
    # the continuation token until the listing is exhausted.
    while next_continuation_token is not None:
        logger.debug('Continuing pagination with token "{}".'.format(
            next_continuation_token))

        continuation_args = args.copy()
        continuation_args['ContinuationToken'] = next_continuation_token

        result = s3_client.list_objects_v2(**continuation_args)
        s3_objects += result.get('Contents', [])
        next_continuation_token = result.get('NextContinuationToken')

    logger.info("Found {} files.".format(len(s3_objects)))

    return s3_objects
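# A minimal usage sketch for the lister above. The bucket name and prefix
# are hypothetical, and boto3 is assumed to find AWS credentials in the
# environment; each returned entry is a list_objects_v2 'Contents' dict.
config = {'bucket': 'example-data-bucket'}

objects = list_files_in_bucket(config, config['bucket'],
                               search_prefix='exports/2021/')
for obj in objects:
    print(obj['Key'], obj['Size'], obj['LastModified'])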
def get_input_files_for_table(config, table_spec, modified_since=None):
    bucket = config['bucket']

    to_return = []

    pattern = table_spec['pattern']
    matcher = re.compile(pattern)

    logger.debug('Checking bucket "{}" for keys matching "{}"'.format(
        bucket, pattern))

    s3_objects = list_files_in_bucket(config, bucket,
                                      table_spec.get('search_prefix'))

    for s3_object in s3_objects:
        key = s3_object['Key']
        last_modified = s3_object['LastModified']
        logger.debug('Last modified: {}'.format(last_modified))

        # Keep keys that match the table's pattern and, when a cutoff is
        # given, were modified after it.
        if (matcher.search(key) and
                (modified_since is None or modified_since < last_modified)):
            logger.debug('Will download key "{}"'.format(key))
            to_return.append({'key': key, 'last_modified': last_modified})
        else:
            logger.debug('Will not download key "{}"'.format(key))

    # Return the matches sorted oldest-first by last-modified time.
    to_return = sorted(to_return, key=lambda item: item['last_modified'])

    return to_return
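# A sketch of the inputs this function expects: a config with a 'bucket' key
# and a table_spec with 'pattern' and optional 'search_prefix'. The values
# and the cutoff datetime below are hypothetical.
import datetime

config = {'bucket': 'example-data-bucket'}
table_spec = {
    'pattern': r'exports/orders-.*\.csv',
    'search_prefix': 'exports/',
}

# S3 LastModified values are timezone-aware, so the cutoff used for the
# modified_since comparison needs to be timezone-aware as well.
cutoff = datetime.datetime(2021, 1, 1, tzinfo=datetime.timezone.utc)

files = get_input_files_for_table(config, table_spec, modified_since=cutoff)
for f in files:
    print(f['key'], f['last_modified'])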
def convert_row(row, schema):
    to_return = {}

    for key, value in row.items():
        field_schema = schema['properties'][key]
        datatype = field_schema.get('_conversion_type', 'string')

        logger.debug('Converting {} value {} to {}'.format(
            key, value, datatype))

        # Coerce the raw value to the type recorded in the field's schema,
        # defaulting to 'string' when no conversion hint is present.
        converted, _ = convert(value, datatype)

        to_return[key] = converted

    return to_return
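# A hypothetical schema and row illustrating the shapes convert_row expects,
# assuming the convert() helper referenced above is available in this module
# and understands '_conversion_type' values such as 'integer' and 'string'.
schema = {
    'properties': {
        'id': {'type': ['null', 'integer'], '_conversion_type': 'integer'},
        'name': {'type': ['null', 'string']},  # no hint: treated as string
    },
}

row = {'id': '42', 'name': 'widget'}

typed_row = convert_row(row, schema)
# e.g. {'id': 42, 'name': 'widget'}, assuming convert() parses integers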