def test_copytree_to_from_dir_w_manifest(self):
    num_test_objs = 10
    test_obj_size = 100
    with NamedTemporaryDirectory(change_dir=True) as tmp_d:
        self.create_dataset(tmp_d, num_test_objs, test_obj_size)
        # Make a nested file and an empty directory for testing purposes
        tmp_d = Path(tmp_d)
        os.mkdir(tmp_d / 'my_dir')
        open(tmp_d / 'my_dir' / 'empty_file', 'w').close()
        os.mkdir(tmp_d / 'my_dir' / 'empty_dir')

        stor.copytree('.', self.test_dir, use_manifest=True)

    # Validate the contents of the manifest file
    manifest_contents = utils.get_data_manifest_contents(self.test_dir)
    expected_contents = self.get_dataset_obj_names(num_test_objs)
    expected_contents.extend(['my_dir/empty_file', 'my_dir/empty_dir/'])
    expected_contents = [Path('test') / c for c in expected_contents]
    self.assertEquals(set(manifest_contents), set(expected_contents))

    with NamedTemporaryDirectory(change_dir=True) as tmp_d:
        # Download the results successfully
        Path(self.test_dir).copytree('test', use_manifest=True)

        # Now delete one of the objects from s3. A second download
        # will fail with a condition error
        Path(self.test_dir / 'my_dir' / 'empty_dir/').remove()
        with self.assertRaises(exceptions.ConditionNotMetError):
            Path(self.test_dir).copytree('test', use_manifest=True,
                                         num_retries=0)
def test_upload_multiple_dirs(self):
    with NamedTemporaryDirectory(change_dir=True) as tmp_d:
        num_test_objs = 10
        tmp_d = Path(tmp_d)

        # Create files filled with random data.
        path1 = tmp_d / 'dir1'
        os.mkdir(path1)
        self.create_dataset(path1, num_test_objs, 10)

        # Create empty dir and file.
        path2 = tmp_d / 'dir2'
        os.mkdir(path2)
        os.mkdir(path2 / 'my_dir')
        open(path2 / 'my_dir' / 'included_file', 'w').close()
        open(path2 / 'my_dir' / 'excluded_file', 'w').close()
        os.mkdir(path2 / 'my_dir' / 'included_dir')
        os.mkdir(path2 / 'my_dir' / 'excluded_dir')

        # Create file in the top level directory.
        open(tmp_d / 'top_level_file', 'w').close()

        to_upload = [
            'dir1',
            'dir2/my_dir/included_file',
            'dir2/my_dir/included_dir',
            'top_level_file',
        ]
        with tmp_d:
            swift_path = self.test_dir / 'subdir'
            swift_path.upload(to_upload, use_manifest=True)

        # Validate the contents of the manifest file
        manifest_contents = utils.get_data_manifest_contents(swift_path)
        expected_contents = [
            Path('dir1') / name
            for name in self.get_dataset_obj_names(num_test_objs)
        ]
        expected_contents.extend([
            'dir2/my_dir/included_file',
            'dir2/my_dir/included_dir',
            'top_level_file',
        ])
        expected_contents = [Path('test/subdir') / c for c in expected_contents]
        self.assertEquals(set(manifest_contents), set(expected_contents))
def download(self, dest, condition=None, use_manifest=False, **kwargs):
    """Downloads a directory from S3 to a destination directory.

    Args:
        dest (str): The destination path to download files to. If
            downloading to a directory, there must be a trailing slash.
            The directory will be created if it doesn't exist.
        condition (function(results) -> bool): The method will only return
            when the results of the download match the condition.
        use_manifest (bool): Download the data manifest and use it to
            validate the contents of the downloaded objects.

    Returns:
        dict: The download results, with ``completed`` and ``failed`` lists
            of per-object result dicts.

    Notes:
        - The destination directory will be created automatically if it
          doesn't exist.
        - This method downloads to paths relative to the current directory.
    """
    utils.validate_condition(condition)

    if use_manifest:
        # Fold the manifest check into any user-supplied condition so both
        # are validated against the download results.
        object_names = utils.get_data_manifest_contents(self)
        manifest_cond = partial(utils.validate_manifest_list, object_names)
        condition = (utils.join_conditions(condition, manifest_cond)
                     if condition else manifest_cond)

    source = utils.with_trailing_slash(self)
    files_to_download = [
        {'source': file, 'dest': dest}
        for file in source.list()
    ]

    options = settings.get()['s3:download']
    segment_size = utils.str_to_bytes(options.get('segment_size'))
    transfer_config = {
        'multipart_threshold': segment_size,
        'max_concurrency': options.get('segment_threads'),
        'multipart_chunksize': segment_size
    }
    download_w_config = partial(self._download_object_worker,
                                config=transfer_config)

    # Download objects concurrently on a thread pool, tracking successes
    # and failures separately.
    downloaded = {'completed': [], 'failed': []}
    with S3DownloadLogger(len(files_to_download)) as dl:
        pool = ThreadPool(options['object_threads'])
        try:
            result_iter = pool.imap_unordered(download_w_config,
                                              files_to_download)
            while True:
                try:
                    result = result_iter.next(0xFFFF)
                    if result['success']:
                        dl.add_result(result)
                        downloaded['completed'].append(result)
                    else:
                        downloaded['failed'].append(result)
                except StopIteration:
                    break
            pool.close()
        except BaseException:
            pool.terminate()
            raise
        finally:
            pool.join()

    if downloaded['failed']:
        raise exceptions.FailedDownloadError(
            'an error occurred while downloading', downloaded)

    utils.check_condition(condition,
                          [r['source'] for r in downloaded['completed']])
    return downloaded
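# A minimal usage sketch for download() (not part of the library): the bucket
# and prefix below are hypothetical, and the snippet assumes the package-level
# stor Path factory for building S3 paths.
#
#     import stor
#
#     dataset = stor.Path('s3://my-bucket/dataset/')
#     # Plain download into ./dataset/ (created if it doesn't exist)
#     dataset.download('dataset/')
#     # Manifest-validated download; raises ConditionNotMetError if the
#     # downloaded objects don't cover the manifest contents
#     dataset.download('dataset/', use_manifest=True)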
def list(self,
         starts_with=None,
         limit=None,
         condition=None,
         use_manifest=False,
         # hidden args
         list_as_dir=False,
         ignore_dir_markers=False,
         **kwargs):
    """
    List contents using the resource of the path as a prefix.

    Args:
        starts_with (str): Allows for an additional search path to be
            appended to the current s3 path. The current path will be
            treated as a directory.
        limit (int): Limit the amount of results returned.
        condition (function(results) -> bool): The method will only return
            when the results match the condition.
        use_manifest (bool): Perform the list and use the data manifest
            file to validate the list.

    Returns:
        List[S3Path]: Every path in the listing.

    Raises:
        RemoteError: An s3 client error occurred.
        ConditionNotMetError: Results were returned, but they did not meet
            the condition.
    """
    bucket = self.bucket
    prefix = self.resource
    utils.validate_condition(condition)

    if use_manifest:
        # Fold the manifest check into any user-supplied condition so both
        # are validated against the listing results.
        object_names = utils.get_data_manifest_contents(self)
        manifest_cond = partial(utils.validate_manifest_list, object_names)
        condition = (utils.join_conditions(condition, manifest_cond)
                     if condition else manifest_cond)

    if starts_with:
        prefix = prefix / starts_with if prefix else starts_with
    else:
        prefix = prefix or ''
    list_kwargs = {
        'Bucket': bucket,
        'Prefix': prefix,
        'PaginationConfig': {}
    }
    if limit:
        list_kwargs['PaginationConfig']['MaxItems'] = limit
    if list_as_dir:
        # Ensure the prefix has a trailing slash if there is a prefix
        list_kwargs['Prefix'] = utils.with_trailing_slash(prefix) if prefix else ''
        list_kwargs['Delimiter'] = '/'

    path_prefix = S3Path('%s%s' % (self.drive, bucket))

    # Page through list_objects_v2 results, optionally collapsing
    # "subdirectories" into CommonPrefixes when listing as a directory.
    results = self._get_s3_iterator('list_objects_v2', **list_kwargs)
    list_results = []
    try:
        for page in results:
            if 'Contents' in page:
                list_results.extend([
                    path_prefix / result['Key']
                    for result in page['Contents']
                    if not ignore_dir_markers or
                    (ignore_dir_markers and not utils.has_trailing_slash(result['Key']))
                ])
            if list_as_dir and 'CommonPrefixes' in page:
                list_results.extend([
                    path_prefix / result['Prefix']
                    for result in page['CommonPrefixes']
                ])
    except botocore_exceptions.ClientError as e:
        raise _parse_s3_error(e) from e

    utils.check_condition(condition, list_results)
    return list_results
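# A minimal usage sketch for list() (not part of the library): the bucket and
# prefixes below are hypothetical, and the snippet assumes the package-level
# stor Path factory for building S3 paths.
#
#     import stor
#
#     logs = stor.Path('s3://my-bucket/logs')
#     everything = logs.list()                           # all objects under the prefix
#     recent = logs.list(starts_with='2024', limit=100)  # nested prefix, first 100 results
#     # Raise ConditionNotMetError unless at least one object comes back
#     non_empty = logs.list(condition=lambda results: len(results) > 0)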