def glob(self, pattern, condition=None, canonicalize=False):
    """Glob for pattern relative to this directory.

    Args:
        pattern (str): The glob pattern to match files against.
        condition (function(results) -> bool): The method will only
            return when the results match the condition.
        canonicalize (bool, default False): if True, return canonical
            paths.

    Returns:
        List[DXPath]: The files matching the pattern.
    """
    # Validate the condition first so a malformed condition fails fast,
    # before the (potentially expensive) remote file walk.
    utils.validate_condition(condition)
    results = list(
        self.walkfiles(canonicalize=canonicalize, pattern=pattern))
    if not results or not results[0]:  # when results == [[]]
        results = []
    utils.check_condition(condition, results)
    return results
def list(self, canonicalize=False, starts_with=None, limit=None,
         classname=None, condition=None):
    """List contents using the resource of the path as a prefix.

    This will only list the file resources (and not empty directories
    like other OBS).

    .. warning::

        Prefer `list_iter()` to this method in production code. If there
        are many files (i.e., more than 1-2K) to list, this method may
        take a long time to return and use a lot of memory to construct
        all of the objects.

    Examples:

        >>> Path('dx://MyProject:/my/path/').list(canonicalize=False)
        [Path('dx://MyProject:/my/path/to/file.txt'), ...]
        >>> Path('dx://MyProject:/my/path/').list(canonicalize=True)
        [Path('dx://project-123:file-123'), ...]

    Args:
        canonicalize (bool, default False): if True, return canonical paths
        starts_with (str): Allows for an additional search path to be
            appended to the resource of the dx path. Note that this
            resource path is treated as a directory
        limit (int): Limit the amount of results returned
        classname (str): Restricting class : One of 'record', 'file',
            'gtable', 'applet', 'workflow'
        condition (function(results) -> bool): The method will only
            return when the results matches the condition.

    Returns:
        List[DXPath]: Iterates over listed files that match an optional
            pattern.
    """
    # Validate the condition first so a malformed condition fails fast,
    # before the (potentially expensive) remote file walk.
    utils.validate_condition(condition)
    results = list(
        self.walkfiles(canonicalize=canonicalize,
                       starts_with=starts_with,
                       limit=limit,
                       classname=classname))
    if not results or not results[0]:  # when results == [[]]
        results = []
    utils.check_condition(condition, results)
    return results
def upload(self, source, condition=None, use_manifest=False, headers=None,
           **kwargs):
    """Uploads a list of files and directories to s3.

    Note that the S3Path is treated as a directory.

    Note that for user-provided OBSUploadObjects, an empty directory's
    destination must have a trailing slash.

    Args:
        source (List[str|OBSUploadObject]): A list of source files,
            directories, and OBSUploadObjects to upload to S3.
        condition (function(results) -> bool): The method will only return
            when the results of upload matches the condition.
        use_manifest (bool): Generate a data manifest and validate the
            upload results are in the manifest. Only a single source
            directory may be uploaded when this is set.
        headers (dict): A dictionary of object headers to apply to the
            object. Headers will not be applied to OBSUploadObjects and
            any headers specified by an OBSUploadObject will override
            these headers. Headers should be specified as key-value
            pairs, e.g. {'ContentLanguage': 'en'}

    Returns:
        dict: Results keyed by 'completed' and 'failed'; each value is a
            list of per-object result dicts from the upload workers.

    Raises:
        ValueError: ``use_manifest`` was set with more than one source or
            a non-directory source.
        FailedUploadError: One or more objects failed to upload.
        ConditionNotMetError: ``condition`` was not met by the uploaded
            destinations.

    Notes:
        - This method uploads to paths relative to the current directory.
    """
    # A manifest describes the contents of exactly one local directory,
    # so reject any other source shape up front.
    if use_manifest and not (len(source) == 1 and os.path.isdir(source[0])):
        raise ValueError(
            'can only upload one directory with use_manifest=True')
    utils.validate_condition(condition)

    # Plain path strings get expanded into individual files/dirs; caller
    # supplied OBSUploadObjects pass through untouched.
    files_to_convert = utils.walk_files_and_dirs(
        [name for name in source if not isinstance(name, OBSUploadObject)])
    files_to_upload = [
        obj for obj in source if isinstance(obj, OBSUploadObject)
    ]

    # The manifest file itself is uploaded separately below, so it is
    # excluded from the converted upload list.
    manifest_file_name = (Path(source[0]) / utils.DATA_MANIFEST_FILE_NAME
                          if use_manifest else None)
    resource_base = self.resource or Path('')
    # Directories keep a trailing slash in their object name so empty
    # directories survive the round trip.
    files_to_upload.extend([
        OBSUploadObject(
            name,
            resource_base / (utils.with_trailing_slash(
                utils.file_name_to_object_name(name))
                if Path(name).isdir()
                else utils.file_name_to_object_name(name)),
            options={'headers': headers} if headers else None)
        for name in files_to_convert if name != manifest_file_name
    ])

    if use_manifest:
        # Generate the data manifest and save it remotely
        object_names = [o.object_name for o in files_to_upload]
        utils.generate_and_save_data_manifest(source[0], object_names)
        manifest_obj_name = resource_base / utils.file_name_to_object_name(
            manifest_file_name)
        manifest_obj = OBSUploadObject(
            str(manifest_file_name),
            manifest_obj_name,
            options={'headers': headers} if headers else None)
        self._upload_object(manifest_obj)

        # Make a condition for validating the upload
        manifest_cond = partial(utils.validate_manifest_list, object_names)
        condition = (utils.join_conditions(condition, manifest_cond)
                     if condition else manifest_cond)

    options = settings.get()['s3:upload']
    segment_size = utils.str_to_bytes(options.get('segment_size'))
    # Same segment size drives both the multipart cutoff and chunk size.
    transfer_config = {
        'multipart_threshold': segment_size,
        'max_concurrency': options.get('segment_threads'),
        'multipart_chunksize': segment_size
    }
    upload_w_config = partial(self._upload_object, config=transfer_config)

    uploaded = {'completed': [], 'failed': []}
    with S3UploadLogger(len(files_to_upload)) as ul:
        pool = ThreadPool(options['object_threads'])
        try:
            result_iter = pool.imap_unordered(upload_w_config,
                                              files_to_upload)
            while True:
                try:
                    # next() with a large timeout (instead of plain
                    # iteration) keeps the main thread interruptible
                    # (e.g. by Ctrl-C) while workers run.
                    result = result_iter.next(0xFFFF)
                    if result['success']:
                        ul.add_result(result)
                        uploaded['completed'].append(result)
                    else:
                        uploaded['failed'].append(result)
                except StopIteration:
                    break
            pool.close()
        except BaseException:
            # Includes KeyboardInterrupt: kill outstanding workers
            # before re-raising.
            pool.terminate()
            raise
        finally:
            pool.join()

    if uploaded['failed']:
        raise exceptions.FailedUploadError(
            'an error occurred while uploading', uploaded)

    utils.check_condition(condition,
                          [r['dest'] for r in uploaded['completed']])
    return uploaded
def download(self, dest, condition=None, use_manifest=False, **kwargs):
    """Downloads a directory from S3 to a destination directory.

    Args:
        dest (str): The destination path to download file to. If
            downloading to a directory, there must be a trailing slash.
            The directory will be created if it doesn't exist.
        condition (function(results) -> bool): The method will only return
            when the results of download matches the condition.
        use_manifest (bool): Validate the downloaded results against the
            remote data manifest file.

    Returns:
        dict: Results keyed by 'completed' and 'failed'; each value is a
            list of per-object result dicts from the download workers.

    Raises:
        FailedDownloadError: One or more objects failed to download.
        ConditionNotMetError: ``condition`` was not met by the downloaded
            sources.

    Notes:
        - The destination directory will be created automatically if
          it doesn't exist.
        - This method downloads to paths relative to the current
          directory.
    """
    utils.validate_condition(condition)

    if use_manifest:
        # The manifest enumerates every object that must be downloaded;
        # fold that check into the user-supplied condition.
        object_names = utils.get_data_manifest_contents(self)
        manifest_cond = partial(utils.validate_manifest_list, object_names)
        condition = (utils.join_conditions(condition, manifest_cond)
                     if condition else manifest_cond)

    # Treat this path as a directory prefix and download everything
    # underneath it.
    source = utils.with_trailing_slash(self)
    files_to_download = [{
        'source': file,
        'dest': dest
    } for file in source.list()]

    options = settings.get()['s3:download']
    segment_size = utils.str_to_bytes(options.get('segment_size'))
    # Same segment size drives both the multipart cutoff and chunk size.
    transfer_config = {
        'multipart_threshold': segment_size,
        'max_concurrency': options.get('segment_threads'),
        'multipart_chunksize': segment_size
    }
    download_w_config = partial(self._download_object_worker,
                                config=transfer_config)

    downloaded = {'completed': [], 'failed': []}
    with S3DownloadLogger(len(files_to_download)) as dl:
        pool = ThreadPool(options['object_threads'])
        try:
            result_iter = pool.imap_unordered(download_w_config,
                                              files_to_download)
            while True:
                try:
                    # next() with a large timeout (instead of plain
                    # iteration) keeps the main thread interruptible
                    # (e.g. by Ctrl-C) while workers run.
                    result = result_iter.next(0xFFFF)
                    if result['success']:
                        dl.add_result(result)
                        downloaded['completed'].append(result)
                    else:
                        downloaded['failed'].append(result)
                except StopIteration:
                    break
            pool.close()
        except BaseException:
            # Includes KeyboardInterrupt: kill outstanding workers
            # before re-raising.
            pool.terminate()
            raise
        finally:
            pool.join()

    if downloaded['failed']:
        raise exceptions.FailedDownloadError(
            'an error occurred while downloading', downloaded)

    utils.check_condition(condition,
                          [r['source'] for r in downloaded['completed']])
    return downloaded
def list(self,
         starts_with=None,
         limit=None,
         condition=None,
         use_manifest=False,
         # hidden args
         list_as_dir=False,
         ignore_dir_markers=False,
         **kwargs):
    """List contents using the resource of the path as a prefix.

    Args:
        starts_with (str): Allows for an additional search path to
            be appended to the current swift path. The current path
            will be treated as a directory.
        limit (int): Limit the amount of results returned.
        condition (function(results) -> bool): The method will only
            return when the results matches the condition.
        use_manifest (bool): Perform the list and use the data manifest
            file to validate the list.
        list_as_dir (bool): Hidden arg; list only the immediate
            "directory" level by using a '/' delimiter.
        ignore_dir_markers (bool): Hidden arg; drop keys with a trailing
            slash (empty-directory markers) from the results.

    Returns:
        List[S3Path]: Every path in the listing

    Raises:
        RemoteError: An s3 client error occurred.
        ConditionNotMetError: Results were returned, but they did not
            meet the condition.
    """
    bucket = self.bucket
    prefix = self.resource
    utils.validate_condition(condition)

    if use_manifest:
        # The manifest enumerates every object that must appear in the
        # listing; fold that check into the user-supplied condition.
        object_names = utils.get_data_manifest_contents(self)
        manifest_cond = partial(utils.validate_manifest_list, object_names)
        condition = (utils.join_conditions(condition, manifest_cond)
                     if condition else manifest_cond)

    if starts_with:
        prefix = prefix / starts_with if prefix else starts_with
    else:
        prefix = prefix or ''

    list_kwargs = {
        'Bucket': bucket,
        'Prefix': prefix,
        'PaginationConfig': {}
    }

    if limit:
        list_kwargs['PaginationConfig']['MaxItems'] = limit

    if list_as_dir:
        # Ensure the prefix has a trailing slash if there is a prefix
        list_kwargs['Prefix'] = utils.with_trailing_slash(
            prefix) if prefix else ''
        list_kwargs['Delimiter'] = '/'

    path_prefix = S3Path('%s%s' % (self.drive, bucket))

    results = self._get_s3_iterator('list_objects_v2', **list_kwargs)
    list_results = []
    try:
        for page in results:
            if 'Contents' in page:
                # Skip trailing-slash "directory marker" keys when
                # ignore_dir_markers is set. (Simplified from the
                # redundant `not A or (A and not B)` form.)
                list_results.extend([
                    path_prefix / result['Key']
                    for result in page['Contents']
                    if not ignore_dir_markers or
                    not utils.has_trailing_slash(result['Key'])
                ])
            if list_as_dir and 'CommonPrefixes' in page:
                list_results.extend([
                    path_prefix / result['Prefix']
                    for result in page['CommonPrefixes']
                ])
    except botocore_exceptions.ClientError as e:
        raise _parse_s3_error(e) from e

    utils.check_condition(condition, list_results)
    return list_results
def test_invalid_condition_args(self):
    """A condition taking no arguments is rejected by validate_condition."""
    # assertRaisesRegexp is a deprecated alias removed in Python 3.12;
    # use the assertRaisesRegex spelling.
    with self.assertRaisesRegex(ValueError, 'exactly one argument'):
        utils.validate_condition(lambda: True)  # pragma: no cover
def test_invalid_condition_type(self):
    """A non-callable condition is rejected by validate_condition."""
    # assertRaisesRegexp is a deprecated alias removed in Python 3.12;
    # use the assertRaisesRegex spelling.
    with self.assertRaisesRegex(ValueError, 'must be callable'):
        utils.validate_condition('bad_cond')