def _filter_resources(self, resource_iterator, wildcard_pattern):
  """Yields only the resources that match the wildcard_pattern.

  Args:
    resource_iterator (iterable): An iterable of resource_reference.Resource
      objects.
    wildcard_pattern (str): The wildcard_pattern to filter the resources.

  Yields:
    resource_reference.Resource objects matching the wildcard_pattern.
  """
  compiled_patterns = self._get_regex_patterns(wildcard_pattern)
  for resource in resource_iterator:
    if isinstance(resource, resource_reference.PrefixResource):
      # Prefixes come back from the API with a trailing delimiter; strip it
      # so patterns like gs://bucket/folder1 can match.
      candidate_name = storage_url.rstrip_one_delimiter(
          resource.storage_url.object_name)
    else:
      # Do not strip trailing delimiters for object resources, otherwise they
      # will be filtered out incorrectly.
      candidate_name = resource.storage_url.object_name

    if (self._url.generation and
        resource.storage_url.generation != self._url.generation):
      # Filter based on generation, if generation is present in the request.
      continue

    if any(pattern.match(candidate_name) for pattern in compiled_patterns):
      yield resource
def __init__(self, url, all_versions=False, error_on_missing_key=True,
             fields_scope=cloud_api.FieldsScope.NO_ACL,
             get_bucket_metadata=False):
  """Instantiates an iterator that matches the wildcard URL.

  Args:
    url (CloudUrl): CloudUrl that may contain wildcard that needs expansion.
    all_versions (bool): If true, the iterator yields all versions of objects
      matching the wildcard.  If false, yields just the live object version.
    error_on_missing_key (bool): If true, and the encryption key needed to
      decrypt an object is missing, the iterator raises an error for that
      object.
    fields_scope (cloud_api.FieldsScope): Determines amount of metadata
      returned by API.
    get_bucket_metadata (bool): If true, perform a bucket GET request when
      fetching bucket resources
  """
  super(CloudWildcardIterator, self).__init__()
  # Presumably normalizes redundant consecutive wildcards — helper not
  # visible in this chunk; TODO confirm.
  url = _compress_url_wildcards(url)
  self._url = url
  self._all_versions = all_versions
  self._error_on_missing_key = error_on_missing_key
  self._fields_scope = fields_scope
  self._get_bucket_metadata = get_bucket_metadata
  # API client is selected by URL scheme (e.g. gs, s3).
  self._client = api_factory.get_api(url.scheme)
  if url.url_string.endswith(url.delimiter):
    # Forces the API to return prefixes instead of their contents.
    # NOTE(review): this rebinds the local `url` AFTER self._url was assigned
    # above, so as shown here the stripped URL is never stored or read —
    # confirm against the full file whether later code uses `url`.
    url = storage_url.storage_url_from_string(
        storage_url.rstrip_one_delimiter(url.url_string))
def _get_destination_suffix_for_recursion(self, destination_container,
                                          source):
  """Returns the suffix required to complete the destination URL.

  Let's assume the following:
    User command => cp -r */base_dir gs://dest/existing_prefix
    source.resource.storage_url => a/base_dir/c/d.txt
    source.expanded_url => a/base_dir
    destination_container.storage_url => gs://dest/existing_prefix

  If the destination container exists, the entire directory gets copied:
  Result => gs://dest/existing_prefix/base_dir/c/d.txt

  On the other hand, if the destination container does not exist, the
  top-level dir does not get copied over.
  Result => gs://dest/existing_prefix/c/d.txt

  Args:
    destination_container (resource_reference.Resource): The destination
      container.
    source (NameExpansionResult): Represents the source resource and the
      expanded parent url in case of recursion.

  Returns:
    (str) The suffix to be appended to the destination container.
  """
  # Start by ignoring the entire expanded parent URL (minus one trailing
  # delimiter); the leaf name may be re-added below.
  source_prefix_to_ignore = storage_url.rstrip_one_delimiter(
      source.expanded_url.versionless_url_string,
      source.expanded_url.delimiter)
  if (not isinstance(destination_container,
                     resource_reference.UnknownResource) and
      destination_container.is_container()):
    # Destination container exists. This means we need to preserve the
    # top-level source directory.
    # Remove the leaf name so that it gets added to the destination.
    source_prefix_to_ignore = source_prefix_to_ignore.rpartition(
        source.expanded_url.delimiter)[0]
    if not source_prefix_to_ignore:
      # In case of Windows, the source URL might not contain any Windows
      # delimiter if it was a single directory (e.g file://dir) and
      # source_prefix_to_ignore will be empty. Set it to <scheme>://.
      # TODO(b/169093672) This will not be required if we get rid of file://
      source_prefix_to_ignore = source.expanded_url.scheme.value + '://'
  full_source_url = source.resource.storage_url.versionless_url_string
  # Everything after the ignored prefix becomes the destination suffix.
  suffix_for_destination = full_source_url.split(
      source_prefix_to_ignore)[1]

  # Windows uses \ as a delimiter. Force the suffix to use the same
  # delimiter used by the destination container.
  source_delimiter = source.resource.storage_url.delimiter
  destination_delimiter = destination_container.storage_url.delimiter
  if source_delimiter != destination_delimiter:
    return suffix_for_destination.replace(source_delimiter,
                                          destination_delimiter)
  return suffix_for_destination
def _filter_resources(self, resource_iterator, wildcard_pattern):
  """Filter out resources that do not match the wildcard_pattern.

  Args:
    resource_iterator (iterable): An iterable resource_reference.Resource
      objects.
    wildcard_pattern (str): The wildcard_pattern to filter the resources.

  Yields:
    resource_reference.Resource objects matching the wildcard_pattern
  """
  regex_string = fnmatch.translate(wildcard_pattern)
  regex_pattern = re.compile(regex_string)
  for resource in resource_iterator:
    if isinstance(resource, resource_reference.PrefixResource):
      # A prefix resource returned by the API will always end with a slash.
      # We strip the slash in the end to match cases like gs://bucket/folder1
      object_name = storage_url.rstrip_one_delimiter(
          resource.storage_url.object_name)
    else:
      # Do not strip trailing delimiters for object resources, otherwise
      # objects whose names end with the delimiter are filtered out
      # incorrectly (they would no longer match their own pattern).
      object_name = resource.storage_url.object_name
    if (self._url.generation and
        resource.storage_url.generation != self._url.generation):
      # Filter based on generation, if generation is present in the request.
      continue
    if regex_pattern.match(object_name):
      yield resource
def __init__(self, url, all_versions=False,
             fields_scope=cloud_api.FieldsScope.NO_ACL):
  """Instantiates an iterator that matches the wildcard URL.

  Args:
    url (CloudUrl): CloudUrl that may contain wildcard that needs expansion.
    all_versions (bool): If true, the iterator yields all versions of objects
      matching the wildcard.  If false, yields just the live object version.
    fields_scope (cloud_api.FieldsScope): Determines amount of metadata
      returned by API.
  """
  super(CloudWildcardIterator, self).__init__()
  # Presumably normalizes redundant consecutive wildcards — helper not
  # visible in this chunk; TODO confirm.
  url = _compress_url_wildcards(url)
  self._url = url
  self._all_versions = all_versions
  self._fields_scope = fields_scope
  # API client is selected by URL scheme (e.g. gs, s3).
  self._client = api_factory.get_api(url.scheme)
  if url.url_string.endswith(url.delimiter):
    # Forces the API to return prefixes instead of their contents.
    # NOTE(review): this rebinds the local `url` AFTER self._url was assigned
    # above, so as shown here the stripped URL is never stored or read —
    # confirm against the full file whether later code uses `url`.
    url = storage_url.storage_url_from_string(
        storage_url.rstrip_one_delimiter(url.url_string))
def _get_destination_suffix_for_recursion(self, destination_container,
                                          source):
  """Returns the suffix required to complete the destination URL.

  Let's assume the following:
    User command => cp -r */base_dir gs://dest/existing_prefix
    source.resource.storage_url => a/base_dir/c/d.txt
    source.expanded_url => a/base_dir
    destination_container.storage_url => gs://dest/existing_prefix

  If the destination container exists, the entire directory gets copied:
  Result => gs://dest/existing_prefix/base_dir/c/d.txt

  Args:
    destination_container (resource_reference.Resource): The destination
      container.
    source (NameExpansionResult): Represents the source resource and the
      expanded parent url in case of recursion.

  Returns:
    (str) The suffix to be appended to the destination container.
  """
  # Start by ignoring the entire expanded parent URL (minus one trailing
  # delimiter); the leaf name may be re-added below.
  source_prefix_to_ignore = storage_url.rstrip_one_delimiter(
      source.expanded_url.versionless_url_string,
      source.expanded_url.delimiter)

  expanded_url_is_valid_parent = _is_expanded_url_valid_parent_dir(
      source.expanded_url)
  if not expanded_url_is_valid_parent and self._has_multiple_top_level_sources:
    # To avoid top-level name conflicts, we need to copy the parent dir.
    # However, that cannot be done because the parent dir has an invalid name.
    raise errors.InvalidUrlError(
        'Presence of multiple top-level sources and invalid expanded URL'
        ' make file name conflicts possible for URL: {}'.format(
            source.resource))

  # With multiple top-level sources and an unknown destination, two sources
  # could expand to the same destination name unless the parent dir is kept.
  is_top_level_source_object_name_conflict_possible = (
      isinstance(destination_container, resource_reference.UnknownResource)
      and self._has_multiple_top_level_sources)

  destination_is_existing_dir = (
      not isinstance(destination_container,
                     resource_reference.UnknownResource)
      and destination_container.is_container())

  if is_top_level_source_object_name_conflict_possible or (
      expanded_url_is_valid_parent and destination_is_existing_dir):
    # Preserve the top-level source directory, and remove the leaf name
    # so that it gets added to the destination.
    source_prefix_to_ignore, _, _ = source_prefix_to_ignore.rpartition(
        source.expanded_url.delimiter)
    if not source_prefix_to_ignore:
      # In case of Windows, the source URL might not contain any Windows
      # delimiter if it was a single directory (e.g file://dir) and
      # source_prefix_to_ignore will be empty. Set it to <scheme>://.
      # TODO(b/169093672) This will not be required if we get rid of file://
      source_prefix_to_ignore = source.expanded_url.scheme.value + '://'

  full_source_url = source.resource.storage_url.versionless_url_string
  # Everything after the ignored prefix becomes the destination suffix.
  suffix_for_destination = full_source_url.split(source_prefix_to_ignore)[1]

  # Windows uses \ as a delimiter. Force the suffix to use the same
  # delimiter used by the destination container.
  source_delimiter = source.resource.storage_url.delimiter
  destination_delimiter = destination_container.storage_url.delimiter
  if source_delimiter != destination_delimiter:
    return suffix_for_destination.replace(source_delimiter,
                                          destination_delimiter)
  return suffix_for_destination
def _expand_object_path(self, bucket_name):
  """If wildcard, expand object names.

  Recursively expand each folder with wildcard.

  Args:
    bucket_name (str): Name of the bucket.

  Yields:
    resource_reference.Resource objects where each resource can be
    an ObjectResource object or a PrefixResource object.
  """
  # Retain original name to see if user wants only prefixes.
  original_object_name = self._url.object_name
  # Force API to return prefix resource not the prefix's contents.
  object_name = storage_url.rstrip_one_delimiter(original_object_name)

  # BFS over wildcard segments: each partially-expanded name is queued and
  # expanded one wildcard component at a time.
  names_needing_expansion = collections.deque([object_name])
  while names_needing_expansion:
    name = names_needing_expansion.popleft()

    # Parse out the prefix, delimiter, filter_pattern and suffix.
    # Given a string 'a/b*c/d/e*f/g.txt', this will return
    # CloudWildcardParts(prefix='a/b', filter_pattern='*c',
    #                    delimiter='/', suffix='d/e*f/g.txt')
    wildcard_parts = CloudWildcardParts.from_string(name, self._url.delimiter)
    # Fetch all the objects and prefixes.
    resource_iterator = self._client.list_objects(
        all_versions=self._all_versions or bool(self._url.generation),
        bucket_name=bucket_name,
        delimiter=wildcard_parts.delimiter,
        fields_scope=self._fields_scope,
        prefix=wildcard_parts.prefix or None)

    # We have all the objects and prefixes that matched the
    # wildcard_parts.prefix. Use the filter_pattern to eliminate non-matching
    # objects and prefixes.
    filtered_resources = self._filter_resources(
        resource_iterator,
        wildcard_parts.prefix + wildcard_parts.filter_pattern)

    for resource in filtered_resources:
      if wildcard_parts.suffix:
        if isinstance(resource, resource_reference.PrefixResource):
          # Suffix is present, which indicates that we have more wildcards to
          # expand. Let's say object_name is a/b1c. Then the new string that
          # we want to expand will be a/b1c/d/e*f/g.txt
          names_needing_expansion.append(
              resource.storage_url.object_name + wildcard_parts.suffix)
      else:
        # Make sure an object is not returned if the original query was for
        # a prefix.
        if (not resource.storage_url.object_name.endswith(
            self._url.delimiter) and original_object_name.endswith(
                self._url.delimiter)):
          continue
        yield resource