def _filter_resources(self, resource_iterator, wildcard_pattern):
        """Yield only the resources whose names match wildcard_pattern.

        Args:
          resource_iterator (iterable): An iterable of
            resource_reference.Resource objects.
          wildcard_pattern (str): The wildcard_pattern to filter the resources.

        Yields:
          resource_reference.Resource objects matching the wildcard_pattern.
        """
        patterns = self._get_regex_patterns(wildcard_pattern)
        requested_generation = self._url.generation
        for candidate in resource_iterator:
            name = candidate.storage_url.object_name
            if isinstance(candidate, resource_reference.PrefixResource):
                # Prefix resources end with a delimiter; strip it so patterns
                # like gs://bucket/folder1 still match. Object resources keep
                # their trailing delimiters, or they would be filtered out
                # incorrectly.
                name = storage_url.rstrip_one_delimiter(name)

            if (requested_generation and
                    candidate.storage_url.generation != requested_generation):
                # Honor the generation filter when the request specified one.
                continue
            if any(pattern.match(name) for pattern in patterns):
                yield candidate
    def __init__(self,
                 url,
                 all_versions=False,
                 error_on_missing_key=True,
                 fields_scope=cloud_api.FieldsScope.NO_ACL,
                 get_bucket_metadata=False):
        """Instantiates an iterator that matches the wildcard URL.

        Args:
          url (CloudUrl): CloudUrl that may contain wildcard that needs
            expansion.
          all_versions (bool): If true, the iterator yields all versions of
            objects matching the wildcard. If false, yields just the live
            object version.
          error_on_missing_key (bool): If true, and the encryption key needed
            to decrypt an object is missing, the iterator raises an error for
            that object.
          fields_scope (cloud_api.FieldsScope): Determines amount of metadata
            returned by API.
          get_bucket_metadata (bool): If true, perform a bucket GET request
            when fetching bucket resources.
        """
        super(CloudWildcardIterator, self).__init__()
        url = _compress_url_wildcards(url)
        if url.url_string.endswith(url.delimiter):
            # Strip one trailing delimiter so the API returns the prefix
            # resource itself instead of the prefix's contents.
            # BUG FIX: previously this ran after self._url was assigned, so
            # the stripped URL was bound to the local name only and silently
            # discarded.
            url = storage_url.storage_url_from_string(
                storage_url.rstrip_one_delimiter(url.url_string))
        self._url = url
        self._all_versions = all_versions
        self._error_on_missing_key = error_on_missing_key
        self._fields_scope = fields_scope
        self._get_bucket_metadata = get_bucket_metadata
        self._client = api_factory.get_api(url.scheme)
# --- Exemplo n.º 3 (scraped example-site separator; commented out because it is not valid Python) ---
    def _get_destination_suffix_for_recursion(self, destination_container,
                                              source):
        """Returns the suffix required to complete the destination URL.

        Let's assume the following:
          User command => cp -r */base_dir gs://dest/existing_prefix
          source.resource.storage_url => a/base_dir/c/d.txt
          source.expanded_url => a/base_dir
          destination_container.storage_url => gs://dest/existing_prefix

        If the destination container exists, the entire directory gets copied:
        Result => gs://dest/existing_prefix/base_dir/c/d.txt

        On the other hand, if the destination container does not exist, the
        top-level dir does not get copied over.
        Result => gs://dest/existing_prefix/c/d.txt

        Args:
          destination_container (resource_reference.Resource): The destination
            container.
          source (NameExpansionResult): Represents the source resource and the
            expanded parent url in case of recursion.

        Returns:
          (str) The suffix to be appended to the destination container.
        """
        expansion_delimiter = source.expanded_url.delimiter
        prefix_to_strip = storage_url.rstrip_one_delimiter(
            source.expanded_url.versionless_url_string, expansion_delimiter)

        destination_exists = not isinstance(
            destination_container, resource_reference.UnknownResource)
        if destination_exists and destination_container.is_container():
            # The destination container exists, so the top-level source
            # directory must be preserved: drop only the leaf name from the
            # prefix so that the leaf gets appended to the destination.
            prefix_to_strip, _, _ = prefix_to_strip.rpartition(
                expansion_delimiter)
            if not prefix_to_strip:
                # On Windows, the source URL might not contain any Windows
                # delimiter if it was a single directory (e.g file://dir) and
                # prefix_to_strip will be empty. Set it to <scheme>://.
                # TODO(b/169093672) This will not be required if we get rid of
                # file://
                prefix_to_strip = source.expanded_url.scheme.value + '://'

        source_url_string = source.resource.storage_url.versionless_url_string
        destination_suffix = source_url_string.split(prefix_to_strip)[1]

        # Windows uses \ as a delimiter. Force the suffix to use the same
        # delimiter used by the destination container.
        from_delimiter = source.resource.storage_url.delimiter
        to_delimiter = destination_container.storage_url.delimiter
        if from_delimiter == to_delimiter:
            return destination_suffix
        return destination_suffix.replace(from_delimiter, to_delimiter)
# --- Exemplo n.º 4 (scraped example-site separator; commented out because it is not valid Python) ---
    def _filter_resources(self, resource_iterator, wildcard_pattern):
        """Yield only the resources whose names match wildcard_pattern.

        Args:
          resource_iterator (iterable): An iterable of
            resource_reference.Resource objects.
          wildcard_pattern (str): The wildcard_pattern to filter the resources.

        Yields:
          resource_reference.Resource objects matching the wildcard_pattern.
        """
        pattern = re.compile(fnmatch.translate(wildcard_pattern))
        requested_generation = self._url.generation
        for candidate in resource_iterator:
            # A prefix resource returned by the API will always end with a
            # slash. Strip it so that patterns like gs://bucket/folder1 match.
            trimmed_name = storage_url.rstrip_one_delimiter(
                candidate.storage_url.object_name)
            if (requested_generation and
                    candidate.storage_url.generation != requested_generation):
                # Honor the generation filter when the request specified one.
                continue
            if pattern.match(trimmed_name):
                yield candidate
# --- Exemplo n.º 5 (scraped example-site separator; commented out because it is not valid Python) ---
    def __init__(self,
                 url,
                 all_versions=False,
                 fields_scope=cloud_api.FieldsScope.NO_ACL):
        """Instantiates an iterator that matches the wildcard URL.

        Args:
          url (CloudUrl): CloudUrl that may contain wildcard that needs
            expansion.
          all_versions (bool): If true, the iterator yields all versions of
            objects matching the wildcard. If false, yields just the live
            object version.
          fields_scope (cloud_api.FieldsScope): Determines amount of metadata
            returned by API.
        """
        super(CloudWildcardIterator, self).__init__()
        url = _compress_url_wildcards(url)
        if url.url_string.endswith(url.delimiter):
            # Strip one trailing delimiter so the API returns the prefix
            # resource itself instead of the prefix's contents.
            # BUG FIX: previously this ran after self._url was assigned, so
            # the stripped URL was bound to the local name only and silently
            # discarded.
            url = storage_url.storage_url_from_string(
                storage_url.rstrip_one_delimiter(url.url_string))
        self._url = url
        self._all_versions = all_versions
        self._fields_scope = fields_scope
        self._client = api_factory.get_api(url.scheme)
  def _get_destination_suffix_for_recursion(self, destination_container,
                                            source):
    """Returns the suffix required to complete the destination URL.

    Let's assume the following:
      User command => cp -r */base_dir gs://dest/existing_prefix
      source.resource.storage_url => a/base_dir/c/d.txt
      source.expanded_url => a/base_dir
      destination_container.storage_url => gs://dest/existing_prefix

    If the destination container exists, the entire directory gets copied:
    Result => gs://dest/existing_prefix/base_dir/c/d.txt

    Args:
      destination_container (resource_reference.Resource): The destination
        container.
      source (NameExpansionResult): Represents the source resource and the
        expanded parent url in case of recursion.

    Returns:
      (str) The suffix to be appended to the destination container.

    Raises:
      errors.InvalidUrlError: If multiple top-level sources are present and
        the expanded URL is not a valid parent directory, since top-level
        file name conflicts would then be possible.
    """
    # Everything in the source URL up to (and including) this prefix is
    # dropped; only the remainder is appended to the destination.
    source_prefix_to_ignore = storage_url.rstrip_one_delimiter(
        source.expanded_url.versionless_url_string,
        source.expanded_url.delimiter)

    expanded_url_is_valid_parent = _is_expanded_url_valid_parent_dir(
        source.expanded_url)
    if not expanded_url_is_valid_parent and self._has_multiple_top_level_sources:
      # To avoid top-level name conflicts, we need to copy the parent dir.
      # However, that cannot be done because the parent dir has an invalid name.
      raise errors.InvalidUrlError(
          'Presence of multiple top-level sources and invalid expanded URL'
          ' make file name conflicts possible for URL: {}'.format(
              source.resource))

    # Conflicts are possible when several top-level sources are copied into a
    # destination that does not yet exist.
    is_top_level_source_object_name_conflict_possible = (
        isinstance(destination_container, resource_reference.UnknownResource)
        and self._has_multiple_top_level_sources)
    destination_is_existing_dir = (not isinstance(
        destination_container, resource_reference.UnknownResource) and
                                   destination_container.is_container())
    if is_top_level_source_object_name_conflict_possible or (
        expanded_url_is_valid_parent and destination_is_existing_dir):
      # Preserve the top-level source directory, and remove the leaf name
      # so that it gets added to the destination.
      source_prefix_to_ignore, _, _ = source_prefix_to_ignore.rpartition(
          source.expanded_url.delimiter)
      if not source_prefix_to_ignore:
        # In case of Windows, the source URL might not contain any Windows
        # delimiter if it was a single directory (e.g file://dir) and
        # source_prefix_to_ignore will be empty. Set it to <scheme>://.
        # TODO(b/169093672) This will not be required if we get rid of file://
        source_prefix_to_ignore = source.expanded_url.scheme.value + '://'

    full_source_url = source.resource.storage_url.versionless_url_string
    suffix_for_destination = full_source_url.split(source_prefix_to_ignore)[1]

    # Windows uses \ as a delimiter. Force the suffix to use the same
    # delimiter used by the destination container.
    source_delimiter = source.resource.storage_url.delimiter
    destination_delimiter = destination_container.storage_url.delimiter
    if source_delimiter != destination_delimiter:
      return suffix_for_destination.replace(source_delimiter,
                                            destination_delimiter)
    return suffix_for_destination
# --- Exemplo n.º 7 (scraped example-site separator; commented out because it is not valid Python) ---
    def _expand_object_path(self, bucket_name):
        """If wildcard, expand object names.

        Recursively expand each folder with wildcard.

        Args:
          bucket_name (str): Name of the bucket.

        Yields:
          resource_reference.Resource objects where each resource can be
          an ObjectResource object or a PrefixResource object.
        """
        # Remember whether the user asked only for prefixes (trailing
        # delimiter on the original name).
        requested_object_name = self._url.object_name
        wants_prefix_only = requested_object_name.endswith(
            self._url.delimiter)

        # Strip the trailing delimiter so the API returns the prefix resource
        # itself rather than the prefix's contents.
        pending_names = collections.deque(
            [storage_url.rstrip_one_delimiter(requested_object_name)])

        while pending_names:
            current_name = pending_names.popleft()

            # Parse out the prefix, delimiter, filter_pattern and suffix.
            # Given a string 'a/b*c/d/e*f/g.txt', this will return
            # CloudWildcardParts(prefix='a/b', filter_pattern='*c',
            #                    delimiter='/', suffix='d/e*f/g.txt')
            parts = CloudWildcardParts.from_string(current_name,
                                                   self._url.delimiter)

            # Fetch all the objects and prefixes under parts.prefix.
            listed_resources = self._client.list_objects(
                all_versions=self._all_versions or bool(self._url.generation),
                bucket_name=bucket_name,
                delimiter=parts.delimiter,
                fields_scope=self._fields_scope,
                prefix=parts.prefix or None)

            # Use the filter_pattern to eliminate non-matching objects and
            # prefixes from the listing.
            matches = self._filter_resources(
                listed_resources, parts.prefix + parts.filter_pattern)

            for matched in matches:
                if parts.suffix:
                    # More wildcards remain, so prefixes must be expanded
                    # further. Say the matched name is a/b1c; the next string
                    # to expand becomes a/b1c/d/e*f/g.txt.
                    if isinstance(matched,
                                  resource_reference.PrefixResource):
                        pending_names.append(
                            matched.storage_url.object_name + parts.suffix)
                    continue
                if (wants_prefix_only and
                        not matched.storage_url.object_name.endswith(
                            self._url.delimiter)):
                    # Do not return an object when the original query was for
                    # a prefix.
                    continue
                yield matched