    def testPluralityCheckableIteratorWith1Elem1Exception(self):
        """Tests PluralityCheckableIterator with 2 elements.

        The second element raises an exception.
        """
        class IterTest(object):
            def __init__(self):
                self.position = 0

            def __iter__(self):
                return self

            def next(self):
                if self.position == 0:
                    self.position += 1
                    return 1
                elif self.position == 1:
                    self.position += 1
                    raise CustomTestException('Test exception')
                else:
                    raise StopIteration()

        pcit = PluralityCheckableIterator(IterTest())
        self.assertFalse(pcit.IsEmpty())
        self.assertTrue(pcit.HasPlurality())
        iterated_value = None
        try:
            for value in pcit:
                iterated_value = value
            self.fail('Expected exception from iterator')
        except CustomTestException:
            pass
        self.assertEqual(iterated_value, 1)
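The test above relies on the iterator buffering elements, and any exceptions encountered while reading ahead, until iteration actually reaches them. Below is a minimal standalone sketch of that buffering idea; it is illustrative rather than gsutil's actual implementation, though orig_iterator mirrors the attribute inspected by the read-ahead test later in this collection.

class PluralityCheckableIteratorSketch(object):
    """Illustrative only: buffers up to two head elements (or exceptions)."""

    def __init__(self, it):
        self.orig_iterator = iter(it)
        self.head = []  # buffered (is_exception, value) pairs

    def _populate(self, num_elements):
        while len(self.head) < num_elements:
            try:
                self.head.append((False, next(self.orig_iterator)))
            except StopIteration:
                break
            except Exception as e:  # buffer now, re-raise during iteration
                self.head.append((True, e))

    def IsEmpty(self):
        self._populate(1)
        return not self.head

    def HasPlurality(self):
        self._populate(2)
        return len(self.head) > 1

    def __iter__(self):
        return self

    def __next__(self):
        if self.head:
            is_exception, value = self.head.pop(0)
            if is_exception:
                raise value
            return value
        return next(self.orig_iterator)

    next = __next__  # Python 2-style alias, matching the snippets here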
Example #2
    def _GetIam(self, thread_state=None):
        """Gets IAM policy for single bucket or object."""

        pattern = self.args[0]

        matches = PluralityCheckableIterator(
            self.WildcardIterator(pattern).IterAll(
                bucket_listing_fields=['name']))
        if matches.IsEmpty():
            raise CommandException('%s matched no URLs' % pattern)
        if matches.HasPlurality():
            raise CommandException(
                '%s matched more than one URL, which is not allowed by the %s '
                'command' % (pattern, self.command_name))

        storage_url = StorageUrlFromString(list(matches)[0].url_string)
        policy = self.GetIamHelper(storage_url, thread_state=thread_state)
        policy_json = json.loads(protojson.encode_message(policy))
        policy_str = json.dumps(
            policy_json,
            sort_keys=True,
            separators=(',', ': '),
            indent=2,
        )
        print(policy_str)
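The IsEmpty()/HasPlurality() pair above is the standard way to enforce exactly one match without consuming the whole listing up front. A hedged distillation of that guard follows; the helper name is hypothetical, and CommandException is the same class the snippet above raises.

def _require_single_match(matches, pattern, command_name):
    # Hypothetical helper distilling the guard used by _GetIam above.
    if matches.IsEmpty():
        raise CommandException('%s matched no URLs' % pattern)
    if matches.HasPlurality():
        raise CommandException(
            '%s matched more than one URL, which is not allowed by the %s '
            'command' % (pattern, command_name))
    return list(matches)[0]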
Example #3
def NameExpansionIterator(command_name,
                          debug,
                          logger,
                          gsutil_api,
                          url_strs,
                          recursion_requested,
                          all_versions=False,
                          cmd_supports_recursion=True,
                          project_id=None,
                          continue_on_error=False):
    """Static factory function for instantiating _NameExpansionIterator.

  This wraps the resulting iterator in a PluralityCheckableIterator and checks
  that it is non-empty. It also allows url_strs to be either an array or an
  iterator.

  Args:
    command_name: name of command being run.
    debug: Debug level to pass to underlying iterators (range 0..3).
    logger: logging.Logger object.
    gsutil_api: Cloud storage interface.  Settable for testing/mocking.
    url_strs: Iterable URL strings needing expansion.
    recursion_requested: True if -r specified on command-line.  If so,
        listings will be flattened so mapped-to results contain objects
        spanning subdirectories.
    all_versions: Bool indicating whether to iterate over all object versions.
    cmd_supports_recursion: Bool indicating whether this command supports a '-r'
        flag. Useful for printing helpful error messages.
    project_id: Project id to use for the current command.
    continue_on_error: If true, yield no-match exceptions encountered during
                       iteration instead of raising them.

  Raises:
    CommandException if underlying iterator is empty.

  Returns:
    Name expansion iterator instance.

  For example semantics, see comments in NameExpansionIterator.__init__.
  """
    url_strs = PluralityCheckableIterator(url_strs)
    name_expansion_iterator = _NameExpansionIterator(
        command_name,
        debug,
        logger,
        gsutil_api,
        url_strs,
        recursion_requested,
        all_versions=all_versions,
        cmd_supports_recursion=cmd_supports_recursion,
        project_id=project_id,
        continue_on_error=continue_on_error)
    name_expansion_iterator = PluralityCheckableIterator(
        name_expansion_iterator)
    if name_expansion_iterator.IsEmpty():
        raise CommandException('No URLs matched')
    return name_expansion_iterator
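A quick illustration of the wrap-and-check behavior this factory provides, assuming only the PluralityCheckableIterator semantics exercised by the tests in this collection:

empty = PluralityCheckableIterator(iter([]))
assert empty.IsEmpty()          # detected without raising or iterating

single = PluralityCheckableIterator(iter(['gs://bucket/obj']))
assert not single.IsEmpty()     # peeks one element ahead
assert list(single) == ['gs://bucket/obj']  # peeked element is still yielded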
    def testPluralityCheckableIteratorWith3Elems(self):
        """Tests PluralityCheckableIterator with 3 elements."""
        input_list = list(range(3))
        it = iter(input_list)
        pcit = PluralityCheckableIterator(it)
        self.assertFalse(pcit.IsEmpty())
        self.assertTrue(pcit.HasPlurality())
        output_list = list(pcit)
        self.assertEqual(input_list, output_list)

    def testPluralityCheckableIteratorWith0Elems(self):
        """Tests empty PluralityCheckableIterator."""
        input_list = list(range(0))
        it = iter(input_list)
        pcit = PluralityCheckableIterator(it)
        self.assertTrue(pcit.IsEmpty())
        self.assertFalse(pcit.HasPlurality())
        output_list = list(pcit)
        self.assertEqual(input_list, output_list)
Example #6
    def _GetIam(self, pattern, thread_state=None):
        """Gets IAM policy for single bucket or object."""

        matches = PluralityCheckableIterator(
            self.WildcardIterator(pattern).IterAll(
                bucket_listing_fields=['name']))
        if matches.IsEmpty():
            raise CommandException('%s matched no URLs' % pattern)
        if matches.HasPlurality():
            raise CommandException(
                '%s matched more than one URL, which is not allowed by the %s '
                'command' % (pattern, self.command_name))

        storage_url = StorageUrlFromString(list(matches)[0].url_string)
        return self.GetIamHelper(storage_url, thread_state=thread_state)
Example #7
 def __iter__(self):
   for blr in self.blr_iter:
     if blr.IsPrefix():
       # This is a bucket subdirectory, list objects according to the wildcard.
       prefix_url = StorageUrlFromString(blr.url_string).CreatePrefixUrl(
           wildcard_suffix=self.subdir_exp_wildcard)
       implicit_subdir_iterator = PluralityCheckableIterator(
           self.name_exp_instance.WildcardIterator(prefix_url).IterAll(
               bucket_listing_fields=self.bucket_listing_fields))
       if not implicit_subdir_iterator.IsEmpty():
         for exp_blr in implicit_subdir_iterator:
           yield (True, exp_blr)
       else:
         # Prefix that contains no objects, for example in the $folder$ case
         # or an empty filesystem directory.
         yield (False, blr)
     elif blr.IsObject():
       yield (False, blr)
     else:
       raise CommandException(
           '_ImplicitBucketSubdirIterator got a bucket reference %s' % blr)
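A worked illustration of the prefix branch above, under an assumed bucket layout:

# Suppose blr is the prefix gs://bucket/dir/ and subdir_exp_wildcard is '**'.
# prefix_url becomes gs://bucket/dir/**, and the wildcard listing drives the
# result:
#   - if gs://bucket/dir/a.txt and gs://bucket/dir/sub/b.txt exist, this
#     yields (True, <ref for a.txt>), (True, <ref for sub/b.txt>)
#   - if nothing matches (e.g. a $folder$ placeholder or an empty directory),
#     this yields (False, <ref for gs://bucket/dir/>) so the prefix survives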
Example #8
    def testPluralityCheckableIteratorReadsAheadAsNeeded(self):
        """Tests that the PCI does not unnecessarily read new elements."""
        class IterTest(object):
            def __init__(self):
                self.position = 0

            def __iter__(self):
                return self

            def next(self):
                if self.position == 3:
                    raise StopIteration()
                self.position += 1

        # IsEmpty and PeekException should retrieve only 1 element from the
        # underlying iterator.
        pcit = PluralityCheckableIterator(IterTest())
        pcit.IsEmpty()
        pcit.PeekException()
        self.assertEqual(pcit.orig_iterator.position, 1)
        # HasPlurality requires populating 2 elements into the iterator.
        pcit.HasPlurality()
        self.assertEqual(pcit.orig_iterator.position, 2)
        # next should yield already-populated elements without advancing the
        # iterator.
        pcit.next()  # Yields element 1
        self.assertEqual(pcit.orig_iterator.position, 2)
        pcit.next()  # Yields element 2
        self.assertEqual(pcit.orig_iterator.position, 2)
        pcit.next()  # Yields element 3
        self.assertEqual(pcit.orig_iterator.position, 3)
        try:
            pcit.next()  # Underlying iterator is empty
            self.fail('Expected StopIteration')
        except StopIteration:
            pass
Example #9
    def __iter__(self):
        """Iterates over all source URLs passed to the iterator.

    For each src url, expands wildcards, object-less bucket names,
    subdir bucket names, and directory names, and generates a flat listing of
    all the matching objects/files.

    You should instantiate this object using the static factory function
    NameExpansionIterator, because consumers of this iterator need the
    PluralityCheckableIterator wrapper built by that function.

    Yields:
      gslib.name_expansion.NameExpansionResult.

    Raises:
      CommandException: if errors encountered.
    """
        for url_str in self.url_strs:
            storage_url = StorageUrlFromString(url_str)

            if storage_url.IsFileUrl() and storage_url.IsStream():
                if self.url_strs.has_plurality:
                    raise CommandException(
                        'Multiple URL strings are not supported '
                        'with streaming ("-") URLs.')
                yield NameExpansionResult(storage_url, False, False,
                                          storage_url)
                continue

            # Step 1: Expand any explicitly specified wildcards. The output from this
            # step is an iterator of BucketListingRef.
            # Starting with gs://buck*/abc* this step would expand to gs://bucket/abcd

            src_names_bucket = False
            if (storage_url.IsCloudUrl() and storage_url.IsBucket()
                    and not self.recursion_requested):
                # UNIX commands like rm and cp will omit directory references.
                # If url_str refers only to buckets and we are not recursing,
                # then produce references of type BUCKET, because they are guaranteed
                # to pass through Step 2 and be omitted in Step 3.
                post_step1_iter = PluralityCheckableIterator(
                    self.WildcardIterator(url_str).IterBuckets(
                        bucket_fields=['id']))
            else:
                # Get a list of objects and prefixes, expanding the top level for
                # any listed buckets.  If our source is a bucket, however, we need
                # to treat all of the top level expansions as names_container=True.
                post_step1_iter = PluralityCheckableIterator(
                    self.WildcardIterator(url_str).IterAll(
                        bucket_listing_fields=['name'],
                        expand_top_level_buckets=True))
                if storage_url.IsCloudUrl() and storage_url.IsBucket():
                    src_names_bucket = True

            # Step 2: Expand bucket subdirs. The output from this
            # step is an iterator of (names_container, BucketListingRef).
            # Starting with gs://bucket/abcd this step would expand to:
            #   iter([(True, abcd/o1.txt), (True, abcd/o2.txt)]).
            subdir_exp_wildcard = self._flatness_wildcard[
                self.recursion_requested]
            if self.recursion_requested:
                post_step2_iter = _ImplicitBucketSubdirIterator(
                    self, post_step1_iter, subdir_exp_wildcard)
            else:
                post_step2_iter = _NonContainerTuplifyIterator(post_step1_iter)
            post_step2_iter = PluralityCheckableIterator(post_step2_iter)

            # Because we actually perform and check object listings here, this will
            # raise if url_args includes a non-existent object.  However,
            # plurality_checkable_iterator will buffer the exception for us, not
            # raising it until the iterator is actually asked to yield the first
            # result.
            if post_step2_iter.IsEmpty():
                if self.continue_on_error:
                    try:
                        raise CommandException('No URLs matched: %s' % url_str)
                    except CommandException as e:
                        # Yield a specialized tuple of (exception, stack_trace) to
                        # the wrapping PluralityCheckableIterator.
                        yield (e, sys.exc_info()[2])
                else:
                    raise CommandException('No URLs matched: %s' % url_str)

            # Step 3. Omit any directories, buckets, or bucket subdirectories for
            # non-recursive expansions.
            post_step3_iter = PluralityCheckableIterator(
                _OmitNonRecursiveIterator(post_step2_iter,
                                          self.recursion_requested,
                                          self.command_name,
                                          self.cmd_supports_recursion,
                                          self.logger))

            src_url_expands_to_multi = post_step3_iter.HasPlurality()
            is_multi_source_request = (self.url_strs.has_plurality
                                       or src_url_expands_to_multi)

            # Step 4. Expand directories and buckets. This step yields the iterated
            # values. Starting with gs://bucket this step would expand to:
            #  [abcd/o1.txt, abcd/o2.txt, xyz/o1.txt, xyz/o2.txt]
            # Starting with file://dir this step would expand to:
            #  [dir/a.txt, dir/b.txt, dir/c/]
            for (names_container, blr) in post_step3_iter:
                src_names_container = src_names_bucket or names_container

                if blr.IsObject():
                    yield NameExpansionResult(storage_url,
                                              is_multi_source_request,
                                              src_names_container,
                                              blr.storage_url)
                else:
                    # Use implicit wildcarding to do the enumeration.
                    # At this point we are guaranteed that:
                    # - Recursion has been requested because non-object entries are
                    #   filtered in step 3 otherwise.
                    # - This is a prefix or bucket subdirectory because only
                    #   non-recursive iterations produce bucket references.
                    expanded_url = StorageUrlFromString(blr.url_string)
                    if expanded_url.IsFileUrl():
                        # Convert dir to implicit recursive wildcard.
                        url_to_iterate = '%s%s%s' % (blr, os.sep,
                                                     subdir_exp_wildcard)
                    else:
                        # Convert subdir to implicit recursive wildcard.
                        url_to_iterate = expanded_url.CreatePrefixUrl(
                            wildcard_suffix=subdir_exp_wildcard)

                    wc_iter = PluralityCheckableIterator(
                        self.WildcardIterator(url_to_iterate).IterObjects(
                            bucket_listing_fields=['name']))
                    src_url_expands_to_multi = (src_url_expands_to_multi
                                                or wc_iter.HasPlurality())
                    is_multi_source_request = (self.url_strs.has_plurality
                                               or src_url_expands_to_multi)
                    # This will be a flattened listing of all underlying objects in the
                    # subdir.
                    for blr in wc_iter:
                        yield NameExpansionResult(storage_url,
                                                  is_multi_source_request,
                                                  True, blr.storage_url)
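Putting the four steps together, a hypothetical trace for a recursive request, built from the inline examples in the code above:

# url_str = 'gs://buck*/abc*' with -r, assuming gs://bucket/abcd exists and
# holds o1.txt and o2.txt:
#   Step 1 (explicit wildcards): gs://buck*/abc*  -> gs://bucket/abcd
#   Step 2 (bucket subdirs):     gs://bucket/abcd -> (True, abcd/o1.txt),
#                                                    (True, abcd/o2.txt)
#   Step 3 (omit non-objects):   nothing omitted, since -r was requested
#   Step 4 (yield):              one NameExpansionResult per object, with
#                                is_multi_source_request True (two results)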
Example #10
class _DiffIterator(object):
  """Iterator yielding sequence of _DiffToApply objects."""

  def __init__(self, command_obj, base_src_url, base_dst_url):
    self.command_obj = command_obj
    self.compute_file_checksums = command_obj.compute_file_checksums
    self.delete_extras = command_obj.delete_extras
    self.recursion_requested = command_obj.recursion_requested
    self.logger = self.command_obj.logger
    self.base_src_url = base_src_url
    self.base_dst_url = base_dst_url
    self.logger.info('Building synchronization state...')

    (src_fh, self.sorted_list_src_file_name) = tempfile.mkstemp(
        prefix='gsutil-rsync-src-')
    _tmp_files.append(self.sorted_list_src_file_name)
    (dst_fh, self.sorted_list_dst_file_name) = tempfile.mkstemp(
        prefix='gsutil-rsync-dst-')
    _tmp_files.append(self.sorted_list_dst_file_name)
    # Close the file handles; the file will be opened in write mode by
    # _ListUrlRootFunc.
    os.close(src_fh)
    os.close(dst_fh)

    # Build sorted lists of src and dst URLs in parallel. To do this, pass args
    # to _ListUrlRootFunc as tuple (base_url_str, out_filename, desc)
    # where base_url_str is the starting URL string for listing.
    args_iter = iter([
        (self.base_src_url.url_string, self.sorted_list_src_file_name,
         'source'),
        (self.base_dst_url.url_string, self.sorted_list_dst_file_name,
         'destination')
    ])

    # Contains error message from non-retryable listing failure.
    command_obj.non_retryable_listing_failures = 0
    shared_attrs = ['non_retryable_listing_failures']
    command_obj.Apply(_ListUrlRootFunc, args_iter, _RootListingExceptionHandler,
                      shared_attrs, arg_checker=DummyArgChecker,
                      parallel_operations_override=True,
                      fail_on_error=True)

    if command_obj.non_retryable_listing_failures:
      raise CommandException('Caught non-retryable exception - aborting rsync')

    self.sorted_list_src_file = open(self.sorted_list_src_file_name, 'r')
    self.sorted_list_dst_file = open(self.sorted_list_dst_file_name, 'r')

    # Wrap iterators in PluralityCheckableIterator so we can check emptiness.
    self.sorted_src_urls_it = PluralityCheckableIterator(
        iter(self.sorted_list_src_file))
    self.sorted_dst_urls_it = PluralityCheckableIterator(
        iter(self.sorted_list_dst_file))

  def _ParseTmpFileLine(self, line):
    """Parses output from _BuildTmpOutputLine.

    Parses into tuple:
      (URL, size, crc32c, md5)
    where crc32c and/or md5 can be _NA.

    Args:
      line: The line to parse.

    Returns:
      Parsed tuple: (url, size, crc32c, md5)
    """
    (encoded_url, size, crc32c, md5) = line.split()
    return (_DecodeUrl(encoded_url), int(size), crc32c, md5.strip())
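    # Example (hypothetical values): the line
    #   'gs%3A%2F%2Fbucket%2Fobj 1024 AAAAAA== _NA\n'
    # would parse to ('gs://bucket/obj', 1024, 'AAAAAA==', '_NA'), assuming
    # _DecodeUrl reverses the percent-encoding applied when the listing was
    # written.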

  def _WarnIfMissingCloudHash(self, url_str, crc32c, md5):
    """Warns if given url_str is a cloud URL and is missing both crc32c and md5.

    Args:
      url_str: Destination URL string.
      crc32c: Destination CRC32c.
      md5: Destination MD5.

    Returns:
      True if issued warning.
    """
    # One known way this can currently happen is when rsync'ing objects larger
    # than 5 GB from S3 (for which the etag is not an MD5).
    if (StorageUrlFromString(url_str).IsCloudUrl()
        and crc32c == _NA and md5 == _NA):
      self.logger.warn(
          'Found no hashes to validate %s. Integrity cannot be assured without '
          'hashes.', url_str)
      return True
    return False

  def _ObjectsMatch(self, src_url_str, src_size, src_crc32c, src_md5,
                    dst_url_str, dst_size, dst_crc32c, dst_md5):
    """Returns True if src and dst objects are the same.

    Uses size plus whatever checksums are available.

    Args:
      src_url_str: Source URL string.
      src_size: Source size
      src_crc32c: Source CRC32c.
      src_md5: Source MD5.
      dst_url_str: Destination URL string.
      dst_size: Destination size
      dst_crc32c: Destination CRC32c.
      dst_md5: Destination MD5.

    Returns:
      True/False.
    """
    # Note: This function is called from __iter__, which is called from the
    # Command.Apply driver. Thus, all checksum computation will be run in a
    # single thread, which is good (having multiple threads concurrently
    # computing checksums would thrash the disk).
    if src_size != dst_size:
      return False
    if self.compute_file_checksums:
      (src_crc32c, src_md5, dst_crc32c, dst_md5) = _ComputeNeededFileChecksums(
          self.logger, src_url_str, src_size, src_crc32c, src_md5, dst_url_str,
          dst_size, dst_crc32c, dst_md5)
    if src_md5 != _NA and dst_md5 != _NA:
      self.logger.debug('Comparing md5 for %s and %s', src_url_str, dst_url_str)
      return src_md5 == dst_md5
    if src_crc32c != _NA and dst_crc32c != _NA:
      self.logger.debug(
          'Comparing crc32c for %s and %s', src_url_str, dst_url_str)
      return src_crc32c == dst_crc32c
    if not self._WarnIfMissingCloudHash(src_url_str, src_crc32c, src_md5):
      self._WarnIfMissingCloudHash(dst_url_str, dst_crc32c, dst_md5)
    # Without checksums to compare we depend only on basic size comparison.
    return True

  def __iter__(self):
    """Iterates over src/dst URLs and produces a _DiffToApply sequence.

    Yields:
      The _DiffToApply.
    """
    # Strip trailing slashes, if any, so we compute tail length against
    # consistent position regardless of whether trailing slashes were included
    # or not in URL.
    base_src_url_len = len(self.base_src_url.url_string.rstrip('/\\'))
    base_dst_url_len = len(self.base_dst_url.url_string.rstrip('/\\'))
    src_url_str = dst_url_str = None
    # Invariant: After each yield, the URLs in src_url_str, dst_url_str,
    # self.sorted_src_urls_it, and self.sorted_dst_urls_it are not yet
    # processed. Each time we encounter None in src_url_str or dst_url_str we
    # populate from the respective iterator, and we reset one or the other value
    # to None after yielding an action that disposes of that URL.
    while not self.sorted_src_urls_it.IsEmpty() or src_url_str is not None:
      if src_url_str is None:
        (src_url_str, src_size, src_crc32c, src_md5) = self._ParseTmpFileLine(
            self.sorted_src_urls_it.next())
        # Skip past base URL and normalize slashes so we can compare across
        # clouds/file systems (including Windows).
        src_url_str_to_check = _EncodeUrl(
            src_url_str[base_src_url_len:].replace('\\', '/'))
        dst_url_str_would_copy_to = copy_helper.ConstructDstUrl(
            self.base_src_url, StorageUrlFromString(src_url_str), True, True,
            self.base_dst_url, False, self.recursion_requested).url_string
      if self.sorted_dst_urls_it.IsEmpty():
        # We've reached end of dst URLs, so copy src to dst.
        yield _DiffToApply(
            src_url_str, dst_url_str_would_copy_to, _DiffAction.COPY)
        src_url_str = None
        continue
      if not dst_url_str:
        (dst_url_str, dst_size, dst_crc32c, dst_md5) = (
            self._ParseTmpFileLine(self.sorted_dst_urls_it.next()))
        # Skip past base URL and normalize slashes so we can compare across
        # clouds/file systems (including Windows).
        dst_url_str_to_check = _EncodeUrl(
            dst_url_str[base_dst_url_len:].replace('\\', '/'))

      if src_url_str_to_check < dst_url_str_to_check:
        # There's no dst object corresponding to src object, so copy src to dst.
        yield _DiffToApply(
            src_url_str, dst_url_str_would_copy_to, _DiffAction.COPY)
        src_url_str = None
      elif src_url_str_to_check > dst_url_str_to_check:
        # dst object without a corresponding src object, so remove dst if -d
        # option was specified.
        if self.delete_extras:
          yield _DiffToApply(None, dst_url_str, _DiffAction.REMOVE)
        dst_url_str = None
      else:
        # There is a dst object corresponding to src object, so check if objects
        # match.
        if self._ObjectsMatch(
            src_url_str, src_size, src_crc32c, src_md5,
            dst_url_str, dst_size, dst_crc32c, dst_md5):
          # Continue iterating without yielding a _DiffToApply.
          pass
        else:
          yield _DiffToApply(src_url_str, dst_url_str, _DiffAction.COPY)
        src_url_str = None
        dst_url_str = None

    # If the -d option was specified, any files/objects left in the dst
    # iteration should be removed.
    if not self.delete_extras:
      return
    if dst_url_str:
      yield _DiffToApply(None, dst_url_str, _DiffAction.REMOVE)
      dst_url_str = None
    for line in self.sorted_dst_urls_it:
      (dst_url_str, _, _, _) = self._ParseTmpFileLine(line)
      yield _DiffToApply(None, dst_url_str, _DiffAction.REMOVE)
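The loop above is a classic merge join over two sorted listings. Below is a standalone sketch of the same control flow; the function name and the in-memory lists are illustrative, whereas the real iterator streams from the sorted temp files and compares sizes and checksums before skipping a matching pair.

def diff_sorted(src, dst, delete_extras=False):
    """Yields ('copy', name) / ('remove', name) for two sorted name lists."""
    si = di = 0
    while si < len(src):
        if di >= len(dst) or src[si] < dst[di]:
            yield ('copy', src[si])       # no dst counterpart: copy src
            si += 1
        elif src[si] > dst[di]:
            if delete_extras:             # dst-only entry: remove if -d
                yield ('remove', dst[di])
            di += 1
        else:
            si += 1                       # same name: content check elsewhere
            di += 1
    if delete_extras:
        for name in dst[di:]:             # leftover dst entries are extras
            yield ('remove', name)

# list(diff_sorted(['a', 'c'], ['b', 'c'], delete_extras=True))
# -> [('copy', 'a'), ('remove', 'b')]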