def testPluralityCheckableIteratorWith1Elem1Exception(self):
  """Tests PluralityCheckableIterator with 2 elements.

  The underlying iterator yields a single value and then raises an
  exception in place of its second element.
  """

  class OneElemThenRaiseIter(object):
    """Yields 1, then raises CustomTestException, then StopIteration."""

    def __init__(self):
      self.position = 0

    def __iter__(self):
      return self

    def next(self):
      self.position += 1
      if self.position == 1:
        return 1
      if self.position == 2:
        raise CustomTestException('Test exception')
      raise StopIteration()

  pcit = PluralityCheckableIterator(OneElemThenRaiseIter())
  # Both peeks succeed: the exception counts as a buffered second element.
  self.assertFalse(pcit.IsEmpty())
  self.assertTrue(pcit.HasPlurality())
  iterated_value = None
  try:
    for value in pcit:
      iterated_value = value
    self.fail('Expected exception from iterator')
  except CustomTestException:
    pass
  # The first (non-raising) element must have been yielded before the raise.
  self.assertEqual(iterated_value, 1)
def _GetIam(self, thread_state=None):
  """Gets IAM policy for single bucket or object.

  The matched policy is printed as sorted, indented JSON.

  Args:
    thread_state: Opaque state passed through to GetIamHelper.

  Raises:
    CommandException: if the URL pattern matches zero URLs or more than one.
  """
  pattern = self.args[0]
  url_matches = PluralityCheckableIterator(
      self.WildcardIterator(pattern).IterAll(bucket_listing_fields=['name']))
  if url_matches.IsEmpty():
    raise CommandException('%s matched no URLs' % pattern)
  if url_matches.HasPlurality():
    raise CommandException(
        '%s matched more than one URL, which is not allowed by the %s '
        'command' % (pattern, self.command_name))
  single_match = list(url_matches)[0]
  storage_url = StorageUrlFromString(single_match.url_string)
  policy = self.GetIamHelper(storage_url, thread_state=thread_state)
  policy_json = json.loads(protojson.encode_message(policy))
  print(json.dumps(
      policy_json,
      sort_keys=True,
      separators=(',', ': '),
      indent=2,
  ))
def NameExpansionIterator(command_name, debug, logger, gsutil_api, url_strs,
                          recursion_requested, all_versions=False,
                          cmd_supports_recursion=True, project_id=None,
                          continue_on_error=False):
  """Static factory function for instantiating _NameExpansionIterator.

  Wraps the resulting iterator in a PluralityCheckableIterator and checks
  that it is non-empty. Also allows url_strs to be either an array or an
  iterator (it is wrapped in a PluralityCheckableIterator either way).

  Args:
    command_name: name of command being run.
    debug: Debug level to pass to underlying iterators (range 0..3).
    logger: logging.Logger object.
    gsutil_api: Cloud storage interface. Settable for testing/mocking.
    url_strs: Iterable URL strings needing expansion.
    recursion_requested: True if -r specified on command-line. If so,
        listings will be flattened so mapped-to results contain objects
        spanning subdirectories.
    all_versions: Bool indicating whether to iterate over all object versions.
    cmd_supports_recursion: Bool indicating whether this command supports a
        '-r' flag. Useful for printing helpful error messages.
    project_id: Project id to use for the current command.
    continue_on_error: If true, yield no-match exceptions encountered during
        iteration instead of raising them.

  Raises:
    CommandException if underlying iterator is empty.

  Returns:
    Name expansion iterator instance.

    For example semantics, see comments in NameExpansionIterator.__init__.
  """
  wrapped_url_strs = PluralityCheckableIterator(url_strs)
  expansion_iterator = PluralityCheckableIterator(
      _NameExpansionIterator(command_name, debug, logger, gsutil_api,
                             wrapped_url_strs, recursion_requested,
                             all_versions=all_versions,
                             cmd_supports_recursion=cmd_supports_recursion,
                             project_id=project_id,
                             continue_on_error=continue_on_error))
  # Peek a single element so an empty expansion fails fast, here, rather
  # than at first iteration by the caller.
  if expansion_iterator.IsEmpty():
    raise CommandException('No URLs matched')
  return expansion_iterator
def testPluralityCheckableIteratorWith3Elems(self):
  """Tests PluralityCheckableIterator with 3 elements.

  The input range is materialized as a list (matching the style of the
  0-element test) so the final equality assertion also holds on Python 3,
  where range() returns a view rather than a list.
  """
  input_list = list(range(3))
  it = iter(input_list)
  pcit = PluralityCheckableIterator(it)
  # Three buffered elements: non-empty and plural.
  self.assertFalse(pcit.IsEmpty())
  self.assertTrue(pcit.HasPlurality())
  # Iterating must yield exactly the original elements, in order.
  output_list = list(pcit)
  self.assertEqual(input_list, output_list)
def testPluralityCheckableIteratorWith0Elems(self):
  """Tests empty PluralityCheckableIterator."""
  source = list(range(0))
  pcit = PluralityCheckableIterator(iter(source))
  # An empty underlying iterator: empty and not plural.
  self.assertTrue(pcit.IsEmpty())
  self.assertFalse(pcit.HasPlurality())
  # Exhausting the iterator yields nothing.
  drained = list(pcit)
  self.assertEqual(source, drained)
def _GetIam(self, pattern, thread_state=None):
  """Gets IAM policy for single bucket or object.

  Args:
    pattern: URL pattern that must match exactly one bucket or object.
    thread_state: Opaque state passed through to GetIamHelper.

  Returns:
    Result of GetIamHelper for the matched URL.

  Raises:
    CommandException: if pattern matches zero URLs or more than one.
  """
  url_matches = PluralityCheckableIterator(
      self.WildcardIterator(pattern).IterAll(bucket_listing_fields=['name']))
  if url_matches.IsEmpty():
    raise CommandException('%s matched no URLs' % pattern)
  if url_matches.HasPlurality():
    raise CommandException(
        '%s matched more than one URL, which is not allowed by the %s '
        'command' % (pattern, self.command_name))
  matched_blr = list(url_matches)[0]
  return self.GetIamHelper(
      StorageUrlFromString(matched_blr.url_string), thread_state=thread_state)
def __iter__(self):
  """Yields (names_container, BucketListingRef) tuples.

  Prefixes (bucket subdirectories) are expanded via a wildcard listing and
  yielded with names_container=True; prefixes that match no objects and
  plain objects pass through unexpanded with names_container=False.

  Raises:
    CommandException: if a bucket reference is encountered, which this
        iterator does not handle.
  """
  for listing_ref in self.blr_iter:
    if listing_ref.IsPrefix():
      # Bucket subdirectory: list its objects according to the wildcard.
      prefix_url = StorageUrlFromString(
          listing_ref.url_string).CreatePrefixUrl(
              wildcard_suffix=self.subdir_exp_wildcard)
      expansion = PluralityCheckableIterator(
          self.name_exp_instance.WildcardIterator(prefix_url).IterAll(
              bucket_listing_fields=self.bucket_listing_fields))
      if expansion.IsEmpty():
        # Prefix that contains no objects, for example in the $folder$ case
        # or an empty filesystem directory.
        yield (False, listing_ref)
      else:
        for expanded_ref in expansion:
          yield (True, expanded_ref)
      continue
    if listing_ref.IsObject():
      yield (False, listing_ref)
      continue
    raise CommandException(
        '_ImplicitBucketSubdirIterator got a bucket reference %s' %
        listing_ref)
def testPluralityCheckableIteratorReadsAheadAsNeeded(self):
  """Tests that the PCI does not unnecessarily read new elements.

  Uses assertEqual (assertEquals is a deprecated alias, and the other tests
  in this file already use assertEqual).
  """

  class IterTest(object):
    """Iterator that records, via position, how far it has been advanced."""

    def __init__(self):
      self.position = 0

    def __iter__(self):
      return self

    def next(self):
      if self.position == 3:
        raise StopIteration()
      self.position += 1

  # IsEmpty and PeekException should retrieve only 1 element from the
  # underlying iterator.
  pcit = PluralityCheckableIterator(IterTest())
  pcit.IsEmpty()
  pcit.PeekException()
  self.assertEqual(pcit.orig_iterator.position, 1)
  # HasPlurality requires populating 2 elements into the iterator.
  pcit.HasPlurality()
  self.assertEqual(pcit.orig_iterator.position, 2)
  # next should yield already-populated elements without advancing the
  # iterator.
  pcit.next()  # Yields element 1
  self.assertEqual(pcit.orig_iterator.position, 2)
  pcit.next()  # Yields element 2
  self.assertEqual(pcit.orig_iterator.position, 2)
  pcit.next()  # Yields element 3
  self.assertEqual(pcit.orig_iterator.position, 3)
  try:
    pcit.next()  # Underlying iterator is empty
    self.fail('Expected StopIteration')
  except StopIteration:
    pass
def __iter__(self):
  """Iterates over all source URLs passed to the iterator.

  For each src url, expands wildcards, object-less bucket names,
  subdir bucket names, and directory names, and generates a flat listing
  of all the matching objects/files.

  You should instantiate this object using the static factory function
  NameExpansionIterator, because consumers of this iterator need the
  PluralityCheckableIterator wrapper built by that function.

  Yields:
    gslib.name_expansion.NameExpansionResult.

  Raises:
    CommandException: if errors encountered.
  """
  for url_str in self.url_strs:
    storage_url = StorageUrlFromString(url_str)

    # Streaming ("-") URLs bypass expansion entirely: a stream names exactly
    # one source, and mixing it with other URLs is rejected.
    if storage_url.IsFileUrl() and storage_url.IsStream():
      if self.url_strs.has_plurality:
        raise CommandException('Multiple URL strings are not supported '
                               'with streaming ("-") URLs.')
      yield NameExpansionResult(storage_url, False, False, storage_url)
      continue

    # Step 1: Expand any explicitly specified wildcards. The output from this
    # step is an iterator of BucketListingRef.
    # Starting with gs://buck*/abc* this step would expand to gs://bucket/abcd
    src_names_bucket = False
    if (storage_url.IsCloudUrl() and storage_url.IsBucket()
        and not self.recursion_requested):
      # UNIX commands like rm and cp will omit directory references.
      # If url_str refers only to buckets and we are not recursing,
      # then produce references of type BUCKET, because they are guaranteed
      # to pass through Step 2 and be omitted in Step 3.
      post_step1_iter = PluralityCheckableIterator(
          self.WildcardIterator(url_str).IterBuckets(bucket_fields=['id']))
    else:
      # Get a list of objects and prefixes, expanding the top level for
      # any listed buckets. If our source is a bucket, however, we need
      # to treat all of the top level expansions as names_container=True.
      post_step1_iter = PluralityCheckableIterator(
          self.WildcardIterator(url_str).IterAll(
              bucket_listing_fields=['name'], expand_top_level_buckets=True))
      if storage_url.IsCloudUrl() and storage_url.IsBucket():
        src_names_bucket = True

    # Step 2: Expand bucket subdirs. The output from this
    # step is an iterator of (names_container, BucketListingRef).
    # Starting with gs://bucket/abcd this step would expand to:
    #   iter([(True, abcd/o1.txt), (True, abcd/o2.txt)]).
    subdir_exp_wildcard = self._flatness_wildcard[self.recursion_requested]
    if self.recursion_requested:
      post_step2_iter = _ImplicitBucketSubdirIterator(
          self, post_step1_iter, subdir_exp_wildcard)
    else:
      post_step2_iter = _NonContainerTuplifyIterator(post_step1_iter)
    post_step2_iter = PluralityCheckableIterator(post_step2_iter)

    # Because we actually perform and check object listings here, this will
    # raise if url_args includes a non-existent object. However,
    # plurality_checkable_iterator will buffer the exception for us, not
    # raising it until the iterator is actually asked to yield the first
    # result.
    if post_step2_iter.IsEmpty():
      if self.continue_on_error:
        try:
          raise CommandException('No URLs matched: %s' % url_str)
        except CommandException, e:
          # Yield a specialized tuple of (exception, stack_trace) to
          # the wrapping PluralityCheckableIterator.
          yield (e, sys.exc_info()[2])
      else:
        raise CommandException('No URLs matched: %s' % url_str)

    # Step 3. Omit any directories, buckets, or bucket subdirectories for
    # non-recursive expansions.
    post_step3_iter = PluralityCheckableIterator(
        _OmitNonRecursiveIterator(post_step2_iter, self.recursion_requested,
                                  self.command_name,
                                  self.cmd_supports_recursion, self.logger))

    # HasPlurality peeks ahead (buffering up to two elements) so we can tag
    # each result with multi-source information before yielding it.
    src_url_expands_to_multi = post_step3_iter.HasPlurality()
    is_multi_source_request = (self.url_strs.has_plurality
                               or src_url_expands_to_multi)

    # Step 4. Expand directories and buckets. This step yields the iterated
    # values. Starting with gs://bucket this step would expand to:
    #   [abcd/o1.txt, abcd/o2.txt, xyz/o1.txt, xyz/o2.txt]
    # Starting with file://dir this step would expand to:
    #   [dir/a.txt, dir/b.txt, dir/c/]
    for (names_container, blr) in post_step3_iter:
      src_names_container = src_names_bucket or names_container

      if blr.IsObject():
        yield NameExpansionResult(storage_url, is_multi_source_request,
                                  src_names_container, blr.storage_url)
      else:
        # Use implicit wildcarding to do the enumeration.
        # At this point we are guaranteed that:
        # - Recursion has been requested because non-object entries are
        #   filtered in step 3 otherwise.
        # - This is a prefix or bucket subdirectory because only
        #   non-recursive iterations produce bucket references.
        expanded_url = StorageUrlFromString(blr.url_string)
        if expanded_url.IsFileUrl():
          # Convert dir to implicit recursive wildcard.
          url_to_iterate = '%s%s%s' % (blr, os.sep, subdir_exp_wildcard)
        else:
          # Convert subdir to implicit recursive wildcard.
          url_to_iterate = expanded_url.CreatePrefixUrl(
              wildcard_suffix=subdir_exp_wildcard)

        wc_iter = PluralityCheckableIterator(
            self.WildcardIterator(url_to_iterate).IterObjects(
                bucket_listing_fields=['name']))
        # Recompute plurality: the subdir expansion itself may turn a single
        # source URL into multiple results.
        src_url_expands_to_multi = (src_url_expands_to_multi
                                    or wc_iter.HasPlurality())
        is_multi_source_request = (self.url_strs.has_plurality
                                   or src_url_expands_to_multi)
        # This will be a flattened listing of all underlying objects in the
        # subdir.
        for blr in wc_iter:
          yield NameExpansionResult(storage_url, is_multi_source_request,
                                    True, blr.storage_url)
class _DiffIterator(object):
  """Iterator yielding sequence of _DiffToApply objects."""

  def __init__(self, command_obj, base_src_url, base_dst_url):
    # Copy the command settings we need locally; command_obj is also kept for
    # the parallel Apply call below.
    self.command_obj = command_obj
    self.compute_file_checksums = command_obj.compute_file_checksums
    self.delete_extras = command_obj.delete_extras
    self.recursion_requested = command_obj.recursion_requested
    self.logger = self.command_obj.logger
    self.base_src_url = base_src_url
    self.base_dst_url = base_dst_url
    self.logger.info('Building synchronization state...')

    # Temp files receiving the sorted listings of src and dst; registered in
    # _tmp_files (presumably for cleanup elsewhere — not visible here).
    (src_fh, self.sorted_list_src_file_name) = tempfile.mkstemp(
        prefix='gsutil-rsync-src-')
    _tmp_files.append(self.sorted_list_src_file_name)
    (dst_fh, self.sorted_list_dst_file_name) = tempfile.mkstemp(
        prefix='gsutil-rsync-dst-')
    _tmp_files.append(self.sorted_list_dst_file_name)
    # Close the file handles; the file will be opened in write mode by
    # _ListUrlRootFunc.
    os.close(src_fh)
    os.close(dst_fh)

    # Build sorted lists of src and dst URLs in parallel. To do this, pass args
    # to _ListUrlRootFunc as tuple (base_url_str, out_filename, desc)
    # where base_url_str is the starting URL string for listing.
    args_iter = iter([
        (self.base_src_url.url_string, self.sorted_list_src_file_name,
         'source'),
        (self.base_dst_url.url_string, self.sorted_list_dst_file_name,
         'destination')
    ])

    # Contains error message from non-retryable listing failure.
    command_obj.non_retryable_listing_failures = 0
    shared_attrs = ['non_retryable_listing_failures']
    command_obj.Apply(_ListUrlRootFunc, args_iter, _RootListingExceptionHandler,
                      shared_attrs, arg_checker=DummyArgChecker,
                      parallel_operations_override=True,
                      fail_on_error=True)

    if command_obj.non_retryable_listing_failures:
      raise CommandException('Caught non-retryable exception - aborting rsync')

    self.sorted_list_src_file = open(self.sorted_list_src_file_name, 'r')
    self.sorted_list_dst_file = open(self.sorted_list_dst_file_name, 'r')

    # Wrap iterators in PluralityCheckableIterator so we can check emptiness.
    self.sorted_src_urls_it = PluralityCheckableIterator(
        iter(self.sorted_list_src_file))
    self.sorted_dst_urls_it = PluralityCheckableIterator(
        iter(self.sorted_list_dst_file))

  def _ParseTmpFileLine(self, line):
    """Parses output from _BuildTmpOutputLine.

    Parses into tuple:
      (URL, size, crc32c, md5)
    where crc32c and/or md5 can be _NA.

    Args:
      line: The line to parse.

    Returns:
      Parsed tuple: (url, size, crc32c, md5)
    """
    (encoded_url, size, crc32c, md5) = line.split()
    return (_DecodeUrl(encoded_url), int(size), crc32c, md5.strip())

  def _WarnIfMissingCloudHash(self, url_str, crc32c, md5):
    """Warns if given url_str is a cloud URL and is missing both crc32c and md5.

    Args:
      url_str: Destination URL string.
      crc32c: Destination CRC32c.
      md5: Destination MD5.

    Returns:
      True if issued warning.
    """
    # One known way this can currently happen is when rsync'ing objects larger
    # than 5 GB from S3 (for which the etag is not an MD5).
    if (StorageUrlFromString(url_str).IsCloudUrl()
        and crc32c == _NA and md5 == _NA):
      self.logger.warn(
          'Found no hashes to validate %s. Integrity cannot be assured without '
          'hashes.', url_str)
      return True
    return False

  def _ObjectsMatch(self, src_url_str, src_size, src_crc32c, src_md5,
                    dst_url_str, dst_size, dst_crc32c, dst_md5):
    """Returns True if src and dst objects are the same.

    Uses size plus whatever checksums are available.

    Args:
      src_url_str: Source URL string.
      src_size: Source size
      src_crc32c: Source CRC32c.
      src_md5: Source MD5.
      dst_url_str: Destination URL string.
      dst_size: Destination size
      dst_crc32c: Destination CRC32c.
      dst_md5: Destination MD5.

    Returns:
      True/False.
    """
    # Note: This function is called from __iter__, which is called from the
    # Command.Apply driver. Thus, all checksum computation will be run in a
    # single thread, which is good (having multiple threads concurrently
    # computing checksums would thrash the disk).
    #
    # Cheapest check first: differing sizes can never match.
    if src_size != dst_size:
      return False
    if self.compute_file_checksums:
      (src_crc32c, src_md5, dst_crc32c, dst_md5) = _ComputeNeededFileChecksums(
          self.logger, src_url_str, src_size, src_crc32c, src_md5,
          dst_url_str, dst_size, dst_crc32c, dst_md5)
    # Prefer MD5 comparison, fall back to CRC32c, and finally to size alone.
    if src_md5 != _NA and dst_md5 != _NA:
      self.logger.debug('Comparing md5 for %s and %s', src_url_str, dst_url_str)
      return src_md5 == dst_md5
    if src_crc32c != _NA and dst_crc32c != _NA:
      self.logger.debug(
          'Comparing crc32c for %s and %s', src_url_str, dst_url_str)
      return src_crc32c == dst_crc32c
    # Warn once (src first; dst only if src had hashes) about missing hashes.
    if not self._WarnIfMissingCloudHash(src_url_str, src_crc32c, src_md5):
      self._WarnIfMissingCloudHash(dst_url_str, dst_crc32c, dst_md5)
    # Without checksums to compare we depend only on basic size comparison.
    return True

  def __iter__(self):
    """Iterates over src/dst URLs and produces a _DiffToApply sequence.

    Performs a sorted-merge walk over the two sorted listings built in
    __init__, yielding COPY actions for src entries missing or different at
    dst, and (when delete_extras is set) REMOVE actions for dst-only entries.

    Yields:
      The _DiffToApply.
    """
    # Strip trailing slashes, if any, so we compute tail length against
    # consistent position regardless of whether trailing slashes were included
    # or not in URL.
    base_src_url_len = len(self.base_src_url.url_string.rstrip('/\\'))
    base_dst_url_len = len(self.base_dst_url.url_string.rstrip('/\\'))
    src_url_str = dst_url_str = None
    # Invariant: After each yield, the URLs in src_url_str, dst_url_str,
    # self.sorted_src_urls_it, and self.sorted_dst_urls_it are not yet
    # processed. Each time we encounter None in src_url_str or dst_url_str we
    # populate from the respective iterator, and we reset one or the other value
    # to None after yielding an action that disposes of that URL.
    while not self.sorted_src_urls_it.IsEmpty() or src_url_str is not None:
      if src_url_str is None:
        (src_url_str, src_size, src_crc32c, src_md5) = self._ParseTmpFileLine(
            self.sorted_src_urls_it.next())
        # Skip past base URL and normalize slashes so we can compare across
        # clouds/file systems (including Windows).
        src_url_str_to_check = _EncodeUrl(
            src_url_str[base_src_url_len:].replace('\\', '/'))
        dst_url_str_would_copy_to = copy_helper.ConstructDstUrl(
            self.base_src_url, StorageUrlFromString(src_url_str), True, True,
            self.base_dst_url, False, self.recursion_requested).url_string
      if self.sorted_dst_urls_it.IsEmpty():
        # We've reached end of dst URLs, so copy src to dst.
        yield _DiffToApply(
            src_url_str, dst_url_str_would_copy_to, _DiffAction.COPY)
        src_url_str = None
        continue
      if not dst_url_str:
        (dst_url_str, dst_size, dst_crc32c, dst_md5) = (
            self._ParseTmpFileLine(self.sorted_dst_urls_it.next()))
        # Skip past base URL and normalize slashes so we can compare across
        # clouds/file systems (including Windows).
        dst_url_str_to_check = _EncodeUrl(
            dst_url_str[base_dst_url_len:].replace('\\', '/'))
      if src_url_str_to_check < dst_url_str_to_check:
        # There's no dst object corresponding to src object, so copy src to dst.
        yield _DiffToApply(
            src_url_str, dst_url_str_would_copy_to, _DiffAction.COPY)
        src_url_str = None
      elif src_url_str_to_check > dst_url_str_to_check:
        # dst object without a corresponding src object, so remove dst if -d
        # option was specified.
        if self.delete_extras:
          yield _DiffToApply(None, dst_url_str, _DiffAction.REMOVE)
        dst_url_str = None
      else:
        # There is a dst object corresponding to src object, so check if objects
        # match.
        if self._ObjectsMatch(
            src_url_str, src_size, src_crc32c, src_md5,
            dst_url_str, dst_size, dst_crc32c, dst_md5):
          # Continue iterating without yielding a _DiffToApply.
          pass
        else:
          yield _DiffToApply(src_url_str, dst_url_str, _DiffAction.COPY)
        # Both entries are consumed whether or not they matched.
        src_url_str = None
        dst_url_str = None

    # If -d option specified any files/objects left in dst iteration should be
    # removed.
    if not self.delete_extras:
      return
    if dst_url_str:
      yield _DiffToApply(None, dst_url_str, _DiffAction.REMOVE)
      dst_url_str = None
    for line in self.sorted_dst_urls_it:
      (dst_url_str, _, _, _) = self._ParseTmpFileLine(line)
      yield _DiffToApply(None, dst_url_str, _DiffAction.REMOVE)