def __call__(self, prefix, **kwargs):
  """Returns tab-completion results for the given URL prefix.

  Results may be served from a local cache file to avoid repeated (slow)
  cloud listing requests; fresh listings are written back to the cache.

  Args:
    prefix: The URL string prefix typed so far (e.g. 'gs://bucket/ob').
    **kwargs: Unused; accepted for compatibility with the completion
        framework's calling convention.

  Returns:
    List of matching URL strings. Returns an empty list when completion
    does not apply: file URLs, a bucket-only completer given a non-bucket
    URL, completion disabled via a 0 timeout, or a timed-out request.
  """
  if not prefix:
    prefix = 'gs://'
  elif IsFileUrlString(prefix):
    # File URLs are completed by the shell, not by cloud listing.
    return []
  wildcard_url = prefix + '*'
  url = StorageUrlFromString(wildcard_url)
  if self._bucket_only and not url.IsBucket():
    return []
  # A configured timeout of 0 disables cloud tab completion entirely.
  timeout = boto.config.getint('GSUtil', 'tab_completion_timeout', 5)
  if timeout == 0:
    return []
  start_time = time.time()
  cache = TabCompletionCache.LoadFromFile(
      GetTabCompletionCacheFilename())
  cached_results = cache.GetCachedResults(prefix)

  timing_log_entry_type = ''
  if cached_results is not None:
    results = cached_results
    timing_log_entry_type = ' (from cache)'
  else:
    try:
      results = self._PerformCloudListing(wildcard_url, timeout)
      if self._bucket_only and len(results) == 1:
        results = [StripOneSlash(results[0])]
      # If we hit the max-results cap the listing may be incomplete;
      # record that so the cache is not treated as exhaustive.
      partial_results = (len(results) == _TAB_COMPLETE_MAX_RESULTS)
      cache.UpdateCache(prefix, results, partial_results)
    except TimeoutError:
      # Best-effort completion: on timeout, return no suggestions rather
      # than blocking the shell.
      timing_log_entry_type = ' (request timeout)'
      results = []
  cache.WriteToFile(GetTabCompletionCacheFilename())

  end_time = time.time()
  num_results = len(results)
  elapsed_seconds = end_time - start_time
  # Guard against elapsed_seconds == 0 (possible for cache hits on
  # platforms with coarse time.time() resolution) to avoid a
  # ZeroDivisionError when computing the results/second rate.
  if elapsed_seconds:
    results_per_second = num_results / elapsed_seconds
  else:
    results_per_second = float(num_results)
  _WriteTimingLog(
      '%s results%s in %.2fs, %.2f results/second for prefix: %s\n' %
      (num_results, timing_log_entry_type, elapsed_seconds,
       results_per_second, prefix))
  return results
def _BuildBucketFilterStrings(self, wildcard):
  """Derives the request and filter strings for one wildcard expansion step.

  Implements wildcard object name matching by splitting a (possibly
  multi-part) wildcard into the pieces needed to issue a prefix/delimiter
  bucket listing and to filter its results.

  Args:
    wildcard: The wildcard string to match to objects.

  Returns:
    Tuple (prefix, delimiter, prefix_wildcard, suffix_wildcard), where:
      prefix: prefix to be sent in the bucket GET request (None when the
          wildcard starts at the beginning of the object name).
      delimiter: delimiter to be sent in the bucket GET request (None for
          recursive '**' wildcards).
      prefix_wildcard: wildcard used to filter the bucket GET results.
      suffix_wildcard: remaining wildcard, appended to filtered results to
          drive the next expansion iteration.

  For example, given gs://bucket/abc/d*e/f*.txt this builds prefix='abc/d',
  delimiter='/', prefix_wildcard='d*e', and suffix_wildcard='f*.txt'.
  Listing with that prefix and delimiter produces a result set that can be
  filtered with prefix_wildcard, and suffix_wildcard feeds the next call(s)
  to _BuildBucketFilterStrings() for the next listing/filtering iteration.
  """
  wildcard_match = WILDCARD_REGEX.search(wildcard)
  if not wildcard_match:
    # No wildcard chars at all: return values that make a bucket listing
    # match exactly the given input string. Example: if a previous
    # iteration yielded gs://bucket/dir/ with suffix_wildcard 'abc', the
    # next call receives gs://bucket/dir/abc and we return
    # prefix='dir/abc', delimiter='/', prefix_wildcard='dir/abc',
    # suffix_wildcard=''.
    prefix = wildcard
    delimiter = '/'
    prefix_wildcard = wildcard
    suffix_wildcard = ''
  else:
    start = wildcard_match.start()
    # Literal text before the first wildcard char becomes the server-side
    # request prefix (true for e.g. 'gs://bucket/abc*xyz'); an empty slice
    # is falsy, so prefix is None when the wildcard leads.
    prefix = wildcard[:start] or None
    wildcard_part = wildcard[start:]
    # Keep only up through the first '/' (inclusive); StripOneSlash below
    # drops that trailing '/' so gs://bucket/abc* and gs://bucket/abc*/
    # are matched by the same wildcard regex.
    head, slash, _ = wildcard_part.partition('/')
    prefix_wildcard = StripOneSlash((prefix or '') + head + slash)
    # Everything after the first '/' past the wildcard match is deferred
    # to the next expansion iteration ('' when there is no '/').
    suffix_wildcard = wildcard[wildcard_match.end():].partition('/')[2]
    # Recursive (**) wildcarding: send no delimiter, and fold the suffix
    # into the prefix wildcard so a single listing spans all levels.
    if '**' in prefix_wildcard:
      delimiter = None
      prefix_wildcard += suffix_wildcard
      suffix_wildcard = ''
    else:
      delimiter = '/'
  # Trace output for following the algorithm through a multi-part wildcard
  # like gs://bucket/abc/d*e/f*.txt.
  if self.debug > 1:
    sys.stderr.write(
        'DEBUG: wildcard=%s, prefix=%s, delimiter=%s, '
        'prefix_wildcard=%s, suffix_wildcard=%s\n' %
        (PrintableStr(wildcard), PrintableStr(prefix),
         PrintableStr(delimiter), PrintableStr(prefix_wildcard),
         PrintableStr(suffix_wildcard)))
  return (prefix, delimiter, prefix_wildcard, suffix_wildcard)
def __iter__(self, bucket_listing_fields=None,
             expand_top_level_buckets=False):
  """Iterator that gets called when iterating over the cloud wildcard.

  In the case where no wildcard is present, returns a single matching
  object, single matching prefix, or one of each if both exist.

  Args:
    bucket_listing_fields: Iterable fields to include in bucket listings.
        Ex. ['name', 'acl']. Iterator is responsible for converting these
        to list-style format ['items/name', 'items/acl'] as well as adding
        any fields necessary for listing such as prefixes. API
        implementation is responsible for adding pagination fields. If
        this is None, all fields are returned.
    expand_top_level_buckets: If true, yield no BUCKET references.
        Instead, expand buckets into top-level objects and prefixes.

  Yields:
    BucketListingRef of type BUCKET, OBJECT or PREFIX.
  """
  single_version_request = self.wildcard_url.HasGeneration()

  # For wildcard expansion purposes, we need at a minimum the name of
  # each object and prefix. If we're not using the default of requesting
  # all fields, make sure at least these are requested. The Cloud API
  # tolerates specifying the same field twice.
  get_fields = None
  if bucket_listing_fields:
    get_fields = set()
    for field in bucket_listing_fields:
      get_fields.add(field)
    bucket_listing_fields = self._GetToListFields(
        get_fields=bucket_listing_fields)
    bucket_listing_fields.update(['items/name', 'prefixes'])
    get_fields.update(['name'])
    # If we're making versioned requests, ensure generation and
    # metageneration are also included.
    if single_version_request or self.all_versions:
      bucket_listing_fields.update(
          ['items/generation', 'items/metageneration'])
      get_fields.update(['generation', 'metageneration'])

  # Handle bucket wildcarding, if any, in _ExpandBucketWildcards. Then
  # iterate over the expanded bucket strings and handle any object
  # wildcarding.
  for bucket_listing_ref in self._ExpandBucketWildcards(
      bucket_fields=['id']):
    bucket_url_string = bucket_listing_ref.url_string
    if self.wildcard_url.IsBucket():
      # IsBucket() guarantees there are no prefix or object wildcards, and
      # thus this is a top-level listing of buckets.
      if expand_top_level_buckets:
        url = StorageUrlFromString(bucket_url_string)
        for obj_or_prefix in self.gsutil_api.ListObjects(
            url.bucket_name, delimiter='/',
            all_versions=self.all_versions,
            provider=self.wildcard_url.scheme,
            fields=bucket_listing_fields):
          if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT:
            yield self._GetObjectRef(
                bucket_url_string, obj_or_prefix.data,
                with_version=self.all_versions)
          else:  # CloudApi.CsObjectOrPrefixType.PREFIX:
            yield self._GetPrefixRef(bucket_url_string, obj_or_prefix.data)
      else:
        yield bucket_listing_ref
    else:
      # By default, assume a non-wildcarded URL is an object, not a prefix.
      # This prevents unnecessary listings (which are slower, more
      # expensive, and also subject to eventual consistency).
      if (not ContainsWildcard(self.wildcard_url.url_string) and
          self.wildcard_url.IsObject() and not self.all_versions):
        try:
          get_object = self.gsutil_api.GetObjectMetadata(
              self.wildcard_url.bucket_name,
              self.wildcard_url.object_name,
              generation=self.wildcard_url.generation,
              provider=self.wildcard_url.scheme,
              fields=get_fields)
          yield self._GetObjectRef(
              self.wildcard_url.bucket_url_string, get_object,
              with_version=(self.all_versions or single_version_request))
          # Found the exact object; no listing needed.
          return
        except (NotFoundException, AccessDeniedException):
          # It's possible this is a prefix - try to list instead.
          pass

      # Expand iteratively by building prefix/delimiter bucket listing
      # request, filtering the results per the current level's wildcard
      # (if present), and continuing with the next component of the
      # wildcard. See _BuildBucketFilterStrings() documentation for
      # details.
      if single_version_request:
        # Versioned URL: carry the '#generation' suffix through expansion.
        url_string = '%s%s#%s' % (bucket_url_string,
                                  self.wildcard_url.object_name,
                                  self.wildcard_url.generation)
      else:
        # Rstrip any prefixes to correspond with rstripped prefix wildcard
        # from _BuildBucketFilterStrings().
        url_string = '%s%s' % (
            bucket_url_string,
            StripOneSlash(self.wildcard_url.object_name)
            or '/')  # Cover root object named '/' case.
      # Work queue of partially-expanded URLs, processed FIFO; each
      # iteration expands one wildcard component via a bucket listing.
      urls_needing_expansion = [url_string]
      while urls_needing_expansion:
        url = StorageUrlFromString(urls_needing_expansion.pop(0))
        (prefix, delimiter, prefix_wildcard, suffix_wildcard) = (
            self._BuildBucketFilterStrings(url.object_name))
        # fnmatch.translate converts the shell-style wildcard into a
        # regex used to filter the listing results client-side.
        prog = re.compile(fnmatch.translate(prefix_wildcard))

        # If we have a suffix wildcard, we only care about listing
        # prefixes.
        listing_fields = (
            set(['prefixes']) if suffix_wildcard
            else bucket_listing_fields)

        # List bucket for objects matching prefix up to delimiter.
        for obj_or_prefix in self.gsutil_api.ListObjects(
            url.bucket_name, prefix=prefix, delimiter=delimiter,
            all_versions=self.all_versions or single_version_request,
            provider=self.wildcard_url.scheme, fields=listing_fields):
          if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT:
            gcs_object = obj_or_prefix.data
            if prog.match(gcs_object.name):
              if not suffix_wildcard or (
                  StripOneSlash(gcs_object.name) == suffix_wildcard):
                if not single_version_request or (
                    self._SingleVersionMatches(gcs_object.generation)):
                  yield self._GetObjectRef(
                      bucket_url_string, gcs_object, with_version=(
                          self.all_versions or single_version_request))
          else:  # CloudApi.CsObjectOrPrefixType.PREFIX
            # NOTE: rebinds the local 'prefix' (previously the listing
            # request prefix) to the returned prefix string.
            prefix = obj_or_prefix.data

            if ContainsWildcard(prefix):
              # TODO: Disambiguate user-supplied strings from iterated
              # prefix and object names so that we can better reason
              # about wildcards and handle this case without raising an
              # error.
              raise CommandException(
                  'Cloud folder %s%s contains a wildcard; gsutil does '
                  'not currently support objects with wildcards in their '
                  'name.' % (bucket_url_string, prefix))

            # If the prefix ends with a slash, remove it. Note that we
            # only remove one slash so that we can successfully enumerate
            # dirs containing multiple slashes.
            rstripped_prefix = StripOneSlash(prefix)
            if prog.match(rstripped_prefix):
              if suffix_wildcard and rstripped_prefix != suffix_wildcard:
                # There's more wildcard left to expand.
                url_append_string = '%s%s' % (
                    bucket_url_string,
                    rstripped_prefix + '/' + suffix_wildcard)
                urls_needing_expansion.append(url_append_string)
              else:
                # No wildcard to expand, just yield the prefix
                yield self._GetPrefixRef(bucket_url_string, prefix)
def _BuildBucketFilterStrings(self, wildcard):
  """Builds the listing-request and result-filter strings for a wildcard.

  This implements wildcard object name matching: one call handles one
  level of a possibly multi-part wildcard.

  Args:
    wildcard: The wildcard string to match to objects.

  Returns:
    Tuple (prefix, delimiter, prefix_wildcard, suffix_wildcard):
      prefix: prefix to be sent in the bucket GET request (None when the
          object name begins with a wildcard character).
      delimiter: delimiter to be sent in the bucket GET request (None for
          recursive '**' wildcards).
      prefix_wildcard: wildcard to be used to filter bucket GET results.
      suffix_wildcard: wildcard to be appended to filtered bucket GET
          results for the next wildcard expansion iteration.

  Example: for gs://bucket/abc/d*e/f*.txt this returns prefix='abc/d',
  delimiter='/', prefix_wildcard='d*e', suffix_wildcard='f*.txt'. A
  bucket listing with that prefix and delimiter yields a result set that
  prefix_wildcard can filter, and suffix_wildcard is fed into the next
  call(s) to _BuildBucketFilterStrings() for the following iteration.
  """
  match = WILDCARD_REGEX.search(wildcard)
  if match is None:
    # No wildcard chars in the input, so return a tuple that makes a
    # bucket listing match the literal name. E.g. if a prior iteration
    # yielded gs://bucket/dir/ with suffix_wildcard 'abc', this call gets
    # gs://bucket/dir/abc and returns prefix='dir/abc', delimiter='/',
    # prefix_wildcard='dir/abc', suffix_wildcard=''.
    prefix, delimiter = wildcard, '/'
    prefix_wildcard, suffix_wildcard = wildcard, ''
  else:
    if match.start():
      # Wildcard does not begin the object name: the literal leading text
      # becomes the prefix string sent to the server.
      prefix = wildcard[:match.start()]
      wildcard_part = wildcard[match.start():]
    else:
      prefix = None
      wildcard_part = wildcard
    slash_pos = wildcard_part.find('/')
    if slash_pos >= 0:
      # Keep up through the first '/'; the StripOneSlash call below drops
      # it so gs://bucket/abc* and gs://bucket/abc*/ match via the same
      # wildcard regex.
      wildcard_part = wildcard_part[:slash_pos + 1]
    prefix_wildcard = StripOneSlash((prefix or '') + wildcard_part)
    remainder = wildcard[match.end():]
    slash_pos = remainder.find('/')
    suffix_wildcard = remainder[slash_pos + 1:] if slash_pos >= 0 else ''
    # Recursive (**) wildcarding: when prefix_wildcard contains '**',
    # send no delimiter and fold suffix_wildcard onto the end of
    # prefix_wildcard so one listing spans all levels.
    if '**' in prefix_wildcard:
      delimiter = None
      prefix_wildcard += suffix_wildcard
      suffix_wildcard = ''
    else:
      delimiter = '/'
  # Debug trace for following the algorithm through a multi-part wildcard
  # like gs://bucket/abc/d*e/f*.txt.
  if self.debug > 1:
    sys.stderr.write(
        'DEBUG: wildcard=%s, prefix=%s, delimiter=%s, '
        'prefix_wildcard=%s, suffix_wildcard=%s\n' %
        (wildcard, prefix, delimiter, prefix_wildcard, suffix_wildcard))
  return (prefix, delimiter, prefix_wildcard, suffix_wildcard)