def CatUrlStrings(self, url_strings, show_header=False, start_byte=0,
                  end_byte=None):
  """Prints each of the url strings to stdout.

  Args:
    url_strings: String iterable.
    show_header: If true, print a header per file.
    start_byte: Starting byte of the file to print, used for constructing
                range requests.
    end_byte: Ending byte of the file to print; used for constructing range
              requests. If this is negative, the start_byte is ignored and
              an end range is sent over HTTP (such as range: bytes -9).

  Returns:
    0 on success.

  Raises:
    CommandException if no URLs can be found.
  """
  printed_one = False
  # We redirect stdout so that all data other than the object contents goes
  # to stderr.
  cat_outfd = sys.stdout
  sys.stdout = sys.stderr
  try:
    for url_str in url_strings:
      did_some_work = False
      # TODO: Get only the needed fields here.
      for blr in self.command_obj.WildcardIterator(url_str).IterObjects():
        did_some_work = True
        if show_header:
          if printed_one:
            print
          print '==> %s <==' % blr
          printed_one = True
        cat_object = blr.root_object
        storage_url = StorageUrlFromString(blr.url_string)
        if storage_url.IsCloudUrl():
          self.command_obj.gsutil_api.GetObjectMedia(
              cat_object.bucket, cat_object.name, cat_outfd,
              start_byte=start_byte, end_byte=end_byte,
              object_size=cat_object.size,
              generation=storage_url.generation,
              provider=storage_url.scheme)
        else:
          # Use a with-statement so the local file handle is closed rather
          # than leaked.
          with open(storage_url.object_name, 'rb') as f:
            cat_outfd.write(f.read())
      if not did_some_work:
        raise CommandException('No URLs matched %s' % url_str)
  finally:
    sys.stdout = cat_outfd
  return 0
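# A minimal illustrative sketch (not part of gsutil) of how the
# start_byte/end_byte semantics documented above could map onto an HTTP
# Range header value. The helper name and the exact inclusive/exclusive
# bounds here are assumptions for illustration only; the one behavior the
# docstring does specify is that a negative end_byte ignores start_byte and
# requests a suffix range such as 'bytes=-9' (the last 9 bytes).
def _RangeHeaderForBytes(start_byte=0, end_byte=None):
  """Hypothetical helper: builds a Range header value from byte offsets."""
  if end_byte is not None and end_byte < 0:
    # Suffix range: the last abs(end_byte) bytes of the object.
    return 'bytes=%d' % end_byte  # e.g. 'bytes=-9'
  if end_byte is not None:
    return 'bytes=%d-%d' % (start_byte, end_byte)  # bounded range
  return 'bytes=%d-' % start_byte  # open-ended range from start_byte

# Example: _RangeHeaderForBytes(end_byte=-9) -> 'bytes=-9'
#          _RangeHeaderForBytes(5, 100)      -> 'bytes=5-100'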
def __iter__(self):
  """Iterates over all source URLs passed to the iterator.

  For each src url, expands wildcards, object-less bucket names,
  subdir bucket names, and directory names, and generates a flat listing
  of all the matching objects/files.

  You should instantiate this object using the static factory function
  NameExpansionIterator, because consumers of this iterator need the
  PluralityCheckableIterator wrapper built by that function.

  Yields:
    gslib.name_expansion.NameExpansionResult.

  Raises:
    CommandException: if errors encountered.
  """
  for url_str in self.url_strs:
    storage_url = StorageUrlFromString(url_str)

    if storage_url.IsFileUrl() and storage_url.IsStream():
      if self.url_strs.has_plurality:
        raise CommandException('Multiple URL strings are not supported '
                               'with streaming ("-") URLs.')
      yield NameExpansionResult(storage_url, False, False, storage_url)
      continue

    # Step 1: Expand any explicitly specified wildcards. The output from
    # this step is an iterator of BucketListingRef.
    # Starting with gs://buck*/abc* this step would expand to
    # gs://bucket/abcd.

    src_names_bucket = False
    if (storage_url.IsCloudUrl() and storage_url.IsBucket()
        and not self.recursion_requested):
      # UNIX commands like rm and cp will omit directory references.
      # If url_str refers only to buckets and we are not recursing,
      # then produce references of type BUCKET, because they are
      # guaranteed to pass through Step 2 and be omitted in Step 3.
      post_step1_iter = PluralityCheckableIterator(
          self.WildcardIterator(url_str).IterBuckets(
              bucket_fields=['id']))
    else:
      # Get a list of objects and prefixes, expanding the top level for
      # any listed buckets. If our source is a bucket, however, we need
      # to treat all of the top-level expansions as names_container=True.
      post_step1_iter = PluralityCheckableIterator(
          self.WildcardIterator(url_str).IterAll(
              bucket_listing_fields=['name'],
              expand_top_level_buckets=True))
      if storage_url.IsCloudUrl() and storage_url.IsBucket():
        src_names_bucket = True

    # Step 2: Expand bucket subdirs. The output from this step is an
    # iterator of (names_container, BucketListingRef).
    # Starting with gs://bucket/abcd this step would expand to:
    #   iter([(True, abcd/o1.txt), (True, abcd/o2.txt)]).
    subdir_exp_wildcard = self._flatness_wildcard[self.recursion_requested]
    if self.recursion_requested:
      post_step2_iter = _ImplicitBucketSubdirIterator(
          self, post_step1_iter, subdir_exp_wildcard)
    else:
      post_step2_iter = _NonContainerTuplifyIterator(post_step1_iter)
    post_step2_iter = PluralityCheckableIterator(post_step2_iter)

    # Because we actually perform and check object listings here, this
    # will raise if url_args includes a non-existent object. However,
    # plurality_checkable_iterator will buffer the exception for us, not
    # raising it until the iterator is actually asked to yield the first
    # result.
    if post_step2_iter.IsEmpty():
      if self.continue_on_error:
        try:
          raise CommandException('No URLs matched: %s' % url_str)
        except CommandException, e:
          # Yield a specialized tuple of (exception, stack_trace) to
          # the wrapping PluralityCheckableIterator.
          yield (e, sys.exc_info()[2])
      else:
        raise CommandException('No URLs matched: %s' % url_str)

    # Step 3: Omit any directories, buckets, or bucket subdirectories for
    # non-recursive expansions.
    post_step3_iter = PluralityCheckableIterator(
        _OmitNonRecursiveIterator(post_step2_iter,
                                  self.recursion_requested,
                                  self.command_name,
                                  self.cmd_supports_recursion, self.logger))

    src_url_expands_to_multi = post_step3_iter.HasPlurality()
    is_multi_source_request = (self.url_strs.has_plurality
                               or src_url_expands_to_multi)

    # Step 4: Expand directories and buckets. This step yields the
    # iterated values. Starting with gs://bucket this step would expand
    # to:
    #   [abcd/o1.txt, abcd/o2.txt, xyz/o1.txt, xyz/o2.txt]
    # Starting with file://dir this step would expand to:
    #   [dir/a.txt, dir/b.txt, dir/c/]
    for (names_container, blr) in post_step3_iter:
      src_names_container = src_names_bucket or names_container

      if blr.IsObject():
        yield NameExpansionResult(storage_url, is_multi_source_request,
                                  src_names_container, blr.storage_url)
      else:
        # Use implicit wildcarding to do the enumeration.
        # At this point we are guaranteed that:
        # - Recursion has been requested because non-object entries are
        #   filtered in step 3 otherwise.
        # - This is a prefix or bucket subdirectory because only
        #   non-recursive iterations produce bucket references.
        expanded_url = StorageUrlFromString(blr.url_string)
        if expanded_url.IsFileUrl():
          # Convert dir to implicit recursive wildcard.
          url_to_iterate = '%s%s%s' % (blr, os.sep, subdir_exp_wildcard)
        else:
          # Convert subdir to implicit recursive wildcard.
          url_to_iterate = expanded_url.CreatePrefixUrl(
              wildcard_suffix=subdir_exp_wildcard)

        wc_iter = PluralityCheckableIterator(
            self.WildcardIterator(url_to_iterate).IterObjects(
                bucket_listing_fields=['name']))
        src_url_expands_to_multi = (src_url_expands_to_multi
                                    or wc_iter.HasPlurality())
        is_multi_source_request = (self.url_strs.has_plurality
                                   or src_url_expands_to_multi)
        # This will be a flattened listing of all underlying objects in
        # the subdir.
        for blr in wc_iter:
          yield NameExpansionResult(storage_url, is_multi_source_request,
                                    True, blr.storage_url)
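# A standalone sketch (not gsutil code) of the Step 4 conversion performed
# above: a container URL is turned into an implicit recursive wildcard so
# that the wildcard iterator can produce a flat object listing. The function
# below is hypothetical and only illustrates the string manipulation; '**'
# is assumed to be the flatness wildcard used for recursive requests.
import os

def _to_implicit_recursive_wildcard(url_str, is_file_url):
  """Hypothetical: append the recursive wildcard used for flattening."""
  subdir_exp_wildcard = '**'  # assumed flatness wildcard for recursion
  if is_file_url:
    # file://dir -> file://dir/** (os.sep-joined, as in Step 4 above).
    return '%s%s%s' % (url_str, os.sep, subdir_exp_wildcard)
  # gs://bucket/abcd -> gs://bucket/abcd/** (prefix URL plus wildcard).
  return '%s/%s' % (url_str.rstrip('/'), subdir_exp_wildcard)

# Example: _to_implicit_recursive_wildcard('gs://bucket/abcd', False)
# yields 'gs://bucket/abcd/**', which the wildcard iterator then expands
# into a flat listing like ['abcd/o1.txt', 'abcd/o2.txt'].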
def CatUrlStrings(self, url_strings, show_header=False, start_byte=0,
                  end_byte=None, cat_out_fd=None):
  """Prints each of the url strings to stdout.

  Args:
    url_strings: String iterable.
    show_header: If true, print a header per file.
    start_byte: Starting byte of the file to print, used for constructing
                range requests.
    end_byte: Ending byte of the file to print; used for constructing range
              requests. If this is negative, the start_byte is ignored and
              an end range is sent over HTTP (such as range: bytes -9).
    cat_out_fd: File descriptor to which output should be written. Defaults
                to stdout if no file descriptor is supplied.

  Returns:
    0 on success.

  Raises:
    CommandException if no URLs can be found.
    EncryptionException if an object is encrypted and no matching
        decryption key is found.
  """
  printed_one = False

  # This should refer to whatever sys.stdout refers to when this method is
  # run, not when this method is defined, so we do the initialization here
  # rather than define sys.stdout as the cat_out_fd parameter's default
  # value.
  if cat_out_fd is None:
    cat_out_fd = sys.stdout
  # We redirect stdout so that all data other than the object contents goes
  # to stderr.
  old_stdout = sys.stdout
  sys.stdout = sys.stderr
  try:
    if url_strings and url_strings[0] in ('-', 'file://-'):
      self._WriteBytesBufferedFileToFile(sys.stdin, cat_out_fd)
    else:
      for url_str in url_strings:
        did_some_work = False
        # TODO: Get only the needed fields here.
        for blr in self.command_obj.WildcardIterator(url_str).IterObjects(
            bucket_listing_fields=_CAT_BUCKET_LISTING_FIELDS):
          decryption_keywrapper = None
          if (blr.root_object and blr.root_object.customerEncryption
              and blr.root_object.customerEncryption.keySha256):
            decryption_key = FindMatchingCSEKInBotoConfig(
                blr.root_object.customerEncryption.keySha256, config)
            if not decryption_key:
              raise EncryptionException(
                  'Missing decryption key with SHA256 hash %s. No '
                  'decryption key matches object %s' %
                  (blr.root_object.customerEncryption.keySha256,
                   blr.url_string))
            decryption_keywrapper = CryptoKeyWrapperFromKey(decryption_key)

          did_some_work = True
          if show_header:
            if printed_one:
              print
            print '==> %s <==' % blr
            printed_one = True
          cat_object = blr.root_object
          storage_url = StorageUrlFromString(blr.url_string)
          if storage_url.IsCloudUrl():
            compressed_encoding = ObjectIsGzipEncoded(cat_object)
            self.command_obj.gsutil_api.GetObjectMedia(
                cat_object.bucket, cat_object.name, cat_out_fd,
                compressed_encoding=compressed_encoding,
                start_byte=start_byte, end_byte=end_byte,
                object_size=cat_object.size,
                generation=storage_url.generation,
                decryption_tuple=decryption_keywrapper,
                provider=storage_url.scheme)
          else:
            with open(storage_url.object_name, 'rb') as f:
              self._WriteBytesBufferedFileToFile(f, cat_out_fd)
        if not did_some_work:
          raise CommandException(NO_URLS_MATCHED_TARGET % url_str)
  finally:
    sys.stdout = old_stdout
  return 0
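# A minimal sketch (not gsutil's implementation) of the CSEK matching step
# performed above: GCS reports customerEncryption.keySha256, the
# base64-encoded SHA256 hash of the customer-supplied AES-256 key, and a
# candidate key matches if it hashes to that value. The function name below
# is hypothetical.
import base64
import hashlib

def _matches_key_sha256(b64_csek, expected_b64_sha256):
  """Hypothetical: True if the base64 CSEK hashes to the expected value."""
  raw_key = base64.b64decode(b64_csek)
  digest = base64.b64encode(hashlib.sha256(raw_key).digest()).decode('ascii')
  return digest == expected_b64_sha256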
def CatUrlStrings(self, url_strings, show_header=False, start_byte=0,
                  end_byte=None):
  """Prints each of the url strings to stdout.

  Args:
    url_strings: String iterable.
    show_header: If true, print a header per file.
    start_byte: Starting byte of the file to print, used for constructing
                range requests.
    end_byte: Ending byte of the file to print; used for constructing range
              requests. If this is negative, the start_byte is ignored and
              an end range is sent over HTTP (such as range: bytes -9).

  Returns:
    0 on success.

  Raises:
    CommandException if no URLs can be found.
    EncryptionException if an object is encrypted and no matching
        decryption key is found.
  """
  printed_one = False
  # We redirect stdout so that all data other than the object contents goes
  # to stderr.
  cat_outfd = sys.stdout
  sys.stdout = sys.stderr
  try:
    for url_str in url_strings:
      did_some_work = False
      # TODO: Get only the needed fields here.
      for blr in self.command_obj.WildcardIterator(url_str).IterObjects():
        decryption_tuple = None
        if (blr.root_object and blr.root_object.customerEncryption
            and blr.root_object.customerEncryption.keySha256):
          decryption_key = FindMatchingCryptoKey(
              blr.root_object.customerEncryption.keySha256)
          if not decryption_key:
            raise EncryptionException(
                'Missing decryption key with SHA256 hash %s. No decryption '
                'key matches object %s' %
                (blr.root_object.customerEncryption.keySha256,
                 blr.url_string))
          decryption_tuple = CryptoTupleFromKey(decryption_key)

        did_some_work = True
        if show_header:
          if printed_one:
            print
          print '==> %s <==' % blr
          printed_one = True
        cat_object = blr.root_object
        storage_url = StorageUrlFromString(blr.url_string)
        if storage_url.IsCloudUrl():
          compressed_encoding = ObjectIsGzipEncoded(cat_object)
          self.command_obj.gsutil_api.GetObjectMedia(
              cat_object.bucket, cat_object.name, cat_outfd,
              compressed_encoding=compressed_encoding,
              start_byte=start_byte, end_byte=end_byte,
              object_size=cat_object.size,
              generation=storage_url.generation,
              decryption_tuple=decryption_tuple,
              provider=storage_url.scheme)
        else:
          # Use a with-statement so the local file handle is closed rather
          # than leaked.
          with open(storage_url.object_name, 'rb') as f:
            cat_outfd.write(f.read())
      if not did_some_work:
        raise CommandException(NO_URLS_MATCHED_TARGET % url_str)
  finally:
    sys.stdout = cat_outfd
  return 0
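# The stream handling shared by all three CatUrlStrings variants above can
# be summarized in isolation: sys.stdout is swapped to sys.stderr for the
# duration of the call so that only object bytes reach the real stdout, and
# the original stream is restored in a finally block even if an exception
# escapes. A minimal standalone sketch, with a hypothetical helper name:
import sys

def _with_stdout_redirected_to_stderr(fn):
  """Hypothetical helper: run fn with sys.stdout aliased to sys.stderr."""
  old_stdout = sys.stdout
  sys.stdout = sys.stderr
  try:
    # Any prints inside fn now go to stderr; object contents should be
    # written to the saved old_stdout handle directly.
    return fn(old_stdout)
  finally:
    sys.stdout = old_stdout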