def _attempt_resumable_download(self, key, fp, headers, cb, num_cb,
                                torrent, version_id, hash_algs):
    """
    Attempts a resumable download.

    Raises ResumableDownloadException if any problems occur.
    """
    cur_file_size = get_cur_file_size(fp, position_to_eof=True)

    if (cur_file_size and
            self.etag_value_for_current_download and
            self.etag_value_for_current_download == key.etag.strip('"\'')):
        # Try to resume existing transfer.
        if cur_file_size > key.size:
            raise ResumableDownloadException(
                '%s is larger (%d) than %s (%d).\nDeleting tracker file, so '
                'if you re-try this download it will start from scratch' %
                (fp.name, cur_file_size, str(storage_uri_for_key(key)),
                 key.size), ResumableTransferDisposition.ABORT)
        elif cur_file_size == key.size:
            if key.bucket.connection.debug >= 1:
                print('Download complete.')
            return
        if key.bucket.connection.debug >= 1:
            print('Resuming download.')
        headers = headers.copy()
        headers['Range'] = 'bytes=%d-%d' % (cur_file_size, key.size - 1)
        cb = ByteTranslatingCallbackHandler(cb, cur_file_size).call
        self.download_start_point = cur_file_size
    else:
        if key.bucket.connection.debug >= 1:
            print('Starting new resumable download.')
        self._save_tracker_info(key)
        self.download_start_point = 0
        # Truncate the file, in case a new resumable download is being
        # started atop an existing file.
        fp.truncate(0)

    # Disable AWSAuthConnection-level retry behavior, since that would
    # cause downloads to restart from scratch.
    if isinstance(key, GSKey):
        key.get_file(fp, headers, cb, num_cb, torrent, version_id,
                     override_num_retries=0, hash_algs=hash_algs)
    else:
        key.get_file(fp, headers, cb, num_cb, torrent, version_id,
                     override_num_retries=0)
    fp.flush()
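
# Illustrative sketch (not part of the original handler): the resume logic
# above boils down to two pieces -- an HTTP Range header that asks the
# service only for the missing suffix of the object, and a callback wrapper
# that offsets progress reports by the bytes already on disk (the role
# ByteTranslatingCallbackHandler plays). The helper names below are
# hypothetical and exist only to show the arithmetic.


def _example_range_header(cur_file_size, key_size):
    # Request bytes [cur_file_size, key_size - 1], i.e. everything not yet
    # on disk. HTTP Range end offsets are inclusive.
    return {'Range': 'bytes=%d-%d' % (cur_file_size, key_size - 1)}


def _example_offset_callback(cb, already_downloaded, full_size):
    # Wrap a (bytes_so_far, total_bytes) progress callback so the caller
    # sees progress against the full object, not just the resumed range.
    def call(transferred_in_range, _range_total):
        cb(already_downloaded + transferred_in_range, full_size)
    return call
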
def _save_tracker_info(self, key):
    self.etag_value_for_current_download = key.etag.strip('"\'')
    if not self.tracker_file_name:
        return
    f = None
    try:
        f = open(self.tracker_file_name, 'w')
        f.write('%s\n' % self.etag_value_for_current_download)
    except IOError as e:
        raise ResumableDownloadException(
            'Couldn\'t write tracker file (%s): %s.\nThis can happen '
            'if you\'re using an incorrectly configured download tool\n'
            '(e.g., gsutil configured to save tracker files to an '
            'unwritable directory)' %
            (self.tracker_file_name, e.strerror),
            ResumableTransferDisposition.ABORT)
    finally:
        # Make sure the tracker file handle is closed even if the write
        # fails partway through.
        if f:
            f.close()
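
# Illustrative sketch (not the module's actual loader): _save_tracker_info()
# above writes a single line -- the object's etag -- to the tracker file. A
# later run can read that line back and compare it against the key's current
# etag to decide whether the partial file on disk is still resumable. The
# function name below is hypothetical.


def _example_read_tracker_etag(tracker_file_name):
    try:
        with open(tracker_file_name, 'r') as f:
            # The file holds one line: the etag recorded when the download
            # started, followed by a newline.
            return f.readline().rstrip('\n')
    except IOError:
        # A missing or unreadable tracker file means we cannot resume.
        return None
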
def _check_final_md5(self, key, file_name):
    """
    Checks that the etag from the server agrees with the md5 computed
    after the download completes. This is important, since the download
    could have spanned a number of hours and multiple processes (e.g.,
    gsutil runs), and the user could change some of the file and not
    realize they have inconsistent data.
    """
    # Open in binary mode so the md5 is computed over the raw bytes.
    fp = open(file_name, 'rb')
    if key.bucket.connection.debug >= 1:
        print('Checking md5 against etag.')
    hex_md5 = key.compute_md5(fp)[0]
    if hex_md5 != key.etag.strip('"\''):
        file_name = fp.name
        fp.close()
        os.unlink(file_name)
        raise ResumableDownloadException(
            'File changed during download: md5 signature doesn\'t match '
            'etag (incorrect downloaded file deleted)',
            ResumableTransferDisposition.ABORT)
    fp.close()
def get_file(self, key, fp, headers, cb=None, num_cb=10, torrent=False,
             version_id=None, hash_algs=None):
    """
    Retrieves a file from a Key.

    :type key: :class:`boto.s3.key.Key` or subclass
    :param key: The Key object from which the file is to be downloaded

    :type fp: file
    :param fp: File pointer into which data should be downloaded

    :type headers: dict
    :param headers: Headers to send when retrieving the file

    :type cb: function
    :param cb: (optional) a callback function that will be called to
        report progress on the download. The callback should accept two
        integer parameters, the first representing the number of bytes
        that have been successfully transmitted from the storage service
        and the second representing the total number of bytes that need
        to be transmitted.

    :type num_cb: int
    :param num_cb: (optional) If a callback is specified with the cb
        parameter, this parameter determines the granularity of the
        callback by defining the maximum number of times the callback
        will be called during the file transfer.

    :type torrent: bool
    :param torrent: Flag for whether to get a torrent for the file

    :type version_id: string
    :param version_id: The version ID (optional)

    :type hash_algs: dictionary
    :param hash_algs: (optional) Dictionary of hash algorithms and
        corresponding hashing classes that implement update() and digest().
        Defaults to {'md5': hashlib.md5}.

    Raises ResumableDownloadException if a problem occurs during
    the transfer.
    """
    debug = key.bucket.connection.debug
    if not headers:
        headers = {}

    # Use num-retries from constructor if one was provided; else check
    # for a value specified in the boto config file; else default to 6.
    if self.num_retries is None:
        self.num_retries = config.getint('Boto', 'num_retries', 6)
    progress_less_iterations = 0

    while True:  # Retry as long as we're making progress.
        had_file_bytes_before_attempt = get_cur_file_size(fp)
        try:
            self._attempt_resumable_download(key, fp, headers, cb, num_cb,
                                             torrent, version_id, hash_algs)
            # Download succeeded, so remove the tracker file (if we have one).
            self._remove_tracker_file()
            # Previously, check_final_md5() was called here to validate the
            # downloaded file's checksum; however, to be consistent with
            # non-resumable downloads, that call was removed. Checksum
            # validation of file contents should be done by the caller.
            if debug >= 1:
                print('Resumable download complete.')
            return
        except self.RETRYABLE_EXCEPTIONS as e:
            if debug >= 1:
                print('Caught exception (%s)' % e.__repr__())
            if isinstance(e, IOError) and e.errno == errno.EPIPE:
                # Broken pipe error causes httplib to immediately
                # close the socket (http://bugs.python.org/issue5542),
                # so we need to close and reopen the key before resuming
                # the download.
                if isinstance(key, GSKey):
                    key.get_file(fp, headers, cb, num_cb, torrent,
                                 version_id, override_num_retries=0,
                                 hash_algs=hash_algs)
                else:
                    key.get_file(fp, headers, cb, num_cb, torrent,
                                 version_id, override_num_retries=0)
        except ResumableDownloadException as e:
            if (e.disposition ==
                    ResumableTransferDisposition.ABORT_CUR_PROCESS):
                if debug >= 1:
                    print('Caught non-retryable ResumableDownloadException '
                          '(%s)' % e.message)
                raise
            elif e.disposition == ResumableTransferDisposition.ABORT:
                if debug >= 1:
                    print('Caught non-retryable ResumableDownloadException '
                          '(%s); aborting and removing tracker file' %
                          e.message)
                self._remove_tracker_file()
                raise
            else:
                if debug >= 1:
                    print('Caught ResumableDownloadException (%s) - will '
                          'retry' % e.message)

        # At this point we had a re-tryable failure; see if we made progress.
        if get_cur_file_size(fp) > had_file_bytes_before_attempt:
            progress_less_iterations = 0
        else:
            progress_less_iterations += 1

        if progress_less_iterations > self.num_retries:
            # Don't retry any longer in the current process.
            raise ResumableDownloadException(
                'Too many resumable download attempts failed without '
                'progress. You might try this download again later',
                ResumableTransferDisposition.ABORT_CUR_PROCESS)

        # Close the key, in case a previous download died partway
        # through and left data in the underlying key HTTP buffer.
        # Do this within a try/except block in case the connection is
        # closed (since key.close() attempts to do a final read, in which
        # case this read attempt would get an IncompleteRead exception,
        # which we can safely ignore).
        try:
            key.close()
        except httplib.IncompleteRead:
            pass

        sleep_time_secs = 2 ** progress_less_iterations
        if debug >= 1:
            print('Got retryable failure (%d progress-less in a row).\n'
                  'Sleeping %d seconds before re-trying' %
                  (progress_less_iterations, sleep_time_secs))
        time.sleep(sleep_time_secs)
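
# Usage sketch (assumptions flagged below): a handler like this one is
# typically handed to a key's download call via a res_download_handler
# keyword argument -- an assumption based on how boto wires these handlers
# in; check the Key download methods in your boto version. The bucket and
# object names and the tracker-file path are placeholders, and the progress
# callback just demonstrates the two-integer signature described in
# get_file() above.


def _example_usage():
    import boto
    from boto.s3.resumable_download_handler import ResumableDownloadHandler

    def show_progress(bytes_transferred, total_bytes):
        # cb receives (bytes so far, total bytes), per the get_file() docs.
        print('%d / %d bytes' % (bytes_transferred, total_bytes))

    conn = boto.connect_s3()
    key = conn.get_bucket('example-bucket').get_key('example-object')
    handler = ResumableDownloadHandler(
        tracker_file_name='/tmp/example-object.tracker', num_retries=6)
    with open('/tmp/example-object', 'wb') as fp:
        key.get_contents_to_file(fp, cb=show_progress,
                                 res_download_handler=handler)
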