def drain(self):
    """
    Read and fail all requests remaining in the queue.
    """
    for request in NectarFeed(self):
        report = NectarDownloadReport.from_download_request(request)
        self.downloader.fire_download_failed(report)
def _common_link(self, link_method, request, report=None):
    """
    Link files using either a hard link or symbolic link method.

    :param link_method: hard link or symbolic link method
    :type link_method: callable
    :param request: request instance
    :type request: nectar.request.DownloadRequest
    :param report: report instance for the request
    :type report: nectar.report.DownloadReport
    :return: report instance
    :rtype: nectar.report.DownloadReport
    """
    report = report or DownloadReport.from_download_request(request)
    report.download_started()
    self.fire_download_started(report)

    if self.is_canceled:
        report.download_canceled()
        return report

    try:
        if not isinstance(request.destination, basestring):
            raise UnlinkableDestination(request.destination)

        src_path = self._file_path_from_url(request.url)
        link_method(src_path, request.destination)

    except Exception, e:
        _LOG.exception(e)
        report.error_msg = str(e)
        report.download_failed()
def _copy(self, request, report=None):
    """
    Copy the source file to the destination.

    This is the default behavior and most useful for files that live on
    different disk partitions or networked file systems.

    :param request: request instance
    :type request: nectar.request.DownloadRequest
    :param report: report instance for the request
    :type report: nectar.report.DownloadReport
    :return: report instance
    :rtype: nectar.report.DownloadReport
    """
    report = report or DownloadReport.from_download_request(request)
    report.download_started()

    src_handle = None

    try:
        src_path = self._file_path_from_url(request.url)
        src_handle = open(src_path, 'rb')
        dst_handle = request.initialize_file_handle()

        buffer_size = self.buffer_size
        self.fire_download_started(report)
        last_progress_update = datetime.datetime.now()

        while True:
            if self.is_canceled:
                report.download_canceled()
                # NOTE the control flow here will pass through the finally
                # block on the way out, but not the else block :D
                return report

            chunk = src_handle.read(buffer_size)

            if not chunk:
                break

            dst_handle.write(chunk)
            report.bytes_downloaded += len(chunk)

            now = datetime.datetime.now()

            if now - last_progress_update < self.progress_interval:
                continue

            self.fire_download_progress(report)
            last_progress_update = now

    except Exception, e:
        logger.exception(e)
        report.error_msg = str(e)
        report.download_failed()
def _copy(self, request, report=None):
    """
    Copy the source file to the destination.

    This is the default behavior and most useful for files that live on
    different disk partitions or networked file systems.

    :param request: request instance
    :type request: nectar.request.DownloadRequest
    :param report: report instance for the request
    :type report: nectar.report.DownloadReport
    :return: report instance
    :rtype: nectar.report.DownloadReport
    """
    report = report or DownloadReport.from_download_request(request)
    report.download_started()

    src_handle = None

    try:
        src_path = self._file_path_from_url(request.url)
        src_handle = open(src_path, 'rb')
        dst_handle = request.initialize_file_handle()

        buffer_size = self.buffer_size
        self.fire_download_started(report)
        last_progress_update = datetime.datetime.now()

        while True:
            if self.is_canceled or request.canceled:
                report.download_canceled()
                # NOTE the control flow here will pass through the finally
                # block on the way out, but not the else block :D
                return report

            chunk = src_handle.read(buffer_size)

            if not chunk:
                break

            dst_handle.write(chunk)
            report.bytes_downloaded += len(chunk)

            now = datetime.datetime.now()

            if now - last_progress_update < self.progress_interval:
                continue

            self.fire_download_progress(report)
            last_progress_update = now

    except IOError, e:
        logger.debug(e)
        report.error_msg = str(e)
        report.download_failed()
def test_calls_fetch(self, mock_fetch):
    config = DownloaderConfig()
    request = DownloadRequest('http://foo', StringIO())
    report = DownloadReport.from_download_request(request)
    downloader = threaded.HTTPThreadedDownloader(config)
    mock_fetch.return_value = report

    ret = downloader._download_one(request)

    self.assertEqual(mock_fetch.call_count, 1)
    self.assertTrue(ret is report)
    self.assertTrue(mock_fetch.call_args[0][0] is request)
def test_calls_download_method(self):
    config = DownloaderConfig()
    listener = AggregatingEventListener()
    downloader = local.LocalFileDownloader(config, listener)
    request = DownloadRequest('http://foo', StringIO())
    report = DownloadReport.from_download_request(request)

    # mock _copy, which is the default function to which requests are passed
    with mock.patch.object(downloader, '_copy') as mock_method:
        mock_method.return_value = report
        ret = downloader._download_one(request)

        self.assertEqual(ret, report)
        mock_method.assert_called_once_with(request)
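# A minimal usage sketch mirroring the test above, driving the same
# _download_one entry point outside the test suite. The import paths for
# DownloaderConfig and AggregatingEventListener, and the file:// URL and
# destination, are assumptions for illustration only.
from StringIO import StringIO

from nectar.config import DownloaderConfig
from nectar.listener import AggregatingEventListener
from nectar.request import DownloadRequest
from nectar.downloaders import local

config = DownloaderConfig()
listener = AggregatingEventListener()
downloader = local.LocalFileDownloader(config, listener)

# the destination may be a file-like object, as in the tests above
request = DownloadRequest('file:///tmp/example.iso', StringIO())
report = downloader._download_one(request)
print report.state, report.bytes_downloaded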
def _common_link(self, link_method, request, report=None):
    """
    Link files using either a hard link or symbolic link method.

    :param link_method: hard link or symbolic link method
    :type link_method: callable
    :param request: request instance
    :type request: nectar.request.DownloadRequest
    :param report: report instance for the request
    :type report: nectar.report.DownloadReport
    :return: report instance
    :rtype: nectar.report.DownloadReport
    """
    report = report or DownloadReport.from_download_request(request)
    report.download_started()
    self.fire_download_started(report)

    if self.is_canceled or request.canceled:
        report.download_canceled()
        return report

    try:
        if not isinstance(request.destination, basestring):
            raise UnlinkableDestination(request.destination)

        src_path = self._file_path_from_url(request.url)
        link_method(src_path, request.destination)
        report.bytes_downloaded = os.path.getsize(request.destination)

    except OSError, e:
        logger.debug(e)
        report.error_msg = str(e)
        report.download_failed()
def _fetch(self, request):
    """
    :param request: download request object with details about what to
                    download and where to put it
    :type request: nectar.request.DownloadRequest
    :return: download report
    :rtype: nectar.report.DownloadReport
    """
    headers = (self.config.headers or {}).copy()
    headers.update(request.headers or {})
    ignore_encoding, additional_headers = self._rfc2616_workaround(request)
    headers.update(additional_headers or {})
    max_speed = self._calculate_max_speed()  # None or integer in bytes/second

    report = DownloadReport.from_download_request(request)
    report.download_started()
    self.fire_download_started(report)
    netloc = urlparse.urlparse(request.url).netloc

    try:
        if self.is_canceled or request.canceled:
            raise DownloadCancelled(request.url)

        if netloc in self.failed_netlocs:
            raise SkipLocation()

        _logger.debug("Attempting to connect to {url}.".format(url=request.url))

        requests_kwargs = self.requests_kwargs_from_nectar_config(self.config)
        response = self.session.get(request.url, headers=headers,
                                    timeout=(self.config.connect_timeout,
                                             self.config.read_timeout),
                                    **requests_kwargs)
        report.headers = response.headers
        self.fire_download_headers(report)

        if response.status_code != httplib.OK:
            raise DownloadFailed(request.url, response.status_code, response.reason)

        progress_interval = self.progress_interval
        file_handle = request.initialize_file_handle()

        last_update_time = datetime.datetime.now()
        self.fire_download_progress(report)

        if ignore_encoding or self.config.stream:
            chunks = self.chunk_generator(response.raw, self.buffer_size)
        else:
            chunks = response.iter_content(self.buffer_size)

        for chunk in chunks:
            if self.is_canceled or request.canceled:
                raise DownloadCancelled(request.url)

            file_handle.write(chunk)

            bytes_read = len(chunk)
            report.bytes_downloaded += bytes_read

            now = datetime.datetime.now()

            if now - last_update_time >= progress_interval:
                last_update_time = now
                self.fire_download_progress(report)

            with self._bytes_lock:
                if now - self._time_bytes_this_second_was_cleared >= ONE_SECOND:
                    self._bytes_this_second = 0
                    self._time_bytes_this_second_was_cleared = now
                self._bytes_this_second += bytes_read

            if max_speed is not None and self._bytes_this_second >= max_speed:
                # it's not worth doing fancier mathematics than this, very
                # fine-grained sleep times [1] are not honored by the system
                # [1] for example, sleeping the remaining fraction of time
                #     before this second is up
                time.sleep(0.5)

        # guarantee 1 report at the end
        self.fire_download_progress(report)

    except SkipLocation:
        _logger.debug("Skipping {url} because {netloc} could not be reached.".format(
            url=request.url, netloc=netloc))
        report.download_skipped()

    except requests.ConnectionError as e:
        _logger.error(_('Skipping requests to {netloc} due to repeated connection'
                        ' failures: {e}').format(netloc=netloc, e=str(e)))
        self.failed_netlocs.add(netloc)
        report.download_connection_error()

    except requests.Timeout:
        # Handle a timeout differently than a connection error. Do not add to
        # failed_netlocs so that a new connection can be attempted.
        _logger.warning("Request Timeout - Connection with {url} timed out.".format(
            url=request.url))
        report.download_connection_error()

    except DownloadCancelled as e:
        _logger.info(str(e))
        report.download_canceled()

    except DownloadFailed as e:
        _logger.info('Download failed: %s' % str(e))
        report.error_msg = e.args[2]
        report.error_report['response_code'] = e.args[1]
        report.error_report['response_msg'] = e.args[2]
        report.download_failed()

    except Exception as e:
        _logger.exception(e)
        report.error_msg = str(e)
        report.download_failed()

    else:
        _logger.info("Download succeeded: {url}.".format(url=request.url))
        report.download_succeeded()

    request.finalize_file_handle()

    if report.state is DOWNLOAD_SUCCEEDED:
        self.fire_download_succeeded(report)
    else:  # DOWNLOAD_FAILED
        self.fire_download_failed(report)

    return report
def _fetch(self, request, session):
    """
    :param request: download request object with details about what to
                    download and where to put it
    :type request: nectar.request.DownloadRequest
    :param session: session object used by the requests library
    :type session: requests.sessions.Session
    """
    ignore_encoding, headers = self._rfc2616_workaround(request)
    max_speed = self._calculate_max_speed()  # None or integer in bytes/second

    report = DownloadReport.from_download_request(request)
    report.download_started()
    self.fire_download_started(report)

    try:
        if self.is_canceled:
            raise DownloadCancelled(request.url)

        response = session.get(request.url, headers=headers)

        if response.status_code != httplib.OK:
            raise DownloadFailed(request.url, response.status_code, response.reason)

        progress_interval = self.progress_interval
        file_handle = request.initialize_file_handle()

        last_update_time = datetime.datetime.now()
        self.fire_download_progress(report)

        if ignore_encoding:
            chunks = self.chunk_generator(response.raw, self.buffer_size)
        else:
            chunks = response.iter_content(self.buffer_size)

        for chunk in chunks:
            if self.is_canceled:
                raise DownloadCancelled(request.url)

            file_handle.write(chunk)

            bytes_read = len(chunk)
            report.bytes_downloaded += bytes_read

            now = datetime.datetime.now()

            if now - last_update_time >= progress_interval:
                last_update_time = now
                self.fire_download_progress(report)

            with self._bytes_lock:
                if now - self._time_bytes_this_second_was_cleared >= ONE_SECOND:
                    self._bytes_this_second = 0
                    self._time_bytes_this_second_was_cleared = now
                self._bytes_this_second += bytes_read

            if max_speed is not None and self._bytes_this_second >= max_speed:
                # it's not worth doing fancier mathematics than this, very
                # fine-grained sleep times [1] are not honored by the system
                # [1] for example, sleeping the remaining fraction of time
                #     before this second is up
                time.sleep(0.5)

        # guarantee 1 report at the end
        self.fire_download_progress(report)

    except DownloadCancelled, e:
        _LOG.debug(str(e))
        report.download_canceled()
def _fetch(self, request, session):
    """
    :param request: download request object with details about what to
                    download and where to put it
    :type request: nectar.request.DownloadRequest
    :param session: session object used by the requests library
    :type session: requests.sessions.Session
    """
    # this is to deal with broken web servers that violate RFC 2616 by sending
    # a header 'content-encoding: x-gzip' when it's really just a gzipped
    # file. In that case, we must ignore the declared encoding and thus prevent
    # the requests library from automatically decompressing the file.
    parse_url = urlparse.urlparse(request.url)
    if parse_url.path.endswith('.gz'):
        ignore_encoding = True
        # declare that we don't accept any encodings, so that if we do still
        # get a content-encoding value in the response, we know for sure the
        # other end is broken/misbehaving.
        headers = {'accept-encoding': ''}
    else:
        ignore_encoding = False
        headers = None

    max_speed = self.config.max_speed  # None or integer in bytes/second
    if max_speed is not None:
        # because we test *after* reading and only sleep for 1/2 second
        max_speed -= (2 * self.buffer_size)
        # because we cannot go slower
        max_speed = max(max_speed, (2 * self.buffer_size))

    report = DownloadReport.from_download_request(request)
    report.download_started()
    self.fire_download_started(report)

    try:
        if self.is_canceled:
            raise DownloadCancelled(request.url)

        response = session.get(request.url, headers=headers)
        if response.status_code != httplib.OK:
            raise DownloadFailed(request.url, response.status_code, response.reason)

        progress_interval = self.progress_interval
        file_handle = request.initialize_file_handle()

        last_update_time = datetime.datetime.now()
        self.fire_download_progress(report)  # guarantee 1 report at the beginning

        if ignore_encoding:
            chunks = self.chunk_generator(response.raw, self.buffer_size)
        else:
            chunks = response.iter_content(self.buffer_size)

        for chunk in chunks:
            if self.is_canceled:
                raise DownloadCancelled(request.url)

            file_handle.write(chunk)

            bytes_read = len(chunk)
            report.bytes_downloaded += bytes_read

            now = datetime.datetime.now()
            if now - last_update_time >= progress_interval:
                last_update_time = now
                self.fire_download_progress(report)

            if now - session.nectar_time_bytes_this_second_was_cleared >= ONE_SECOND:
                session.nectar_bytes_this_second = 0
                session.nectar_time_bytes_this_second_was_cleared = now
            session.nectar_bytes_this_second += bytes_read

            if max_speed is not None and session.nectar_bytes_this_second >= max_speed:
                # it's not worth doing fancier mathematics than this, very
                # fine-grained sleep times [1] are not honored by the system
                # [1] for example, sleeping the remaining fraction of time
                #     before this second is up
                time.sleep(0.5)

        self.fire_download_progress(report)  # guarantee 1 report at the end

    except DownloadCancelled, e:
        _LOG.debug(str(e))
        report.download_canceled()
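# Standalone sketch of the max_speed adjustment in the version above: because
# the limit is checked *after* a chunk has already been read and the loop only
# sleeps for half a second, the configured rate is lowered by two buffers and
# floored at two buffers per second. The buffer_size of 8192 bytes here is an
# assumed value for illustration, not necessarily nectar's default.
def effective_max_speed(configured_max_speed, buffer_size=8192):
    if configured_max_speed is None:
        return None
    adjusted = configured_max_speed - (2 * buffer_size)
    return max(adjusted, 2 * buffer_size)

print effective_max_speed(100000)  # a 100000 bytes/second limit becomes 83616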
def _fetch(self, request):
    """
    :param request: download request object with details about what to
                    download and where to put it
    :type request: nectar.request.DownloadRequest
    :return: download report
    :rtype: nectar.report.DownloadReport
    """
    headers = (request.headers or {}).copy()
    ignore_encoding, additional_headers = self._rfc2616_workaround(request)
    headers.update(additional_headers or {})
    max_speed = self._calculate_max_speed()  # None or integer in bytes/second

    report = DownloadReport.from_download_request(request)
    report.download_started()
    self.fire_download_started(report)
    netloc = urlparse.urlparse(request.url).netloc

    try:
        if self.is_canceled or request.canceled:
            raise DownloadCancelled(request.url)

        if netloc in self.failed_netlocs:
            raise SkipLocation()

        _logger.debug("Attempting to connect to {url}.".format(url=request.url))

        for attempt in xrange(self.tries):
            try:
                if attempt > 0:
                    msg = _("Re-trying {url} due to remote server connection failure.".format(
                        url=request.url))
                    _logger.warning(msg)
                response = self.session.get(request.url, headers=headers,
                                            timeout=(self.config.connect_timeout,
                                                     self.config.read_timeout))
                report.headers = response.headers
                self.fire_download_headers(report)
                break
            except requests.ConnectionError as e:
                if isinstance(e.strerror, httplib.BadStatusLine):
                    msg = _("Download of {url} failed. Re-trying.".format(url=request.url))
                    _logger.warning(msg)
                    continue
                raise
        else:
            raise RetryError(request.url)

        if response.status_code != httplib.OK:
            raise DownloadFailed(request.url, response.status_code, response.reason)

        progress_interval = self.progress_interval
        file_handle = request.initialize_file_handle()

        last_update_time = datetime.datetime.now()
        self.fire_download_progress(report)

        if ignore_encoding:
            chunks = self.chunk_generator(response.raw, self.buffer_size)
        else:
            chunks = response.iter_content(self.buffer_size)

        for chunk in chunks:
            if self.is_canceled or request.canceled:
                raise DownloadCancelled(request.url)

            file_handle.write(chunk)

            bytes_read = len(chunk)
            report.bytes_downloaded += bytes_read

            now = datetime.datetime.now()

            if now - last_update_time >= progress_interval:
                last_update_time = now
                self.fire_download_progress(report)

            with self._bytes_lock:
                if now - self._time_bytes_this_second_was_cleared >= ONE_SECOND:
                    self._bytes_this_second = 0
                    self._time_bytes_this_second_was_cleared = now
                self._bytes_this_second += bytes_read

            if max_speed is not None and self._bytes_this_second >= max_speed:
                # it's not worth doing fancier mathematics than this, very
                # fine-grained sleep times [1] are not honored by the system
                # [1] for example, sleeping the remaining fraction of time
                #     before this second is up
                time.sleep(0.5)

        # guarantee 1 report at the end
        self.fire_download_progress(report)

    except SkipLocation:
        _logger.debug("Skipping {url} because {netloc} could not be reached.".format(
            url=request.url, netloc=netloc))
        report.download_skipped()

    except requests.ConnectionError as e:
        _logger.warning("Connection Error - {url} could not be reached.".format(
            url=request.url))
        self.failed_netlocs.add(netloc)
        report.download_connection_error()

    except RetryError as e:
        _logger.warning(str(e))
        self.failed_netlocs.add(netloc)
        report.download_connection_error()

    except requests.Timeout:
        # Handle a timeout differently than a connection error. Do not add to
        # failed_netlocs so that a new connection can be attempted.
        _logger.warning("Request Timeout - Connection with {url} timed out.".format(
            url=request.url))
        report.download_connection_error()

    except DownloadCancelled as e:
        _logger.debug(str(e))
        report.download_canceled()

    except DownloadFailed as e:
        _logger.debug('download failed: %s' % str(e))
        report.error_msg = e.args[2]
        report.error_report['response_code'] = e.args[1]
        report.error_report['response_msg'] = e.args[2]
        report.download_failed()

    except Exception as e:
        _logger.exception(e)
        report.error_msg = str(e)
        report.download_failed()

    else:
        report.download_succeeded()

    request.finalize_file_handle()

    if report.state is DOWNLOAD_SUCCEEDED:
        self.fire_download_succeeded(report)
    else:  # DOWNLOAD_FAILED
        self.fire_download_failed(report)

    return report
def _download_one(self, request):
    # let's not, but say we did
    report = DownloadReport.from_download_request(request)
    self.fire_download_succeeded(report)
    return report
def _fetch(self, request, session):
    """
    :param request: download request object with details about what to
                    download and where to put it
    :type request: nectar.request.DownloadRequest
    :param session: session object used by the requests library
    :type session: requests.sessions.Session
    :return: download report
    :rtype: nectar.report.DownloadReport
    """
    headers = (request.headers or {}).copy()
    ignore_encoding, additional_headers = self._rfc2616_workaround(request)
    headers.update(additional_headers or {})
    max_speed = self._calculate_max_speed()  # None or integer in bytes/second

    report = DownloadReport.from_download_request(request)
    report.download_started()
    self.fire_download_started(report)

    try:
        if self.is_canceled:
            raise DownloadCancelled(request.url)

        response = session.get(request.url, headers=headers)
        report.headers = response.headers

        if response.status_code != httplib.OK:
            raise DownloadFailed(request.url, response.status_code, response.reason)

        progress_interval = self.progress_interval
        file_handle = request.initialize_file_handle()

        last_update_time = datetime.datetime.now()
        self.fire_download_progress(report)

        if ignore_encoding:
            chunks = self.chunk_generator(response.raw, self.buffer_size)
        else:
            chunks = response.iter_content(self.buffer_size)

        for chunk in chunks:
            if self.is_canceled:
                raise DownloadCancelled(request.url)

            file_handle.write(chunk)

            bytes_read = len(chunk)
            report.bytes_downloaded += bytes_read

            now = datetime.datetime.now()

            if now - last_update_time >= progress_interval:
                last_update_time = now
                self.fire_download_progress(report)

            with self._bytes_lock:
                if now - self._time_bytes_this_second_was_cleared >= ONE_SECOND:
                    self._bytes_this_second = 0
                    self._time_bytes_this_second_was_cleared = now
                self._bytes_this_second += bytes_read

            if max_speed is not None and self._bytes_this_second >= max_speed:
                # it's not worth doing fancier mathematics than this, very
                # fine-grained sleep times [1] are not honored by the system
                # [1] for example, sleeping the remaining fraction of time
                #     before this second is up
                time.sleep(0.5)

        # guarantee 1 report at the end
        self.fire_download_progress(report)

    except DownloadCancelled, e:
        _LOG.debug(str(e))
        report.download_canceled()
def _fetch(self, request, session):
    """
    :param request: download request object with details about what to
                    download and where to put it
    :type request: nectar.request.DownloadRequest
    :param session: session object used by the requests library
    :type session: requests.sessions.Session
    :return: download report
    :rtype: nectar.report.DownloadReport
    """
    headers = (request.headers or {}).copy()
    ignore_encoding, additional_headers = self._rfc2616_workaround(request)
    headers.update(additional_headers or {})
    max_speed = self._calculate_max_speed()  # None or integer in bytes/second

    report = DownloadReport.from_download_request(request)
    report.download_started()
    self.fire_download_started(report)
    netloc = urlparse.urlparse(request.url).netloc

    try:
        if self.is_canceled:
            raise DownloadCancelled(request.url)

        if netloc in self.failed_netlocs:
            raise SkipLocation()

        response = session.get(request.url, headers=headers,
                               timeout=(self.config.connect_timeout,
                                        self.config.read_timeout))
        report.headers = response.headers

        if response.status_code != httplib.OK:
            raise DownloadFailed(request.url, response.status_code, response.reason)

        progress_interval = self.progress_interval
        file_handle = request.initialize_file_handle()

        last_update_time = datetime.datetime.now()
        self.fire_download_progress(report)

        if ignore_encoding:
            chunks = self.chunk_generator(response.raw, self.buffer_size)
        else:
            chunks = response.iter_content(self.buffer_size)

        for chunk in chunks:
            if self.is_canceled:
                raise DownloadCancelled(request.url)

            file_handle.write(chunk)

            bytes_read = len(chunk)
            report.bytes_downloaded += bytes_read

            now = datetime.datetime.now()

            if now - last_update_time >= progress_interval:
                last_update_time = now
                self.fire_download_progress(report)

            with self._bytes_lock:
                if now - self._time_bytes_this_second_was_cleared >= ONE_SECOND:
                    self._bytes_this_second = 0
                    self._time_bytes_this_second_was_cleared = now
                self._bytes_this_second += bytes_read

            if max_speed is not None and self._bytes_this_second >= max_speed:
                # it's not worth doing fancier mathematics than this, very
                # fine-grained sleep times [1] are not honored by the system
                # [1] for example, sleeping the remaining fraction of time
                #     before this second is up
                time.sleep(0.5)

        # guarantee 1 report at the end
        self.fire_download_progress(report)

    except SkipLocation:
        _logger.debug("Skipping {url} because {netloc} could not be reached.".format(
            url=request.url, netloc=netloc))
        report.download_skipped()

    except requests.ConnectionError:
        _logger.warning("Connection Error - {url} could not be reached.".format(
            url=request.url))
        self.failed_netlocs.add(netloc)
        report.download_connection_error()

    except requests.Timeout:
        # Handle a timeout differently than a connection error. Do not add to
        # failed_netlocs so that a new connection can be attempted.
        _logger.warning("Request Timeout - Connection with {url} timed out.".format(
            url=request.url))
        report.download_connection_error()

    except DownloadCancelled, e:
        _logger.debug(str(e))
        report.download_canceled()