def check_response(response, request_url, who, good_status=(201, 200),
                   ignore_no_success=False):
    """
    Checks the response and raises exceptions if something went terribly wrong

    :param response: the response object to check; its ``status_code``,
        ``reason``, ``text`` and ``json()`` members are used
    :param request_url: the URL that was requested - it is attached to any
        error raised and included in the default error message
    :param who: A short name that indicated where the error occurred
                (for example "CKAN")
    :param good_status: Status codes that should not raise an exception
    :param ignore_no_success: when true, a status code outside
        ``good_status`` is tolerated unless the JSON body carries a truthy
        ``success`` value
    :raises HTTPError: if there is no status code, or the status code is not
        in ``good_status`` (subject to ``ignore_no_success``), or the body of
        such a response is not valid JSON
    """
    if not response.status_code:
        raise HTTPError(
            'Xloader received an HTTP response with no status code',
            status_code=None, request_url=request_url,
            response=response.text)

    if response.status_code in good_status:
        return

    def default_message():
        # Built lazily - it is only needed when the response body does not
        # supply an error message of its own.
        return (
            '{who} bad response. Status code: {code} {reason}. At: {url}.'
            .format(who=who, code=response.status_code,
                    reason=response.reason, url=request_url))

    # Only the JSON decoding is guarded, so that a ValueError can never be
    # mistaken for anything other than "the body is not JSON".
    try:
        json_response = response.json()
    except ValueError:
        raise HTTPError(
            default_message(), status_code=response.status_code,
            request_url=request_url, response=response.text)

    if ignore_no_success:
        # Only a JSON object can carry a 'success' flag. Any other JSON
        # value (list, string, null...) is treated like an object without
        # the flag, rather than crashing with an AttributeError.
        reports_success = (isinstance(json_response, dict)
                           and json_response.get('success'))
        if not reports_success:
            return

    # Prefer the error message supplied in the body; KeyError/TypeError cover
    # every way that lookup can fail on decoded JSON values.
    try:
        message = json_response["error"]["message"]
    except (KeyError, TypeError):
        message = default_message()
    raise HTTPError(
        message, status_code=response.status_code,
        request_url=request_url, response=response.text)
headers = {} if resource.get('url_type') == 'upload': # If this is an uploaded file to CKAN, authenticate the request, # otherwise we won't get file from private resources headers['Authorization'] = api_key response = requests.get(resource.get('url'), headers=headers, timeout=DOWNLOAD_TIMEOUT) response.raise_for_status() except requests.exceptions.HTTPError as error: # status code error logger.error('HTTP error: {}'.format(error)) raise HTTPError( "DataPusher received a bad HTTP response when trying to download " "the data file", status_code=error.response.status_code, request_url=resource.get('url'), response=error) except requests.exceptions.Timeout: logger.error('URL time out after {0}s'.format(DOWNLOAD_TIMEOUT)) raise JobError( 'Connection timed out after {}s'.format(DOWNLOAD_TIMEOUT)) except requests.exceptions.RequestException as e: try: err_message = str(e.reason) except AttributeError: err_message = str(e) logger.error('URL error: {}'.format(err_message)) raise HTTPError(message=err_message, status_code=None, request_url=resource.get('url'),
line_count = 0 m = hashlib.md5() for line in response.iter_lines(CHUNK_SIZE): tmp_file.write(line + '\n') m.update(line) length += len(line) line_count += 1 if length > MAX_CONTENT_LENGTH or line_count >= MAX_EXCERPT_LINES: break data['datastore_contains_all_records_of_source_file'] = False except requests.exceptions.HTTPError as error: # status code error logger.debug('HTTP error: {}'.format(error)) raise HTTPError( "Xloader received a bad HTTP response when trying to download " "the data file", status_code=error.response.status_code, request_url=url, response=error) except requests.exceptions.Timeout: logger.warning('URL time out after {0}s'.format(DOWNLOAD_TIMEOUT)) raise JobError( 'Connection timed out after {}s'.format(DOWNLOAD_TIMEOUT)) except requests.exceptions.RequestException as e: try: err_message = str(e.reason) except AttributeError: err_message = str(e) logger.warning('URL error: {}'.format(err_message)) raise HTTPError(message=err_message, status_code=None, request_url=url,
def _download_resource_data(resource, data, api_key, logger):
    '''Downloads the resource['url'] as a tempfile.

    :param resource: resource (i.e. metadata) dict (from the job dict)
    :param data: job dict - may be written to during this function
    :param api_key: CKAN api key - needed to obtain resources that are private
    :param logger: logger that progress and problems are reported to
    :returns: tuple ``(tmp_file, file_hash)`` - the tempfile rewound to its
        start, and the MD5 hex digest accumulated during the download
    :raises JobError: if the URL scheme is not http/https/ftp, if the
        connection times out, or if the data is too big and excerpts are
        disabled (MAX_EXCERPT_LINES <= 0)
    :raises HTTPError: if the download fails with a bad HTTP status or any
        other request error

    If the download is bigger than MAX_CONTENT_LENGTH then it just downloads a
    excerpt (of MAX_EXCERPT_LINES) for preview, and flags it by setting
    data['datastore_contains_all_records_of_source_file'] = False
    which will be saved to the resource later on.
    '''
    # check scheme
    url = resource.get('url')
    scheme = urlparse.urlsplit(url).scheme
    if scheme not in ('http', 'https', 'ftp'):
        raise JobError(
            'Only http, https, and ftp resources may be fetched.'
        )

    # fetch the resource data
    logger.info('Fetching from: {0}'.format(url))
    tmp_file = get_tmp_file(url)
    length = 0
    m = hashlib.md5()
    cl = None

    headers = {}
    if resource.get('url_type') == 'upload':
        # If this is an uploaded file to CKAN, authenticate the request,
        # otherwise we won't get file from private resources
        headers['Authorization'] = api_key

    # The outer try translates request failures from BOTH the full download
    # and the excerpt re-download. An exception raised inside an ``except``
    # handler is not seen by sibling ``except`` clauses of the same ``try``,
    # so the excerpt fallback needs this enclosing level to be covered.
    try:
        try:
            response = get_response(url, headers)

            cl = response.headers.get('content-length')
            if cl and int(cl) > MAX_CONTENT_LENGTH:
                raise DataTooBigError()

            # download the file to a tempfile on disk
            for chunk in response.iter_content(CHUNK_SIZE):
                length += len(chunk)
                # content-length may be absent or wrong, so the size is
                # also enforced while streaming
                if length > MAX_CONTENT_LENGTH:
                    raise DataTooBigError
                tmp_file.write(chunk)
                m.update(chunk)
            data['datastore_contains_all_records_of_source_file'] = True

        except DataTooBigError:
            tmp_file.close()
            message = 'Data too large to load into Datastore: ' \
                '{cl} bytes > max {max_cl} bytes.' \
                .format(cl=cl or length, max_cl=MAX_CONTENT_LENGTH)
            logger.warning(message)
            if MAX_EXCERPT_LINES <= 0:
                raise JobError(message)
            logger.info('Loading excerpt of ~{max_lines} lines to '
                        'DataStore.'
                        .format(max_lines=MAX_EXCERPT_LINES))
            # start again with a fresh tempfile, request and hash
            tmp_file = get_tmp_file(url)
            response = get_response(url, headers)
            length = 0
            line_count = 0
            m = hashlib.md5()
            for line in response.iter_lines(CHUNK_SIZE):
                # bytes newline: same as '\n' on Python 2, and does not mix
                # bytes with text on Python 3
                tmp_file.write(line + b'\n')
                # NB the hash covers the line contents only, not the
                # newlines that are re-added when writing
                m.update(line)
                length += len(line)
                line_count += 1
                if length > MAX_CONTENT_LENGTH or \
                        line_count >= MAX_EXCERPT_LINES:
                    break
            data['datastore_contains_all_records_of_source_file'] = False

    except requests.exceptions.HTTPError as error:
        # status code error
        logger.debug('HTTP error: {}'.format(error))
        raise HTTPError(
            "Xloader received a bad HTTP response when trying to download "
            "the data file", status_code=error.response.status_code,
            request_url=url, response=error)
    except requests.exceptions.Timeout:
        logger.warning('URL time out after {0}s'.format(DOWNLOAD_TIMEOUT))
        raise JobError('Connection timed out after {}s'.format(
                       DOWNLOAD_TIMEOUT))
    except requests.exceptions.RequestException as e:
        # not every request exception carries a ``reason``
        try:
            err_message = str(e.reason)
        except AttributeError:
            err_message = str(e)
        logger.warning('URL error: {}'.format(err_message))
        raise HTTPError(
            message=err_message, status_code=None,
            request_url=url, response=None)

    logger.info('Downloaded ok - %s', printable_file_size(length))
    file_hash = m.hexdigest()
    tmp_file.seek(0)
    return tmp_file, file_hash