def net_post_URL(url, data):
    page_data = ''
    req = Request(url, data)
    req.add_unredirected_header('User-Agent', USER_AGENT)
    req.add_header('Content-type', 'application/x-www-form-urlencoded')
    req.add_header('Accept', 'text/plain')
    log_debug('net_post_URL() POST URL "{0}"'.format(req.get_full_url()))
    try:
        f = urlopen(req, timeout = 120)
        page_bytes = f.read()
        f.close()
    # If an exception happens return empty data.
    except IOError as ex:
        log_error('(IOError exception) In net_post_URL()')
        log_error('Message: {0}'.format(str(ex)))
        return page_data
    except Exception as ex:
        log_error('(General exception) In net_post_URL()')
        log_error('Message: {0}'.format(str(ex)))
        return page_data
    num_bytes = len(page_bytes)
    log_debug('net_post_URL() Read {0} bytes'.format(num_bytes))

    # --- Convert page data to Unicode ---
    encoding = f.headers['content-type'].split('charset=')[-1]
    page_data = net_decode_URL_data(page_bytes, encoding)

    return page_data
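# --- Usage sketch (hypothetical example, kept uncalled) ---
# Minimal example of POSTing form-encoded data with net_post_URL(). The endpoint URL and the
# form fields are placeholders, not from this module; the function expects the payload to be
# already application/x-www-form-urlencoded, which urllib.urlencode() produces.
def _example_net_post_URL_usage():
    from urllib import urlencode  # Python 2; use urllib.parse.urlencode on Python 3.

    post_data = urlencode({'search_term' : 'Sonic', 'platform' : 'Sega Genesis'})
    page_data = net_post_URL('https://www.example.com/api/search', post_data)

    return page_data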
def net_decode_URL_data(page_bytes, encoding):
    # --- Try to guess encoding ---
    if   encoding == 'text/html':                             encoding = 'utf-8'
    elif encoding == 'application/json':                      encoding = 'utf-8'
    elif encoding == 'text/plain' and 'UTF-8' in page_bytes:  encoding = 'utf-8'
    elif encoding == 'text/plain' and 'UTF-16' in page_bytes: encoding = 'utf-16'
    else:                                                     encoding = 'utf-8'
    log_debug('net_decode_URL_data() encoding = "{0}"'.format(encoding))

    # --- Decode ---
    if encoding == 'utf-16':
        # decode(), not encode(), turns UTF-16 bytes into a Unicode string.
        page_data = page_bytes.decode('utf-16')
    else:
        # python3: page_data = str(page_bytes, encoding)
        page_data = unicode(page_bytes, encoding)

    return page_data
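# --- Usage sketch (hypothetical example, kept uncalled) ---
# Shows how the encoding guess falls back to UTF-8 when the Content-Type header carries a MIME
# type instead of a charset parameter (for example plain 'application/json'). The JSON payload
# below is made up for illustration.
def _example_net_decode_URL_data_usage():
    json_bytes = '{"title" : "Sonic the Hedgehog", "year" : 1991}'
    page_data = net_decode_URL_data(json_bytes, 'application/json')  # Decoded as UTF-8.

    return page_data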
def net_get_URL_using_handler(url, handler = None):
    page_data = None
    opener = build_opener(handler)
    log_debug('net_get_URL_using_handler() Reading URL "{0}"'.format(url))
    try:
        f = opener.open(url)
        encoding = f.headers['content-type'].split('charset=')[-1]
        page_bytes = f.read()
        f.close()
    except IOError as e:
        log_error('(IOError) Exception in net_get_URL_using_handler()')
        log_error('(IOError) {0}'.format(str(e)))
        return page_data
    num_bytes = len(page_bytes)
    log_debug('net_get_URL_using_handler() Read {0} bytes'.format(num_bytes))

    # --- Convert to Unicode ---
    if encoding == 'text/html':                               encoding = 'utf-8'
    if encoding == 'text/plain' and 'UTF-8' in page_bytes:    encoding = 'utf-8'
    if encoding == 'text/plain' and 'UTF-16' in page_bytes:   encoding = 'utf-16'
    log_debug('net_get_URL_using_handler() encoding = "{0}"'.format(encoding))
    if encoding != 'utf-16':
        page_data = unicode(page_bytes, encoding)
    if encoding == 'utf-16':
        # decode(), not encode(), turns UTF-16 bytes into a Unicode string.
        page_data = page_bytes.decode('utf-16')

    return page_data
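# --- Usage sketch (hypothetical example, kept uncalled) ---
# net_get_URL_using_handler() accepts any urllib2-style handler. This example routes the
# request through an HTTP proxy; the proxy address and the URL are placeholders.
def _example_net_get_URL_using_handler_usage():
    from urllib2 import ProxyHandler  # Python 2; urllib.request.ProxyHandler on Python 3.

    proxy_handler = ProxyHandler({'http' : 'http://127.0.0.1:8080'})
    page_data = net_get_URL_using_handler('http://www.example.com/', proxy_handler)

    return page_data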
def net_post_URL_original(url, params):
    page_data = ''
    req = Request(url, params)
    req.add_unredirected_header('User-Agent', USER_AGENT)
    req.add_header('Content-type', 'application/x-www-form-urlencoded')
    req.add_header('Accept', 'text/plain')
    log_debug('net_post_URL_original() POSTING URL "{0}"'.format(req.get_full_url()))
    try:
        f = urlopen(req)
        encoding = f.headers['content-type'].split('charset=')[-1]
        page_bytes = f.read()
        f.close()
    except IOError as e:
        log_error('(IOError) Exception in net_post_URL_original()')
        log_error('(IOError) {0}'.format(str(e)))
        return page_data
    num_bytes = len(page_bytes)
    log_debug('net_post_URL_original() Read {0} bytes'.format(num_bytes))

    # --- Convert to Unicode ---
    if encoding == 'text/html':                               encoding = 'utf-8'
    if encoding == 'text/plain' and 'UTF-8' in page_bytes:    encoding = 'utf-8'
    if encoding == 'text/plain' and 'UTF-16' in page_bytes:   encoding = 'utf-16'
    if encoding == 'application/json':                        encoding = 'utf-8'
    log_debug('net_post_URL_original() encoding = "{0}"'.format(encoding))
    if encoding != 'utf-16':
        # python3: page_data = str(page_bytes, encoding)
        page_data = unicode(page_bytes, encoding)
    if encoding == 'utf-16':
        # decode(), not encode(), turns UTF-16 bytes into a Unicode string.
        page_data = page_bytes.decode('utf-16')

    return page_data
def net_get_URL_oneline(url):
    page_data = ''
    req = Request(url)
    req.add_unredirected_header('User-Agent', USER_AGENT)
    log_debug('net_get_URL_oneline() Reading URL "{0}"'.format(req.get_full_url()))
    try:
        # --- Open network connection (socket) ---
        f = urlopen(req)
        # --- Read data from socket ---
        encoding = f.headers['content-type'].split('charset=')[-1]
        # >> Fix for wrong encodings...
        if encoding == 'text/html': encoding = 'utf-8'
        log_debug('net_get_URL_oneline() Encoding = "{0}"'.format(encoding))
        page_bytes = f.read()
        f.close()
    except IOError as e:
        log_error('(IOError) Exception in net_get_URL_oneline()')
        log_error('(IOError) {0}'.format(str(e)))
        return page_data
    except Exception as e:
        log_error('(Error) Exception in net_get_URL_oneline()')
        log_error('(Error) {0}'.format(str(e)))
        return page_data

    # --- Convert to Unicode ---
    num_bytes = len(page_bytes)
    log_debug('net_get_URL_oneline() Read {0} bytes'.format(num_bytes))
    # python3: page_data = str(page_bytes, encoding)
    page_data = unicode(page_bytes, encoding)

    # --- Put all page text into one line ---
    page_data = page_data.replace('\r\n', '')
    page_data = page_data.replace('\n', '')

    return page_data
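# --- Usage sketch (hypothetical example, kept uncalled) ---
# net_get_URL_oneline() strips newlines so that single-line regular expressions can match text
# originally spread over several lines of HTML. The URL and the pattern are placeholders.
def _example_net_get_URL_oneline_usage():
    import re

    page_data = net_get_URL_oneline('http://www.example.com/game.php?id=123')
    titles = re.findall('<h1>(.*?)</h1>', page_data)

    return titles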
def net_get_URL(url, url_log = None, headers = None):
    import traceback

    req = Request(url)
    req.add_unredirected_header('User-Agent', USER_AGENT)
    if headers is not None:
        for key, value in headers.items():
            req.add_header(key, value)
    if url_log is None:
        log_debug('net_get_URL() GET URL "{}"'.format(req.get_full_url()))
    else:
        log_debug('net_get_URL() GET URL "{}"'.format(url_log))
    page_bytes = http_code = None
    try:
        f = urlopen(req, timeout = 120)
        http_code = f.getcode()
        page_bytes = f.read()
        f.close()
    # If the server returns an HTTP error status code then make sure http_code has the error
    # code and page_bytes the message.
    except HTTPError as ex:
        log_error('(HTTPError) In net_get_URL()')
        http_code = ex.code
        # Try to read the contents of the web page.
        # If it fails get the error string from the exception object.
        try:
            page_bytes = ex.read()
            ex.close()
        except:
            page_bytes = unicode(ex.reason)
        log_error('(HTTPError) Object type "{}"'.format(type(ex)))
        log_error('(HTTPError) Message "{}"'.format(str(ex)))
        log_error('(HTTPError) Code {}'.format(http_code))
        log_debug(traceback.format_exc())
        return page_bytes, http_code
    # If an unknown exception happens return empty data.
    except Exception as ex:
        log_error('(Exception) In net_get_URL()')
        log_error('(Exception) Object type "{}"'.format(type(ex)))
        log_error('(Exception) Message "{}"'.format(str(ex)))
        log_debug(traceback.format_exc())
        return page_bytes, http_code
    log_debug('net_get_URL() Read {} bytes'.format(len(page_bytes)))
    log_debug('net_get_URL() HTTP status code {}'.format(http_code))

    # --- Convert to Unicode ---
    encoding = f.headers['content-type'].split('charset=')[-1]
    page_data = net_decode_URL_data(page_bytes, encoding)

    return page_data, http_code
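# --- Usage sketch (hypothetical example, kept uncalled) ---
# net_get_URL() returns a (page_data, http_code) tuple, and page_data may be None or an error
# body when the request fails, so callers should check the status code before parsing.
# The API URL and the extra header below are placeholders.
def _example_net_get_URL_usage():
    import json

    page_data, http_code = net_get_URL('https://www.example.com/api/platforms',
                                       headers = {'Accept' : 'application/json'})
    if http_code != 200:
        log_error('Server returned HTTP status code {0}'.format(http_code))
        return None

    return json.loads(page_data)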