Example #1
# Python 2 is assumed throughout these snippets: Request, urlopen, build_opener
# and HTTPError come from urllib2, while USER_AGENT, log_debug and log_error are
# defined elsewhere in the addon module.
from urllib2 import Request, urlopen, build_opener, HTTPError

def net_post_URL(url, data):
    page_data = ''
    req = Request(url, data)
    req.add_unredirected_header('User-Agent', USER_AGENT)
    req.add_header("Content-type", "application/x-www-form-urlencoded")
    req.add_header("Acept", "text/plain")
    log_debug('net_post_URL() POST URL "{0}"'.format(req.get_full_url()))

    try:
        f = urlopen(req, timeout=120)
        page_bytes = f.read()
        f.close()
    # If an exception happens return empty data.
    except IOError as ex:
        log_error('(IOError exception) In net_post_URL()')
        log_error('Message: {0}'.format(str(ex)))
        return page_data
    except Exception as ex:
        log_error('(General exception) In net_post_URL()')
        log_error('Message: {0}'.format(str(ex)))
        return page_data

    num_bytes = len(page_bytes)
    log_debug('net_post_URL() Read {0} bytes'.format(num_bytes))

    # --- Convert page data to Unicode ---
    encoding = f.headers['content-type'].split('charset=')[-1]
    page_data = net_decode_URL_data(page_bytes, encoding)

    return page_data
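
A hedged usage sketch (not from the source): net_post_URL() passes data straight to Request(), so the caller must URL-encode the POST body first. The URL and form fields below are made-up placeholders.

# Usage sketch, Python 2: build the POST body with urllib.urlencode().
from urllib import urlencode

post_body = urlencode({'name': 'Sonic', 'platform': 'Mega Drive'})  # hypothetical fields
page_data = net_post_URL('http://www.example.com/api/search', post_body)
log_debug('Got {0} characters back'.format(len(page_data)))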
Example #2
def net_decode_URL_data(page_bytes, encoding):
    # --- Try to guess the encoding ---
    # If the content-type header carried no charset, the whole content-type
    # string (for example 'text/html') ends up here, hence the checks below.
    if encoding == 'text/html': encoding = 'utf-8'
    elif encoding == 'application/json': encoding = 'utf-8'
    elif encoding == 'text/plain' and 'UTF-8' in page_bytes: encoding = 'utf-8'
    elif encoding == 'text/plain' and 'UTF-16' in page_bytes: encoding = 'utf-16'
    else:
        encoding = 'utf-8'

    log_debug('net_decode_URL_data() encoding = "{0}"'.format(encoding))

    # --- Decode the raw bytes into a Unicode string ---
    if encoding == 'utf-16':
        page_data = page_bytes.decode('utf-16')
    else:
        # Python 3 equivalent: page_data = str(page_bytes, encoding)
        page_data = unicode(page_bytes, encoding)

    return page_data
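
A quick illustration (assumed, not from the source) of how the decoder above behaves; the byte strings and content types are made-up samples.

# When the content-type header carries no charset, the whole content-type
# string reaches this function, so 'application/json' maps to UTF-8.
print(net_decode_URL_data('{"ok": true}', 'application/json'))  # decoded as utf-8
print(net_decode_URL_data('Hello', 'iso-8859-1'))               # falls back to utf-8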
Example #3
def net_get_URL_using_handler(url, handler=None):

    page_data = None
    opener = build_opener(handler)

    log_debug('net_get_URL_using_handler() Reading URL "{0}"'.format(url))
    try:
        f = opener.open(url)
        encoding = f.headers['content-type'].split('charset=')[-1]
        page_bytes = f.read()
        f.close()
    except IOError as e:
        log_error('(IOError) Exception in net_get_URL_using_handler()')
        log_error('(IOError) {0}'.format(str(e)))
        return page_data

    num_bytes = len(page_bytes)
    log_debug('net_get_URL_using_handler() Read {0} bytes'.format(num_bytes))

    # --- Convert to Unicode ---
    if encoding == 'text/html': encoding = 'utf-8'
    if encoding == 'text/plain' and 'UTF-8' in page_bytes: encoding = 'utf-8'
    if encoding == 'text/plain' and 'UTF-16' in page_bytes: encoding = 'utf-16'

    log_debug('net_get_URL_using_handler() encoding = "{0}"'.format(encoding))
    if encoding == 'utf-16':
        page_data = page_bytes.decode('utf-16')
    else:
        page_data = unicode(page_bytes, encoding)

    return page_data
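
A minimal usage sketch (an assumption, not from the source): any urllib2 handler can be passed in, for example a cookie processor so the opener keeps session cookies between requests.

import cookielib
from urllib2 import HTTPCookieProcessor

cookie_jar = cookielib.CookieJar()
handler = HTTPCookieProcessor(cookie_jar)  # hypothetical handler choice
page_data = net_get_URL_using_handler('http://www.example.com/login', handler)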
Example #4
def net_post_URL_original(url, params):
    page_data = ''

    req = Request(url, params)
    req.add_unredirected_header('User-Agent', USER_AGENT)
    req.add_header("Content-type", "application/x-www-form-urlencoded")
    req.add_header("Acept", "text/plain")

    log_debug('net_post_URL_original() POSTING URL "{0}"'.format(
        req.get_full_url()))

    try:
        f = urlopen(req)
        encoding = f.headers['content-type'].split('charset=')[-1]
        page_bytes = f.read()
        f.close()
    except IOError as e:
        log_error('(IOError) Exception in net_post_URL_original()')
        log_error('(IOError) {0}'.format(str(e)))
        return page_data

    num_bytes = len(page_bytes)
    log_debug('net_post_URL_original() Read {0} bytes'.format(num_bytes))

    # --- Convert to Unicode ---
    if encoding == 'text/html': encoding = 'utf-8'
    if encoding == 'text/plain' and 'UTF-8' in page_bytes: encoding = 'utf-8'
    if encoding == 'text/plain' and 'UTF-16' in page_bytes: encoding = 'utf-16'
    if encoding == 'application/json': encoding = 'utf-8'

    log_debug('net_post_URL_original() encoding = "{0}"'.format(encoding))
    if encoding == 'utf-16':
        page_data = page_bytes.decode('utf-16')
    else:
        # Python 3 equivalent: page_data = str(page_bytes, encoding)
        page_data = unicode(page_bytes, encoding)

    return page_data
Example #5
def net_get_URL_oneline(url):
    page_data = ''
    req = Request(url)
    req.add_unredirected_header('User-Agent', USER_AGENT)
    log_debug('net_get_URL_oneline() Reading URL "{0}"'.format(
        req.get_full_url()))

    try:
        # --- Open network connection (socket) ---
        f = urlopen(req)

        # --- Read data from socket ---
        encoding = f.headers['content-type'].split('charset=')[-1]
        # >> Fix for wrong encodings...
        if encoding == 'text/html': encoding = 'utf-8'
        log_debug('net_get_URL_oneline() Encoding = "{0}"'.format(encoding))
        page_bytes = f.read()
        f.close()
    except IOError as e:
        log_error('(IOError) Exception in net_get_URL_oneline()')
        log_error('(IOError) {0}'.format(str(e)))
        return page_data
    except Exception as e:
        log_error('(Error) Exception in net_get_URL_oneline()')
        log_error('(Error) {0}'.format(str(e)))
        return page_data

    # --- Convert to Unicode ---
    num_bytes = len(page_bytes)
    log_debug('net_get_URL_oneline() Read {0} bytes'.format(num_bytes))
    # Python 3 equivalent: page_data = str(page_bytes, encoding)
    page_data = unicode(page_bytes, encoding)

    # --- Put all page text into one line ---
    page_data = page_data.replace('\r\n', '')
    page_data = page_data.replace('\n', '')

    return page_data
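
A hedged example of why the one-line variant is handy: with the newlines stripped, a simple regular expression can match markup even when the original tag spanned several lines. The URL and pattern are placeholders.

import re

page_data = net_get_URL_oneline('http://www.example.com/game/1234')
m = re.search('<title>(.*?)</title>', page_data)
if m: log_debug('Page title is "{0}"'.format(m.group(1)))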
Example #6
def net_get_URL(url, url_log=None, headers=None):
    import traceback
    req = Request(url)
    req.add_unredirected_header('User-Agent', USER_AGENT)

    if headers is not None:
        for key, value in headers.items():
            req.add_header(key, value)

    if url_log is None:
        log_debug('net_get_URL() GET URL "{}"'.format(req.get_full_url()))
    else:
        log_debug('net_get_URL() GET URL "{}"'.format(url_log))

    page_bytes = http_code = None
    try:
        f = urlopen(req, timeout=120)
        http_code = f.getcode()
        page_bytes = f.read()
        f.close()
    # If the server replies with an HTTP error status, keep the error code in
    # http_code and the response body (or the error reason) in page_bytes.
    except HTTPError as ex:
        log_error('(HTTPError) In net_get_URL()')
        http_code = ex.code
        # Try to read contents of the web page.
        # If it fails get error string from the exception object.
        try:
            page_bytes = ex.read()
            ex.close()
        except:
            page_bytes = unicode(ex.reason)
        log_error('(HTTPError) Object type "{}"'.format(type(ex)))
        log_error('(HTTPError) Message "{}"'.format(str(ex)))
        log_error('(HTTPError) Code {}'.format(http_code))
        log_debug(traceback.format_exc())

        return page_bytes, http_code
    # If an unknown exception happens return empty data.
    except Exception as ex:
        log_error('(Exception) In net_get_URL()')
        log_error('(Exception) Object type "{}"'.format(type(ex)))
        log_error('(Exception) Message "{}"'.format(str(ex)))
        log_debug(traceback.format_exc())

        return page_bytes, http_code
    log_debug('net_get_URL() Read {} bytes'.format(len(page_bytes)))
    log_debug('net_get_URL() HTTP status code {}'.format(http_code))

    # --- Convert to Unicode ---
    encoding = f.headers['content-type'].split('charset=')[-1]
    page_data = net_decode_URL_data(page_bytes, encoding)

    return page_data, http_code
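
A hedged caller sketch: unlike the older helpers, net_get_URL() returns a (page_data, http_code) tuple, so the status code can be checked before the text is used. The URL and extra header are placeholders.

headers = {'Accept': 'application/json'}  # hypothetical extra header
page_data, http_code = net_get_URL('http://www.example.com/api/platforms', headers=headers)
if http_code != 200:
    log_error('net_get_URL() failed with HTTP status {}'.format(http_code))
else:
    log_debug('net_get_URL() returned {} characters'.format(len(page_data)))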