示例#1
0
def save_url(url, name, reporthook = simple_hook):
    """Download ``url`` into the local file ``name``, resuming if possible.

    The remote size is taken from the ``Content-Length`` header of an initial
    request.  If ``name`` already exists:

    * same size as the remote file: nothing is downloaded;
    * smaller than the remote file: a ``Range`` request is issued and the
      missing tail is appended, but only if the server really answers with
      ``206 Partial Content``; a server that ignores ``Range`` sends the
      whole body, which is then written from scratch instead of appended;
    * anything else (remote size unknown, empty or oversized local file):
      the file is overwritten with the full body.

    Args:
        url: URL to fetch.
        name: Path of the destination file.
        reporthook: Callable invoked as ``reporthook(blocknum, bs, size)``
            once before the transfer and once after every block written.
            ``size`` is ``-1`` when the server did not report a length.

    Returns:
        None.
    """
    bs = 1024*8
    size = -1
    blocknum = 0
    open_mode = 'wb'
    req = Request(url, headers = fake_headers)
    response = urlopen(req, None)
    try:
        if "content-length" in response.headers:
            size = int(response.headers["Content-Length"])
        if os.path.exists(name):
            filesize = os.path.getsize(name)
            if filesize == size:
                print('Skipped: file already downloaded')
                return
            if size != -1 and 0 < filesize < size:
                # Drop the first connection before asking for the tail only.
                response.close()
                req.add_header('Range', 'bytes=%d-' % filesize)
                response = urlopen(req, None)
                if response.status == 206:
                    blocknum = filesize // bs
                    open_mode = 'ab'
                # Otherwise the server ignored Range and is sending the
                # complete body: keep 'wb' so the file is rewritten.
        reporthook(blocknum, bs, size)
        with open(name, open_mode) as tfp:
            while True:
                block = response.read(bs)
                if not block:
                    break
                tfp.write(block)
                blocknum += 1
                reporthook(blocknum, bs, size)
    finally:
        # close() is idempotent, so this is safe on every exit path.
        response.close()
示例#2
0
def save_url(url, name, reporthook=simple_hook):
    """Fetch ``url`` and store it as ``name``, continuing a partial file.

    An initial request provides the remote size (``Content-Length``).  When
    ``name`` already exists with exactly that size the download is skipped.
    When it exists and is shorter, the remainder is requested with a
    ``Range`` header and appended -- provided the server replies with
    ``206``.  In every other situation (unknown remote size, empty or
    oversized local file, server ignoring ``Range``) the file is rewritten
    from the start with the full response body.

    Args:
        url: URL to fetch.
        name: Destination file path.
        reporthook: Progress callback, called as
            ``reporthook(blocknum, bs, size)`` before the first read and
            after each written block; ``size`` is ``-1`` if unknown.

    Returns:
        None.
    """
    bs = 1024 * 8
    size = -1
    blocknum = 0
    open_mode = 'wb'
    req = Request(url, headers=fake_headers)
    response = urlopen(req, None)
    try:
        if "content-length" in response.headers:
            size = int(response.headers["Content-Length"])
        if os.path.exists(name):
            filesize = os.path.getsize(name)
            if filesize == size:
                print('Skipped: file already downloaded')
                return
            can_resume = size != -1 and 0 < filesize < size
            if can_resume:
                # The full-body connection is no longer needed.
                response.close()
                req.add_header('Range', 'bytes=%d-' % filesize)
                response = urlopen(req, None)
                # Only append when the server actually honoured the range;
                # a 200 reply carries the whole file and must overwrite.
                if response.status == 206:
                    blocknum = filesize // bs
                    open_mode = 'ab'
        reporthook(blocknum, bs, size)
        with open(name, open_mode) as tfp:
            while True:
                block = response.read(bs)
                if not block:
                    break
                tfp.write(block)
                blocknum += 1
                reporthook(blocknum, bs, size)
    finally:
        # Release the socket on success, skip and error alike.
        response.close()
示例#3
0
文件: download.py 项目: txqwjh/ykdl
def save_url(url, name, ext, status, part=None, reporthook=simple_hook):
    """Download ``url`` to ``name.ext`` (or ``name_<part>.ext``) with resume.

    If the target file already holds data, the request asks for the range
    starting at the file's last byte (one byte of overlap, so a complete
    file still yields a satisfiable range instead of HTTP 416).  The total
    size is then read from ``Content-Range``:

    * file size equals total: the part is marked done and nothing is read;
    * file size below total: the overlapping byte is discarded and the rest
      is appended.

    If the server does not answer ``206`` (it ignored ``Range`` and is
    sending the whole body), the file is rewritten from scratch rather than
    aborting.

    Args:
        url: URL to fetch.
        name: Destination path without extension.
        ext: File extension, without the leading dot.
        status: Mutable sequence/mapping; ``status[part]`` is set to ``1``
            when the file on disk matches the reported size.
        part: Part index, or ``None`` for a single-file download (index 0
            is then used in ``status``).
        reporthook: Progress callback, called as
            ``reporthook(blocknum, bs, size)``.

    Returns:
        None.
    """
    if part is None:
        print('Download: ' + name)
        name = name + '.' + ext
        part = 0
    else:
        print('\nDownload: %s part %d' % (name, part))
        name = '%s_%d.%s' % (name, part, ext)
    bs = 1024 * 8
    size = -1
    blocknum = 0
    open_mode = 'wb'
    response = None
    req = Request(url, headers=fake_headers)
    try:
        filesize = os.path.getsize(name) if os.path.exists(name) else 0
        if filesize:
            req.add_header('Range',
                           'bytes=%d-' % (filesize - 1))  # get +1, avoid 416
            response = urlopen(req, None)
            if response.status == 206:
                size = int(response.headers['Content-Range'].split('/')[-1])
                if filesize == size:
                    print('Skipped: file already downloaded')
                    status[part] = 1
                    return
                if filesize < size:
                    blocknum = filesize // bs
                    open_mode = 'ab'
                    response.read(1)  # drop the byte we already have
            # Any other status means the Range header was ignored and the
            # full body follows: keep 'wb' and take size from Content-Length.
        if response is None:
            response = urlopen(req, None)
        if size < 0:
            size = int(response.headers.get('Content-Length', -1))
        reporthook(blocknum, bs, size)
        with open(name, open_mode) as tfp:
            while True:
                block = response.read(bs)
                if not block:
                    break
                tfp.write(block)
                blocknum += 1
                reporthook(blocknum, bs, size)
    finally:
        if response is not None:
            response.close()
    # Only flag the part as complete when the bytes on disk match the size
    # the server announced.
    if os.path.exists(name):
        filesize = os.path.getsize(name)
        if filesize == size:
            status[part] = 1
示例#4
0
def get_head_response(url, headers=fake_headers):
    """Return the response to a HEAD request for ``url``.

    When the request fails with an ``IOError`` whose text contains
    ``HTTP Error 405``, a plain request is issued instead and its response
    is closed before being returned.  Any other ``IOError`` is re-raised.
    """
    def _head_method():
        return 'HEAD'

    try:
        head_req = Request(url, headers=headers)
        head_req.get_method = _head_method
        resp = urlopen(head_req)
    except IOError as err:
        if 'HTTP Error 405' not in str(err):
            raise
        # HEAD is not supported by this server: retry without overriding
        # the method and close the response straight away.
        resp = urlopen(Request(url, headers=headers))
        resp.close()
    # urllib follows redirections on its own; disabling that would take too
    # much code, so the response returned is the one after any redirects.
    return resp
示例#5
0
def save_url(url, name, ext, status, part=None, reporthook=simple_hook):
    """Download ``url`` to ``name.ext`` (or ``name_<part>_.ext``) with resume.

    When the target file already contains data, the range requested starts
    at the file's *last* byte rather than one past it.  Asking for
    ``bytes=<filesize>-`` on a file that is already complete requests a
    range past the end of the resource, which servers reject with HTTP 416,
    so the "already downloaded" case could never be detected.  The one
    overlapping byte is discarded before appending.

    If the server does not reply ``206`` the full body is written from
    scratch.

    Args:
        url: URL to fetch.
        name: Destination path without extension.
        ext: File extension, without the leading dot.
        status: Mutable sequence/mapping; ``status[part]`` is set to ``1``
            when the file on disk matches the reported size.
        part: Part index, or ``None`` for a single-file download (index 0
            is then used in ``status``).
        reporthook: Progress callback, called as
            ``reporthook(blocknum, bs, size)``.

    Returns:
        None.
    """
    if part is None:
        print("Download: " + name)
        name = name + '.' + ext
        part = 0
    else:
        print("Download: " + name + " part %d" % part)
        name = name + '_%d_.' % part + ext
    bs = 1024 * 8
    size = -1
    blocknum = 0
    open_mode = 'wb'
    response = None
    req = Request(url, headers=fake_headers)
    try:
        filesize = os.path.getsize(name) if os.path.exists(name) else 0
        if filesize:
            # One byte of overlap keeps the range satisfiable even when the
            # file is already complete.
            req.add_header('Range', 'bytes=%d-' % (filesize - 1))
            response = urlopen(req, None)
            if response.status == 206:
                size = int(response.headers['Content-Range'].split('/')[-1])
                if filesize == size:
                    print('Skipped: file already downloaded')
                    status[part] = 1
                    return
                if filesize < size:
                    blocknum = filesize // bs
                    open_mode = 'ab'
                    response.read(1)  # drop the byte we already have
        if response is None:
            # Missing or empty file: plain full download.
            response = urlopen(req, None)
        if size < 0:
            size = int(response.headers.get('Content-Length', -1))
        reporthook(blocknum, bs, size)
        with open(name, open_mode) as tfp:
            while True:
                block = response.read(bs)
                if not block:
                    break
                tfp.write(block)
                blocknum += 1
                reporthook(blocknum, bs, size)
    finally:
        if response is not None:
            response.close()
    # Only flag the part as complete when the bytes on disk match the size
    # the server announced.
    if os.path.exists(name):
        filesize = os.path.getsize(name)
        if filesize == size:
            status[part] = 1
示例#6
0
def get_content(url, headers=fake_headers, data=None, charset=None):
    """Fetch a URL and return its (decompressed, usually decoded) body.

    Args:
        url: A URL.
        headers: Request headers used by the client.
        data: Optional request payload, forwarded to ``Request`` unchanged.
        charset: Codec used to decode the body.  ``None`` means detect it:
            first from the ``Content-Type`` response header, then from a
            ``charset=`` declaration inside the body, finally ``'utf-8'``.
            The special value ``'ignore'`` skips decoding.

    Returns:
        The content as a string.  The raw (decompressed) bytes are returned
        instead when ``charset == 'ignore'`` or when the codec is unknown
        or unusable, in which case a warning is logged.
    """
    logger.debug("get_content> URL: " + url)
    req = Request(url, headers=headers, data=data)
    #if cookies_txt:
    #    cookies_txt.add_cookie_header(req)
    #    req.headers.update(req.unredirected_hdrs)
    response = urlopen(req)
    try:
        body = response.read()
        resheader = response.info()
    finally:
        response.close()

    # Handle HTTP compression for gzip and deflate (zlib)
    if 'Content-Encoding' in resheader:
        content_encoding = resheader['Content-Encoding']
    elif hasattr(resheader, 'get_payload'):
        # Fall back to searching the header object's payload text for a
        # Content-Encoding line.
        payload = resheader.get_payload()
        if isinstance(payload, str):
            content_encoding = match1(payload, r'Content-Encoding:\s*([\w-]+)')
        else:
            content_encoding = None
    else:
        content_encoding = None
    if content_encoding == 'gzip':
        body = ungzip(body)
    elif content_encoding == 'deflate':
        body = undeflate(body)

    if charset == 'ignore':
        return body

    # Decode the response body
    if charset is None:
        if 'Content-Type' in resheader:
            charset = match1(resheader['Content-Type'], r'charset=([\w-]+)')
        # Raw strings throughout: '\w' in a normal literal is an invalid
        # escape sequence.
        charset = charset or match1(str(body), r'charset=\"([\w-]+)',
                                    r'charset=([\w-]+)') or 'utf-8'
    logger.debug("get_content> Charset: " + charset)
    try:
        return body.decode(charset, errors='replace')
    except (LookupError, ValueError):
        # Unknown or non-text codec name: hand back the bytes untouched.
        logger.warning("wrong charset for {}".format(url))
        return body
示例#7
0
def get_head_response(url, headers=fake_headers):
    """Return the response to a HEAD request for ``url``.

    If the request fails with an ``IOError`` whose text matches
    ``HTTP Error 403``, ``404`` or ``405``, the URL is fetched through
    ``get_response`` instead and that response is closed before being
    returned.  Any other ``IOError`` propagates to the caller.
    """
    logger.debug('get_head_response> URL: ' + url)
    try:
        resp = urlopen(Request(url, headers=headers, method='HEAD'))
    except IOError as err:
        head_rejected = match1(str(err), 'HTTP Error (40[345])')
        if not head_rejected:
            raise
        # HEAD is not supported by this server
        logger.debug('get_head_response> HEAD failed, try GET')
        resp = get_response(url, headers=headers)
        resp.close()
    # urllib follows redirections on its own; disabling that would take too
    # much code, so the response returned is the one after any redirects.
    return resp
示例#8
0
文件: html.py 项目: ytwangli/ykdl
def get_content(url, headers=fake_headers, data=None, charset=None):
    """Fetch a URL and return its (decompressed, usually decoded) body.

    Args:
        url: A URL.
        headers: Request headers used by the client.
        data: Optional request payload, forwarded to ``Request`` unchanged.
        charset: Codec used to decode the body.  ``None`` means detect it:
            first from the ``Content-Type`` response header (if present),
            then from a ``charset=`` declaration inside the body, finally
            ``'utf-8'``.  The special value ``'ignore'`` skips decoding.

    Returns:
        The content as a string.  The raw (decompressed) bytes are returned
        instead when ``charset == 'ignore'`` or when decoding with the
        chosen codec fails, in which case a warning is emitted.
    """

    req = Request(url, headers=headers, data=data)
    #if cookies_txt:
    #    cookies_txt.add_cookie_header(req)
    #    req.headers.update(req.unredirected_hdrs)
    response = urlopen(req)
    try:
        body = response.read()
        resheader = response.info()
    finally:
        response.close()

    # Handle HTTP compression for gzip and deflate (zlib)
    if 'Content-Encoding' in resheader:
        content_encoding = resheader['Content-Encoding']
    else:
        content_encoding = None
    if content_encoding == 'gzip':
        body = ungzip(body)
    elif content_encoding == 'deflate':
        body = undeflate(body)

    if charset == 'ignore':
        return body

    # Decode the response body
    if charset is None:
        # A response without Content-Type yields None for the header value;
        # only run the regex when the header is really there.
        if 'Content-Type' in resheader:
            charset = match1(resheader['Content-Type'], r'charset=([\w-]+)')
        charset = charset or \
              match1(str(body), r'charset=\"([^\"]+)', 'charset=([^"]+)') or 'utf-8'
    try:
        return body.decode(charset)
    except (LookupError, ValueError):
        # LookupError: unknown codec; ValueError covers UnicodeDecodeError.
        from .log import w
        w("wrong charset for {}".format(url))
        return body
示例#9
0
文件: html.py 项目: devount/ykdl
def get_location(url, headers = fake_headers):
    """Return the URL that ``url`` finally resolves to.

    Args:
        url: URL to request.
        headers: Request headers used by the client.

    Returns:
        The value of ``response.geturl()`` for the opened response.
    """
    response = urlopen(Request(url, headers = headers))
    try:
        # urllib will follow redirections and it's too much code to tell
        # urllib not to do that
        return response.geturl()
    finally:
        # Only the URL is needed; release the connection without reading
        # the body.
        response.close()
示例#10
0
def _save_url(url, name, ext, status, part=None, reporthook=multi_hook):
    """Download ``url`` to ``name.ext`` (or ``name_<part>.ext``) with resume.

    All user-visible output and progress goes through ``reporthook``.

    Args:
        url: URL to fetch.
        name: Destination path without extension.
        ext: File extension, without the leading dot.
        status: Mutable container; ``status[part]`` is set to ``1`` when
            the part is considered complete.
        part: Part index, or ``None`` for a single file (index 0 is then
            used).
        reporthook: Callback receiving a list describing the event
            (``['part']``, ``['print', args, kwargs]`` or
            ``['part end', status, downloaded]``) followed by progress
            values.

    Returns:
        ``True`` when the part was already complete or finished
        downloading; ``None`` otherwise (including when the stream ends
        while skipping already-downloaded bytes).
    """
    # Shadows the builtin on purpose: messages are forwarded to reporthook
    # instead of being written to stdout.
    def print(*args, **kwargs):
        reporthook(['print', args, kwargs])

    # Reads up to ``bs`` bytes from ``response``.  ``size``, ``fd``,
    # ``timeout`` and ``response`` are looked up in the enclosing scope at
    # call time, so they must be assigned before the first call that
    # needs them.
    def read_response(bs):
        if size > 0:
            # an independent timeout for reading the response: wait until
            # the socket is readable, or give up after ``timeout`` seconds
            rd, _, ed = select.select([fd], [], [fd], timeout)
            if ed:
                raise socket.error(ed)
            if not rd:
                raise socket.timeout('The read operation timed out')
        return response.read(bs)

    if part is None:
        name = name + '.' + ext
        part = 0
    else:
        name = '%s_%d.%s' % (name, part, ext)
    bs = 8192
    size = -1  # total size in bytes; -1 while unknown
    filesize = 0  # bytes currently on disk
    downloaded = 0  # bytes written by this call
    open_mode = 'wb'
    response = None
    # Never wait less than 60 seconds, whatever the global default is.
    timeout = max(socket.getdefaulttimeout() or 0, 60)
    req = Request(url, headers=fake_headers)
    try:
        reporthook(['part'], part=part)
        if os.path.exists(name):
            filesize = os.path.getsize(name)
            if filesize:
                req.add_header('Range', 'bytes=%d-' %
                               (filesize - 1))  # get +1, avoid 416
                response = urlopen(req, None)
                # set_rcvbuf is defined elsewhere; presumably it tunes the
                # socket receive buffer -- TODO confirm.
                set_rcvbuf(response)
                if response.status == 206:
                    # Range honoured: total size is the part after '/' in
                    # Content-Range; one overlapping byte must be skipped.
                    size = int(
                        response.headers['Content-Range'].split('/')[-1])
                    needless_size = 1
                elif response.status == 200:
                    # Range ignored, full body follows: everything already
                    # on disk has to be skipped.
                    size = int(response.headers.get('Content-Length', -1))
                    needless_size = filesize
                if filesize == size:
                    print('Skipped: file part %d has already been downloaded' %
                          part)
                    status[part] = 1
                    return True
                if filesize < size:
                    percent = int(filesize * 100 / size)
                    open_mode = 'ab'
                    print('Restored: file part %d is incomplete at %d%%' %
                          (part, percent))
                    reporthook(['part'], filesize, size, part)
                    fd = response.fileno()
                    # Discard the bytes that are already in the file.
                    while needless_size > 0:
                        if needless_size > bs:
                            block = read_response(bs)
                        else:
                            block = read_response(needless_size)
                        if not block:
                            # Stream ended before reaching new data.
                            return
                        needless_size -= len(block)
                # NOTE(review): when the existing file is larger than the
                # reported size neither branch above runs; the file is then
                # reopened in 'wb' mode below while ``filesize`` still holds
                # the old length, so the download loop does not execute --
                # confirm this is intended.
        if response is None:
            # No usable partial file: plain full download.
            response = urlopen(req, None)
            set_rcvbuf(response)
            fd = response.fileno()
        if size < 0:
            size = int(response.headers.get('Content-Length', -1))
        with open(name, open_mode) as tfp:
            # With an unknown size, read until the stream is exhausted.
            while size < 0 or filesize < size:
                block = read_response(bs)
                if not block:
                    break
                n = tfp.write(block)
                downloaded += n
                filesize += n
                reporthook(['part'], filesize, size, part)
        if os.path.exists(name):
            filesize = os.path.getsize(name)
            # Complete when non-empty and either the size was never known
            # or the bytes on disk match it.
            if filesize and (size < 0 or filesize == size):
                status[part] = 1
                return True
    finally:
        # Runs on every exit path, including early returns and errors.
        time.sleep(1)
        reporthook(['part end', status, downloaded], filesize, size, part)
示例#11
0
def get_response(url, headers=fake_headers, data=None):
    """Open ``url`` and return the response object produced by ``urlopen``.

    Args:
        url: A URL.
        headers: Request headers used by the client.
        data: Optional request payload, forwarded to ``Request`` unchanged.
    """
    # NOTE: cookie support (cookies_txt.add_cookie_header) is disabled here.
    return urlopen(Request(url, data=data, headers=headers))
示例#12
0
def save_url(url, name, ext, status, part=None, reporthook=simple_hook):
    """Stream ``url`` into a local file, then move it to ``/root/b/``.

    The body is fetched with ``requests`` in streaming mode and appended to
    ``name.ext`` (or ``name_<part>_.ext``).  Writing stops after 131072
    chunks; the file is then moved to ``/root/b/``.  On exit, a remaining
    file smaller than ``1024 * 600`` bytes is deleted and a larger one is
    moved to ``/root/b/``.  All exceptions are caught and printed as a
    traceback; nothing is raised to the caller.

    Args:
        url: URL to fetch; its scheme is replaced by ``http://``.
        name: Destination path without extension.
        ext: File extension, without the leading dot.
        status: Mutable container meant to receive ``1`` at index ``part``
            (or ``0``) on completion -- see the NOTE(review) on ``size``.
        part: Part index, or ``None`` for a single file.
        reporthook: Progress callback, called as
            ``reporthook(blocknum, bs, size, name)``.

    Returns:
        None.
    """
    if part is None:
        print("Download: " + name)
        name = name + '.' + ext
    else:
        print("Download: " + name + " part %d" % part)
        name = name + '_%d_.' % part + ext
    bs = 1024 * 8
    # NOTE(review): ``size`` is never reassigned in this function, so the
    # ``filesize == size`` test further down cannot succeed and ``status``
    # is never updated -- confirm whether that is intended.
    size = -1
    read = 0  # unused
    blocknum = 0
    open_mode = 'ab+'
    # NOTE(review): ``req`` is built but not used below; the transfer goes
    # through requests.get() without ``fake_headers``.
    req = Request(url, headers=fake_headers)
    try:
        # Force plain HTTP whatever scheme the caller passed.
        url = 'http://' + url.split("://")[-1]
        hasproxy = 0
        retry = 0
        # The outer loop body runs once: it always ends with ``break``.
        while True:
            # Connection attempts: the first one is direct, every attempt
            # after a failure goes through a proxy from get_proxy().
            while 1:
                if hasproxy:
                    proxy = {'http': get_proxy()}
                else:
                    proxy = 0
                try:
                    r = requests.get(url,
                                     proxies=proxy,
                                     stream=True,
                                     timeout=(5, 8))
                    # NOTE(review): a 404 is accepted like a 200, so its
                    # body is written to the file below -- confirm.
                    if r.status_code == 200 or r.status_code == 404:
                        break
                    else:
                        r.close()
                        raise Exception(r.status_code)
                except:
                    hasproxy = 1
                    if retry < 10:
                        retry += 1
                        continue
                    else:
                        # NOTE(review): retries exhausted; ``r`` may be
                        # unbound or already closed here, in which case the
                        # code below fails and the outer ``except`` prints
                        # the traceback.
                        break
            reporthook(blocknum, bs, size, name)
            tfp = open(name, open_mode)
            for chunk in r.iter_content(chunk_size=bs):
                if (chunk):
                    tfp.write(chunk)
                    blocknum += 1
                    # Report progress every 100 chunks only.
                    if blocknum % 100 == 0:
                        reporthook(blocknum, bs, size, name)
                    # 131072 chunks of at most ``bs`` (8192) bytes: up to
                    # 1 GiB per file.
                    if (blocknum >= 131072):
                        tfp.close()
                        r.close()
                        # Message: "file size limit reached, stopping".
                        print('文件大小达到限制,结束')
                        #os.system('mv "{}" /root/b/'.format(name))
                        shutil.move(name, '/root/b/')
                        # Build a new timestamp-prefixed name from the part
                        # of the old name after its first '-'.  Nothing is
                        # written under that name here, because the loop
                        # ends right after.
                        namepart = name.split('-', 1)
                        name = time.strftime(
                            '%y%m%d_%H%M%S') + "-" + namepart[-1]
                        blocknum = 0
                        break
                        #tfp = open(name, open_mode)
                else:
                    # Message: "<name> no chunk".
                    print(name, "无 chunk")
                    break
            break
        if os.path.exists(name):
            filesize = os.path.getsize(name)
            if filesize == size:
                if part is None:
                    status[0] = 1
                else:
                    status[part] = 1
    except:
        # Best effort: report the error and fall through to the cleanup.
        traceback.print_exc()
    finally:
        # ``r`` and ``tfp`` only exist if execution got far enough to bind
        # them, hence the locals() checks.
        if "r" in locals():
            r.close()
        if "tfp" in locals():
            tfp.close()
        if os.path.exists(name):
            filesize = os.path.getsize(name)
            # The threshold is 600 KiB, although the message below says
            # "smaller than 1mb, deleting".
            if filesize < 1024 * 600:
                print(name, "大小不足1mb,删除")
                os.remove(name)
            else:
                #os.system('mv "{}" /root/b/'.format(name))
                shutil.move(name, '/root/b/')