Exemplo n.º 1
0
def downLoadImg(source_url, content_html):
    """Rewrite the ``src`` of every ``<img>`` in an HTML fragment to an absolute URL.

    Each ``<img>`` element's ``src`` attribute is resolved against
    ``source_url``.  Sources that resolve to something starting with
    ``http`` are rewritten in ``content_html``; anything else (for example
    ``data:`` URIs) is left untouched.  Only ``src`` is consulted; lazy-load
    attributes such as ``data-src`` are not read.

    The follow-up download step is currently disabled (see the TODO below):
    ``ImageUtil.downLoadImage`` is called with an empty list.

    Args:
        source_url: URL of the page the fragment came from; used as the base
            for resolving relative image URLs.
        content_html: HTML text to process.

    Returns:
        ``content_html`` with the image sources rewritten.

    Side effects:
        Prints one line per distinct image URL found.
    """
    import re
    try:
        from urllib.parse import urljoin
    except ImportError:  # Python 2
        from urlparse import urljoin

    selector = Selector(text=content_html)

    # Maps each distinct original src value to its absolute form.
    absolute_by_src = {}
    image_urls = []
    for img in selector.xpath(u'descendant::img'):
        image_url_base = img.xpath(u'@src').extract_first('')
        if not image_url_base or image_url_base in absolute_by_src:
            # No src at all, or the same src was already handled.
            continue
        # urljoin covers '//host/x', '/x', './x', '../x', bare relative paths
        # and already-absolute URLs, and returns foreign schemes (data:,
        # javascript:, ...) unchanged so they are filtered out below.
        image_url = urljoin(source_url, image_url_base)
        if not image_url.startswith(u'http'):
            continue
        print(u'得到图片:' + image_url)
        image_urls.append({
            u'url': image_url,
        })
        absolute_by_src[image_url_base] = image_url

    if absolute_by_src:
        # Rewrite every src in ONE pass.  Replacing one src at a time would
        # re-scan text that was just inserted, so a src that is a substring
        # of an absolute URL (e.g. '/a.png' inside 'http://h/a.png') would be
        # expanded twice.  Longest alternatives go first so that the most
        # specific src wins when several could match at the same position.
        pattern = re.compile(u'|'.join(
            re.escape(src)
            for src in sorted(absolute_by_src, key=len, reverse=True)
        ))
        content_html = pattern.sub(
            lambda match: absolute_by_src[match.group(0)], content_html)

    # TODO: downloading is switched off for now, so the collected list is
    # deliberately discarded and an empty one is passed on.
    image_urls = []
    # The items returned are read as dicts with 'url' (original address) and
    # 'image_url' (replacement address) keys.
    result_image_urls = ImageUtil.downLoadImage(image_urls)
    for item in result_image_urls:
        url = item.get(u'url', u'')
        image_url = item.get(u'image_url', u'')
        if not url:
            # str.replace('', x) would insert x between every character.
            continue
        # NOTE(review): replace(u'&', u'&') is a no-op as written; it may have
        # been meant to turn the HTML entity for '&' back into a bare '&' so
        # that URLs match the raw markup -- confirm against the upstream code.
        content_html = content_html.replace(u'&',
                                            u'&').replace(url, image_url)
    return content_html