Example #1
def find_image_urls(page_url):
    # Find all image URLs on an image page
    soup = commons.soup(page_url, encoding='gbk')
    # print('process page', page_url)
    images = soup.find_all(image_url_pattern)
    # Drop tags without a src attribute
    return [img.get('src') for img in images if img.get('src')]
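Every example on this page goes through a project-level helper commons.soup(url, encoding=None) that is not shown here. Judging from how it is called, it fetches a URL and returns a parsed BeautifulSoup document; a minimal sketch under that assumption (the parser choice and timeout are guesses, not the original implementation):

# Minimal sketch of the assumed commons.soup helper: fetch a URL and
# return a BeautifulSoup document (an assumption, not the original code).
import requests
from bs4 import BeautifulSoup


def soup(url, encoding=None):
    resp = requests.get(url, timeout=30)
    if encoding:
        # Force the declared page encoding, e.g. 'gbk' for legacy Chinese sites
        resp.encoding = encoding
    return BeautifulSoup(resp.text, 'html.parser')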
Example #2
def find_album_urls_by_tag(url):
    # Find all album URLs on a TAG page
    # print('finding albums in {0}'.format(url))
    soup = commons.soup(url, encoding='gbk')
    pages = soup.find_all(page_url_pattern)
    # Drop tags without an href attribute
    return [p.get('href') for p in pages if p.get('href')]
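page_url_pattern (and image_url_pattern in Example #1) is passed directly to soup.find_all(...), which in BeautifulSoup means it is either a compiled regex matched against tag names or, more likely here, a filter function called once per tag. A hypothetical filter of that kind (the regex is a placeholder, not the original pattern):

# Hypothetical tag filter usable as soup.find_all(page_url_pattern):
# BeautifulSoup calls it for every tag and keeps the ones returning True.
import re

_ALBUM_HREF = re.compile(r'\d+\.htm$')  # placeholder pattern (assumption)


def page_url_pattern(tag):
    # Keep only <a> tags whose href looks like an album page URL
    return tag.name == 'a' and bool(_ALBUM_HREF.search(tag.get('href') or ''))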
Example #3
def get_taotu_pages(category_url):
    # Find all pagination URLs under a category
    print('process category: {0}'.format(category_url))
    soup = commons.soup(category_url, encoding='utf8')
    print('process index: {0}'.format(soup.title))
    last_no = get_last_page_no(soup)
    urls = ['{0}/list_{1}.html'.format(category_url, i) for i in range(2, last_no + 1)]
    # for url in urls:
    # download_by_page(url)
    retry = 0
    while True:
        pool = ThreadPool(4)
        try:
            pool.map(download_by_page, urls)
            pool.close()
            pool.join()
            print('all images downloaded completely.')
            break
        except KeyboardInterrupt as e:
            print('download terminated by user, quit now.', e)
            pool.terminate()
            pool.join()
            break
        except Exception as e:
            pool.terminate()
            pool.join()
            retry += 1
            traceback.print_exc()
            try:
                print('download error: {0}, {1} retry in {2}s'.format(
                    e, retry, retry * 20 % 120))
            except Exception:
                pass
            time.sleep(retry * 20 % 120)
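ThreadPool is not imported in the snippet; it is presumably the thread-based pool from the standard library's multiprocessing package (an assumption):

# Thread-based worker pool with the same map/close/join API as multiprocessing.Pool
from multiprocessing.pool import ThreadPool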
Example #4
def get_image_urls_for_taotu(url):
    # Find the image URLs contained in one page of a photo set
    soup = commons.soup(url, encoding='utf8')
    imgs = soup.select('#big-pic')[0].find_all('img')
    urls = [img.get('src') for img in imgs]
    print('found {0} images in {1}'.format(len(urls), url))
    return urls
Example #5
def print_css_links(url):
    soup = commons.soup(url)
    # Collect the hrefs of all stylesheet <link> tags
    raw_css_urls = [
        link["href"] for link in soup.findAll("link")
        if "stylesheet" in link.get("rel", [])
    ]
    # Expand protocol-relative URLs (//host/path) to https
    css_urls = [
        u'https:%s' % u if u.startswith(u'//') else u
        for u in raw_css_urls
    ]
    for css_url in css_urls:
        print(css_url)
Example #6
def find_album_pages(album_url):
    # Find all pagination URLs of an album page
    soup = commons.soup(album_url, encoding='gbk')
    title = soup.title
    # The album id is the file name without its 4-character extension
    album = os.path.basename(album_url)[:-4]
    print('process album', album_url)
    links = soup.find_all(href=re.compile(album))
    page_url_pattern = album_url.replace(album, '{0}')[:-4]
    # print(page_url_pattern)
    pages = [page_url_pattern.format(l.get('href')) for l in links]
    return title.text, set(filter(None, pages))
Example #7
def find_all_tugua_urls(page):
    url = url_tpl.format(page)
    print('fetch {0}'.format(url))
    soup = commons.soup(url, encoding='gbk')
    links = soup.find_all(filter_page_url)
    if links:
        print('fetch {0} urls in page {1}'.format(len(links), page))
        # Guard the shared urls list; the lock is released even if extend raises
        with lock:
            urls.extend(links)
    return links
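This function relies on several module-level names (url_tpl, filter_page_url, lock, urls) that the snippet does not define. A rough sketch of what they might look like (the template URL is a placeholder, not the original value; filter_page_url would be a tag filter like the one sketched after Example #2):

# Hypothetical module-level state assumed by find_all_tugua_urls
import threading

url_tpl = 'http://example.com/tugua/list_{0}.htm'  # placeholder template (assumption)
lock = threading.Lock()  # guards the shared result list across worker threads
urls = []                # shared list of collected links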
Example #8
def get_all_tags():
    # Get all tags, deduplicated
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
    tags_file = os.path.join(OUTPUT_DIR, 'tags.dat')
    if os.path.isfile(tags_file):
        with open(tags_file, 'rb') as f:
            print('found tags cache, skip fetch remote tags')
            return pickle.load(f)
    # get tags from tags page
    url = 'http://www.umei.cc/tags/'
    soup = commons.soup(url, encoding='gbk')
    urls = soup.find_all(href=tag_pattern)
    tags = [tag_pattern.match(a.get('href')).group(1) for a in urls]
    # get tags from index page
    url = 'http://www.umei.cc/'
    soup = commons.soup(url, encoding='gbk')
    urls = soup.find_all(href=tag_pattern)
    tags.extend([tag_pattern.match(a.get('href')).group(1) for a in urls])
    # deduplicate and sort before caching, so cached and fresh results match
    tags = sorted(set(tags))
    with open(tags_file, 'wb') as f:
        pickle.dump(tags, f)
    return tags
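tag_pattern is a module-level compiled regex: it is passed as the href= filter to find_all and must expose one capture group for the tag slug, since the code calls .match(...).group(1). A plausible sketch (the exact URL layout is an assumption):

# Hypothetical tag_pattern: matches tag page URLs and captures the tag slug
import re

tag_pattern = re.compile(r'http://www\.umei\.cc/tags/(\w+)\.htm')  # assumption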
Example #9
def download_page(item):
    id = item['id']
    url = page_tpl.format(id)
    # Skip if already downloaded
    filename = os.path.join(OUTPUT, '{}.html'.format(id))
    if os.path.exists(filename):
        print('skip page {0}'.format(url))
        return
    print('download page: {0}'.format(url))
    # Must decode as gbk, otherwise traditional Chinese characters are garbled;
    # the page declares gb2312, but browsers actually treat it as gbk
    soup = commons.soup(url, encoding='gbk')
    # Collect all image URLs
    imgs = soup.find_all('img')
    # Directory where the images are saved
    img_dirname = 'images_{0}'.format(id)
    imgdir = os.path.join(OUTPUT, img_dirname)
    if not os.path.exists(imgdir):
        os.mkdir(imgdir)
    # Download the images one by one
    for img in imgs:
        from_src = img['src']
        # Skip images without a file extension
        if not os.path.splitext(from_src)[1]:
            continue
        if not from_src.startswith('http://'):
            # Prepend the domain to relative image URLs
            from_src = 'http://www.dapenti.com/blog/{0}'.format(from_src)
        # Strip characters that are illegal in file names
        to_src = commons.get_safe_filename(from_src)
        imgfile = os.path.join(imgdir, to_src)
        # Rewrite the src to point at the local copy
        img['src'] = os.path.join(img_dirname, to_src)
        if os.path.exists(imgfile):
            # Skip images that already exist
            print('skip exists image {0}'.format(from_src))
        else:
            # Download if not present
            iurl, iname = download_image(from_src, imgfile, id)
            if not iname:
                # If the image could not be downloaded, keep the original URL
                img['src'] = from_src
    tempfile = '{0}.tmp'.format(filename)
    # Save to file once the page body and images have downloaded without errors
    with open(tempfile, 'w') as f:
        # The file is written as UTF-8, so replace gb2312 in the HTML head with utf-8
        content = unicode(soup).replace('charset=gb2312', 'charset=utf-8')
        f.write(content.encode('utf8'))
    commons.safe_rename(tempfile, filename)
    print('page saved {0}'.format(url))
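Besides commons.soup, this example calls two more helpers from the commons module, get_safe_filename and safe_rename. Rough sketches of what they might do, inferred from how they are used here (assumptions, not the original code):

# Hypothetical sketches of the commons helpers used above
import os
import re


def get_safe_filename(url):
    # Drop characters that are not allowed in file names
    return re.sub(r'[\\/:*?"<>|]+', '_', url)


def safe_rename(src, dst):
    # Move the finished temp file into place, replacing any stale copy
    if os.path.exists(dst):
        os.remove(dst)
    os.rename(src, dst)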
Example #10
def download_taotu_images(turl, output=OUTPUT):
    # Download all images of a photo set
    # http://www.aitaotu.com/guonei/5044.html
    # http://www.aitaotu.com/guonei/5044_10.html
    base_url = os.path.dirname(turl)
    page_no = os.path.basename(turl)[:-5]

    # Image output directory, named after the photo set's numeric id
    img_dir = os.path.join(output, page_no)
    if not os.path.exists(img_dir):
        os.makedirs(img_dir)

    # A .dat file means this photo set has already been fully downloaded; skip it
    stat_file = os.path.join(output, '{0}.dat'.format(page_no))
    if os.path.isfile(stat_file):
        print('skip done page: {0}'.format(turl))
        return

    print('downloading page: {0}'.format(turl))

    soup = commons.soup(turl, encoding='utf8')
    # print('process page: {0}'.format(soup.title))
    images = []
    # Collect the image URLs from every page of this photo set
    for i in range(2, get_last_page_no(soup) + 1):
        purl = '{0}/{1}_{2}.html'.format(base_url, page_no, i)
        images.extend(get_image_urls_for_taotu(purl))
    # Download the images one by one
    for iurl in images:
        img_name = os.path.basename(iurl)
        img_file = os.path.join(img_dir, img_name)
        if os.path.isfile(img_file):
            # Skip images that already exist
            print('{0} skip image {1}'.format(page_no, img_file))
        else:
            # Download if not present
            print('{0} downloading {1}'.format(page_no, iurl))
            commons.download_file(iurl, img_file)
    # Everything downloaded without exceptions, so save the status file
    with open(stat_file, 'wb') as f:
        pickle.dump(images, f)
    print('downloaded, save stat {0}'.format(turl))
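The sample URLs in the comment at the top of the function suggest how it is driven; a hedged invocation sketch (the output directory is arbitrary here):

# Example invocation, using the sample URL from the comment in the function
download_taotu_images('http://www.aitaotu.com/guonei/5044.html', output='./taotu')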
Example #11
def find_album_urls_by_index(url):
    # Find all album URLs on a list (index) page
    # print('finding albums in {0}'.format(url))
    soup = commons.soup(url, encoding='gbk')
    pages = soup.find_all(page_url_pattern)
    return [urljoin(DOMAIN, p.get('href')) for p in pages]
Example #12
def get_taotu_urls_for_page(url):
    # Find all photo-set URLs contained in a page
    soup = commons.soup(url, encoding='utf8')
    links = soup.find_all(filter_taotu_url)
    return [urljoin(DOMAIN, l.get('href')) for l in links]