Example #1
import os
import re

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent':
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    'Accept':
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    'Accept-Encoding': 'gzip',
}  # headers used for the initial request
# torrents: 13 pages
URL_1024 = 'http://1024.stv919.pw/pw/thread.php?fid=22&page=2'  # fid 22 has them
#http://1024.stv919.pw/pw/thread.php?fid=14&page=7
URL_youma = 'http://1024.stv919.pw/pw/thread.php?fid=22&page='  # goes up to page 13
URL_wm = 'http://1024.stv919.pw/pw/thread.php?fid=7&page='  # goes up to page 13
for num in range(4, 111):
    start_html = requests.get(URL_wm + str(num), headers=headers)
    start_html.encoding = 'utf-8'
    bsObj = BeautifulSoup(start_html.text, 'html.parser')
    writerLog(URL_wm + str(num))
    for a in bsObj.find("tbody", {
            "style": "table-layout:fixed;"
    }).findAll("a"):
        if ('href' in a.attrs) and ('title' not in a.attrs):
            if re.match(r'^htm_data/.+\.html', a.attrs['href']):
                a_path = get_format_filename(a.text)  # get a file-safe name from the link text
                if not os.path.exists(os.path.join(file_path_wuma, a_path)):
                    os.makedirs(os.path.join(file_path_wuma, a_path))
                    os.chdir(file_path_wuma + '\\' + a_path)  # switch into the folder created above
                    f = open(a_path + '.txt', 'w')  # 'r' read-only, 'w' write, 'a' append
                    f.write(get_inner_link(a.attrs['href']))
                    f.close()
                    Process_SubPage(file_path_wuma + '\\' + a_path,
                                    a.attrs['href'])  # process the sub-page: download images and the torrent
                    # print(get_inner_link(a.attrs['href']))

                    print(a_path + ': done')
            # time.sleep(0.5)  # even with a delay the server still bans us
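
These snippets call several helpers that are never shown (get_format_filename, get_inner_link, writerLog, Process_SubPage). Judging from how its result is passed to os.makedirs and open, get_format_filename most likely strips characters Windows forbids in file names; a minimal sketch under that assumption:

def get_format_filename(title):
    # Assumed helper: turn a post title into a safe Windows file/folder name.
    # Removes characters not allowed in Windows paths and trims whitespace.
    name = re.sub(r'[\\/:*?"<>|\r\n\t]', '', title).strip()
    return name or 'untitled'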
Example #2
import os
import re

import requests
from bs4 import BeautifulSoup

file_path = r'D:\MyProjectFile\Python\studyproject\Python3\StudyPro1\datebase'  # local storage root
headers = {
    'User-Agent':
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    'Accept':
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    'Accept-Encoding': 'gzip',
}  # headers used for the initial request
URL_1024 = 'http://1024.91lulea.click/pw/thread.php?fid=22&page=4'
# The URL points at the '日本骑兵' section; other sections are untested, so correctness isn't guaranteed.
# Just tested: '亚洲无码' works too; presumably every section shares the same HTML layout.

start_html = requests.get(URL_1024, headers=headers)
start_html.encoding = 'utf-8'
bsObj = BeautifulSoup(start_html.text, 'html.parser')
for a in bsObj.find("tbody", {"style": "table-layout:fixed;"}).findAll("a"):
    if ('href' in a.attrs) and ('title' not in a.attrs):
        if re.match(r'^htm_data/.+\.html', a.attrs['href']):
            a_path = get_format_filename(a.text)
            if not os.path.exists(os.path.join(file_path, a_path)):
                os.makedirs(os.path.join(file_path, a_path))
            os.chdir(file_path + '\\' + a_path)  # switch into the folder created above
            f = open(a_path + '.txt', 'w')  # 'r' read-only, 'w' write, 'a' append
            f.write(get_inner_link(a.attrs['href']))
            f.close()
            Process_SubPage(file_path + '\\' + a_path,
                            a.attrs['href'])  # process the sub-page: download images and the torrent
            print(get_inner_link(a.attrs['href']))
            print(a_path + ': done')
            # time.sleep(0.5)  # even with a delay the server still bans us
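
get_inner_link is also never defined. Since the hrefs it receives are relative paths like 'htm_data/22/xxxx.html', it most plausibly resolves them against the forum root; a sketch under that assumption (the base URL is taken from URL_1024 above and may change as the site rotates domains):

from urllib.parse import urljoin

BASE_URL = 'http://1024.91lulea.click/pw/'  # assumed forum root, from URL_1024

def get_inner_link(href):
    # Assumed helper: resolve a relative post href against the forum root.
    return urljoin(BASE_URL, href)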
Example #3
        }).findAll("h3"):
            attrs = a.find("a").attrs['href']
            # if re.match(r'^htm_data/.+.html', attrs):
            print(attrs)
            # get the torrent name from the post page
            seedStr = get_inner_link(attrs)
            seed_html = requests.get(seedStr, headers=get_image_header())
            seed_html.encoding = 'utf-8'
            seedObj = BeautifulSoup(seed_html.text, 'html.parser')
            for seed_a in seedObj.find("div", {"id": "read_tpc"}).findAll("a"):
                if re.match(r'^http://www?\d+.+\.html$', seed_a.attrs['href']):
                    seedUrl = seed_a.attrs['href']
                    seedNum = seedUrl[-12:-5]
                    print(seedNum)

                    a_path = get_format_filename(a.text)  # build the local file name from the video title

                    if not os.path.exists(os.path.join(filePath, seedNum)):
                        os.makedirs(os.path.join(filePath, seedNum))
                    os.chdir(filePath + '/' + seedNum)  # switch into the folder created above
                    f = open(seedNum + '.txt', 'w')  # 'r' read-only, 'w' write, 'a' append
                    f.write(a_path)
                    f.close()
                    Process_SubPage(filePath + '/' + seedNum,
                                    attrs)  # process the sub-page: download images and the torrent
                    print(get_inner_link(attrs))
                    print(a_path + ': done')
                    time.sleep(0.5)  # wait between posts; the server still bans us eventually

                    # # torrent page URL: crawl the torrent
                    # seedStr = get_inner_link(a.attrs['href'])
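
Process_SubPage, which does the actual image and torrent downloading, is not shown in any of the examples. A rough sketch of the image-saving half, with everything except the 'read_tpc' container id (which appears in Example #3) assumed:

def Process_SubPage(save_dir, href):
    # Assumed helper: fetch the post page and save every image in the post body.
    html = requests.get(get_inner_link(href), headers=headers)
    html.encoding = 'utf-8'
    page = BeautifulSoup(html.text, 'html.parser')
    body = page.find("div", {"id": "read_tpc"})
    if body is None:
        return
    for i, img in enumerate(body.findAll("img")):
        src = img.attrs.get('src')
        if src:
            data = requests.get(src, headers=headers).content
            with open(os.path.join(save_dir, '%d.jpg' % i), 'wb') as f:
                f.write(data)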