def readPageGallery(self, search_key, page=0):
    """Crawl every result page for *search_key* until the site reports
    "页面不存在", handing each page to readPageSearchThread.

    :param search_key: 查询关键字 — search keyword, substituted into the URL
    :param page: page number to start from (kept for backward
        compatibility with the old recursive implementation)
    :return: None
    """
    root_folder = self.folder_path + search_key
    Soup.create_folder(root_folder)
    # Iterate instead of recursing so a long result list cannot hit
    # Python's recursion limit.
    while True:
        print('max_page=', page)
        # Page 0/1 is the bare search URL; later pages append "<n>.html".
        if page <= 1:
            page_url = self.search_page_url.replace(':key', search_key)
        else:
            page_url = self.search_page_url.replace(
                ':key', search_key) + str(page) + '.html'
        print('max_page', page, 'page_url', page_url)
        soup_html = Soup.get_soup(page_url)
        # The listdiv text contains "页面不存在" once we run past the
        # last page — that is the loop's termination signal.
        is_404 = soup_html.find('div', 'listdiv').get_text()
        print(is_404)
        if '页面不存在' in is_404:
            print('没有了,over')
            return
        self.readPageSearchThread(soup_html, root_folder)
        page = page + 1
        print(page)
def readPageOne(self, page_one_key, root_folder=None):
    """Download one complete gallery identified by *page_one_key*.

    :param page_one_key: 24816 (代表https://www.nvshens.com/g/24816/)
    :param root_folder: 根目录 — destination root; defaults to
        self.folder_path when omitted
    :return: None
    """
    # Wait a few seconds first to avoid anti-crawler measures.
    sleep(self.sleep_time)
    if root_folder is None:
        root_folder = self.folder_path
    # Open the gallery's first page.
    page_url = self.one_page_url.replace(':key', page_one_key)
    print(page_url)
    soup_html = Soup.get_soup(page_url)
    # Folder name: /(key)<gallery title>  ('htilte' is the site's own typo).
    htitle = soup_html.find("h1", {'id': 'htilte'}).get_text()
    path = root_folder + '/(' + page_one_key + ')' + htitle
    print(path)
    Soup.create_folder(path)
    # "<n>张照片" gives the total number of images in the gallery.
    text_page = soup_html.find("div", {
        'id': 'dinfo'
    }).find('span').get_text()
    print('text_page', text_page)
    last = text_page.replace('张照片', '')
    item_size = int(last)
    # First image.
    image_one = soup_html.find("ul", {'id': 'hgallery'}).find('img')
    image_one_url = image_one.get('src')
    print('image_one_url', image_one_url)
    # The second image's URL serves as the template for all later ones:
    # 第1张 <img src="https://img.onvshen.com:85/gallery/25366/24816/0.jpg">
    # 第2张 <img src="https://img.onvshen.com:85/gallery/25366/24816/001.jpg">
    # 第3张 <img src="https://img.onvshen.com:85/gallery/25366/24816/002.jpg">
    image_two = image_one.find_next_sibling()
    image_two_url = image_two.get('src')
    print('image_two_url', image_two_url)
    print('item_size=====', item_size)
    img_hz = image_two_url.split("/")[-1]
    # rsplit from the right so a file name containing extra dots
    # (e.g. "001.v2.jpg") still yields the real extension.
    file_hz = img_hz.rsplit('.', 1)[-1]
    img_mod_url = image_two_url.replace(img_hz, '')
    print('img_hz', img_hz, 'file_hz', file_hz, 'img_mod_url', img_mod_url)
    # Image 0 has a non-templated name — write it directly.
    self.readPagetoTxt(image_one_url, path + '/0.' + file_hz,
                       self.sleep_time)
    # Then fetch images 1..item_size-1 via the thread pool.
    self.readPageByThread(item_size, path, img_mod_url, file_hz)
def write_img(self, page_url, path, _time):
    """Best-effort download of one image: sleep, fetch, swallow errors.

    :param page_url: direct image URL
    :param path: destination file path
    :param _time: seconds to sleep before the request (anti-crawler)
    :return: None — failures are printed, not raised
    """
    # Wait a few seconds first to avoid anti-crawler measures.
    sleep(_time)
    try:
        # Soup.write_img sends the request with headers; the Referer
        # header is the crucial one to defeat hotlink protection.
        Soup.write_img(page_url, path, referer=self.index_page_url)
    except Exception as msg:
        # Narrowed from BaseException: never swallow KeyboardInterrupt
        # or SystemExit, only real download errors.
        print(msg)
def readPagetoTxt(self, page_url, path, _time):
    """Download a single image to *path* after an anti-crawler delay.

    Unlike write_img(), this variant lets download errors propagate.

    :param page_url: direct image URL
    :param path: destination file path
    :param _time: seconds to sleep before the request (anti-crawler)
    :return: None
    """
    # Wait a few seconds first to avoid anti-crawler measures.
    sleep(_time)
    # urllib.request.urlretrieve(page_url, path) is NOT used: the server
    # treats plain retrieval as hotlinking and blocks it.
    # Soup.write_img sends proper request headers instead — the Referer
    # header is the crucial one.
    Soup.write_img(page_url, path, referer=self.index_page_url)

# NOTE(review): the triple-quote below opens a string literal that appears
# to comment out the legacy single-page readPageSearch that follows —
# confirm where it is closed before restructuring this region.
'''
def readPageSearch(self, search_key):
    """单页读取 — single-page variant: read one search page for
    *search_key* and pass it to readPageSearchThread.

    NOTE(review): another readPageSearch is defined later in this file;
    if both are live, the later definition shadows this one. This looks
    like a legacy copy kept behind a triple-quoted string — confirm
    before removing.

    :param search_key: jiemeihua (代表https://www.nvshens.com/gallery/jiemeihua/)
    :return: None
    """
    # Mirror folder layout: <folder_path>/<search_key>
    root_folder = self.folder_path + search_key
    Soup.create_folder(root_folder)
    page_url = self.search_page_url.replace(':key', search_key)
    print(page_url)
    soup_html = Soup.get_soup(page_url)
    self.readPageSearchThread(soup_html, root_folder)
def readPageFromSearch(self, search_key):
    """Download a gallery whose page is addressed directly by *search_key*.

    Near-duplicate of readPageOne, but the destination folder is simply
    <folder_path>/<search_key> without the gallery title.

    :param search_key: key substituted into one_page_url
    :return: None
    """
    # Create the destination folder <folder_path>/<search_key>.
    path = self.folder_path + search_key
    Soup.create_folder(path)
    # Open the gallery's first page.
    page_url = self.one_page_url.replace(':key', search_key)
    print(page_url)
    soup_html = Soup.get_soup(page_url)
    # "<n>张照片" gives the total number of images in the gallery.
    text_page = soup_html.find("div", {
        'id': 'dinfo'
    }).find('span').get_text()
    print('text_page', text_page)
    last = text_page.replace('张照片', '')
    item_size = int(last)
    # First image.
    image_one = soup_html.find("ul", {'id': 'hgallery'}).find('img')
    image_one_url = image_one.get('src')
    print('image_one_url', image_one_url)
    # The second image's URL serves as the template for all later ones:
    # 第1张 <img src="https://img.onvshen.com:85/gallery/25366/24816/0.jpg">
    # 第2张 <img src="https://img.onvshen.com:85/gallery/25366/24816/001.jpg">
    # 第3张 <img src="https://img.onvshen.com:85/gallery/25366/24816/002.jpg">
    image_two = image_one.find_next_sibling()
    image_two_url = image_two.get('src')
    print('image_two_url', image_two_url)
    print('item_size=====', item_size)
    img_hz = image_two_url.split("/")[-1]
    # rsplit from the right so a file name containing extra dots
    # (e.g. "001.v2.jpg") still yields the real extension.
    file_hz = img_hz.rsplit('.', 1)[-1]
    img_mod_url = image_two_url.replace(img_hz, '')
    print('img_hz', img_hz, 'file_hz', file_hz, 'img_mod_url', img_mod_url)
    # Image 0 has a non-templated name — write it directly.
    self.readPagetoTxt(image_one_url, path + '/0.' + file_hz,
                       self.sleep_time)
    # Then fetch images 1..item_size-1 via the thread pool.
    self.readPageByThread(item_size, path, img_mod_url, file_hz)
def readPageSearch(self, search_key):
    """Crawl one search page and download every gallery it links to,
    spawning one worker thread per galleryli_link.

    :param search_key: jiemeihua (代表https://www.nvshens.com/gallery/jiemeihua/)
    :return: None
    """
    root_folder = self.folder_path + search_key
    Soup.create_folder(root_folder)
    page_url = self.search_page_url.replace(':key', search_key)
    print(page_url)
    soup_html = Soup.get_soup(page_url)
    # Build one MyThread per gallery link found on the page; the key is
    # the second path segment of the link's href.
    workers = []
    for anchor in soup_html.find_all("a", {'class': 'galleryli_link'}):
        page_one_key = anchor.get('href').split('/')[2]
        print('page_one_key', page_one_key)
        workers.append(
            MyThread(self.readPageOne, (page_one_key, root_folder),
                     self.readPageOne.__name__))
    # (A debug throttle that capped this loop at 3 links during testing
    # used to live here; it is intentionally disabled.)
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    print('all end', ctime())