예제 #1
0
    def readPageGallery(self, search_key, page=0):
        """
        Walk every result page for *search_key*, downloading each one,
        until the site reports that the page does not exist.

        Rewritten from tail recursion to a loop so that a gallery with
        many pages cannot exhaust Python's recursion limit. Also fixes a
        duplicate fetch: with the original code, starting at the default
        page=0 fetched the first page twice (page 0 and page 1 both map
        to the suffix-less base URL).

        :param search_key: search keyword; also names the download folder
        :param page: page number to start from (0 and 1 both mean page 1)
        :return: None
        """
        root_folder = self.folder_path + search_key
        Soup.create_folder(root_folder)

        # Normalize so 0 and 1 are the same starting point and the base
        # URL is only fetched once.
        page = max(page, 1)

        while True:
            print('max_page=', page)

            # Page 1 has no numeric suffix; later pages append '<n>.html'.
            if page <= 1:
                page_url = self.search_page_url.replace(':key', search_key)
            else:
                page_url = self.search_page_url.replace(
                    ':key', search_key) + str(page) + '.html'

            print('max_page', page, 'page_url', page_url)
            soup_html = Soup.get_soup(page_url)

            # When we walk past the last page, the list container holds a
            # "页面不存在" (page not found) notice instead of results.
            is_404 = soup_html.find('div', 'listdiv').get_text()
            print(is_404)

            if '页面不存在' in is_404:
                print('没有了,over')
                return

            self.readPageSearchThread(soup_html, root_folder)
            page = page + 1
            print(page)
예제 #2
0
    def readPageOne(self, page_one_key, root_folder=None):
        """
        Read one gallery by its numeric key and download all of its images.

        :param page_one_key: gallery id, e.g. '24816'
                             (stands for https://www.nvshens.com/g/24816/)
        :param root_folder: destination root directory; defaults to
                            self.folder_path
        :return: None
        """
        sleep(self.sleep_time)

        if root_folder is None:
            root_folder = self.folder_path

        # Open the first page of the gallery.
        page_url = self.one_page_url.replace(':key', page_one_key)
        print(page_url)
        soup_html = Soup.get_soup(page_url)

        # NOTE: 'htilte' is the site's own (misspelled) element id — do not
        # "correct" it. Folder name: <root>/(<key>)<gallery title>
        htitle = soup_html.find("h1", {'id': 'htilte'}).get_text()
        path = root_folder + '/(' + page_one_key + ')' + htitle
        print(path)
        Soup.create_folder(path)

        # The photo counter reads "<n>张照片" ("<n> photos"); strip the
        # suffix to get the total image count.
        text_page = soup_html.find("div", {
            'id': 'dinfo'
        }).find('span').get_text()
        print('text_page', text_page)
        last = text_page.replace('张照片', '')
        item_size = int(last)

        # First image (named .../0.jpg on this site)
        image_one = soup_html.find("ul", {'id': 'hgallery'}).find('img')
        image_one_url = image_one.get('src')
        print('image_one_url', image_one_url)

        # Second image (.../001.jpg) serves as the URL template for the rest.
        image_two = image_one.find_next_sibling()
        image_two_url = image_two.get('src')
        print('image_two_url', image_two_url)
        # 1st <img src="https://img.onvshen.com:85/gallery/25366/24816/0.jpg">
        # 2nd <img src="https://img.onvshen.com:85/gallery/25366/24816/001.jpg">
        # 3rd <img src="https://img.onvshen.com:85/gallery/25366/24816/002.jpg">

        print('item_size=====', item_size)

        img_hz = image_two_url.split("/")[-1]
        # BUGFIX: split('.')[1] returned the wrong piece for filenames that
        # contain extra dots; rsplit('.', 1)[-1] always yields the extension.
        file_hz = img_hz.rsplit('.', 1)[-1]
        img_mod_url = image_two_url.replace(img_hz, '')

        print('img_hz', img_hz, 'file_hz', file_hz, 'img_mod_url', img_mod_url)

        # Write image #0 directly...
        self.readPagetoTxt(image_one_url, path + '/0.' + file_hz,
                           self.sleep_time)
        # ...then images #1..item_size via the thread pool.
        self.readPageByThread(item_size, path, img_mod_url, file_hz)
예제 #3
0
    def write_img(self, page_url, path, _time):
        """
        Download one image to *path* after an anti-crawler delay.

        :param page_url: image URL to fetch
        :param path: destination file path
        :param _time: seconds to sleep before the request
        :return: None
        """
        # Wait a few seconds first to avoid triggering anti-crawling measures.
        sleep(_time)

        try:
            # Fetch via a Request with custom headers — the Referer header is
            # the crucial part (hot-link protection) — then write to disk.
            Soup.write_img(page_url, path, referer=self.index_page_url)
        except Exception as msg:
            # BUGFIX: was `except BaseException`, which also swallowed
            # KeyboardInterrupt/SystemExit and made the crawler unstoppable.
            print(msg)
예제 #4
0
    def readPagetoTxt(self, page_url, path, _time):
        """
        Download one image to *path* synchronously.

        :param page_url: image URL to fetch
        :param path: destination file path (including extension)
        :param _time: seconds to sleep before the request
        """

        # Wait a few seconds first to avoid triggering anti-crawling measures.
        sleep(_time)

        # Plain urllib is rejected by the host as a hot-linked request:
        # urllib.request.urlretrieve(page_url, path)

        # Instead, fetch via a Request with custom headers — the Referer
        # header is the crucial part — then write the bytes to disk.
        Soup.write_img(page_url, path, referer=self.index_page_url)
        '''
예제 #5
0
    def readPageSearch(self, search_key):
        """
        Single-page read: fetch the search page for *search_key* and hand
        every 'galleryli_link' on it to the threaded downloader.

        :param search_key: jiemeihua
                           (stands for https://www.nvshens.com/gallery/jiemeihua/)
        :return: None
        """
        # Everything for this keyword lands under <folder_path><search_key>.
        target_dir = self.folder_path + search_key
        Soup.create_folder(target_dir)

        # Build the search URL, fetch it, and dispatch in one go.
        url = self.search_page_url.replace(':key', search_key)
        print(url)
        self.readPageSearchThread(Soup.get_soup(url), target_dir)
예제 #6
0
    def readPageFromSearch(self, search_key):
        """
        Read the first gallery page for *search_key* and download all of
        its images.

        :param search_key: gallery key substituted into self.one_page_url
        :return: None
        """

        # Create folder /magnet/<search_key>
        path = self.folder_path + search_key
        Soup.create_folder(path)

        # Open the first page of the results.
        page_url = self.one_page_url.replace(':key', search_key)
        print(page_url)
        soup_html = Soup.get_soup(page_url)

        # The photo counter reads "<n>张照片" ("<n> photos"); strip the
        # suffix to get the total image count.
        text_page = soup_html.find("div", {
            'id': 'dinfo'
        }).find('span').get_text()
        print('text_page', text_page)
        last = text_page.replace('张照片', '')
        item_size = int(last)

        # First image (named .../0.jpg on this site)
        image_one = soup_html.find("ul", {'id': 'hgallery'}).find('img')
        image_one_url = image_one.get('src')
        print('image_one_url', image_one_url)

        # Second image (.../001.jpg) serves as the URL template for the rest.
        image_two = image_one.find_next_sibling()
        image_two_url = image_two.get('src')
        print('image_two_url', image_two_url)
        # 1st <img src="https://img.onvshen.com:85/gallery/25366/24816/0.jpg">
        # 2nd <img src="https://img.onvshen.com:85/gallery/25366/24816/001.jpg">
        # 3rd <img src="https://img.onvshen.com:85/gallery/25366/24816/002.jpg">

        print('item_size=====', item_size)

        img_hz = image_two_url.split("/")[-1]
        # BUGFIX: split('.')[1] returned the wrong piece for filenames that
        # contain extra dots; rsplit('.', 1)[-1] always yields the extension.
        file_hz = img_hz.rsplit('.', 1)[-1]
        img_mod_url = image_two_url.replace(img_hz, '')

        print('img_hz', img_hz, 'file_hz', file_hz, 'img_mod_url', img_mod_url)

        # Write image #0 directly...
        self.readPagetoTxt(image_one_url, path + '/0.' + file_hz,
                           self.sleep_time)
        # ...then images #1..item_size via the thread pool.
        self.readPageByThread(item_size, path, img_mod_url, file_hz)
예제 #7
0
    def readPageSearch(self, search_key):
        """
        Read the search page for *search_key*, collect every
        'galleryli_link' anchor, and download each linked gallery in its
        own thread.

        :param search_key: jiemeihua
                           (stands for https://www.nvshens.com/gallery/jiemeihua/)
        :return: None
        """
        target_dir = self.folder_path + search_key
        Soup.create_folder(target_dir)

        url = self.search_page_url.replace(':key', search_key)
        print(url)
        page_soup = Soup.get_soup(url)

        # One worker thread per gallery link found on the page.
        workers = []
        for anchor in page_soup.find_all("a", {'class': 'galleryli_link'}):
            # href looks like /g/<key>/ — the key is path segment 2.
            gallery_key = anchor.get('href').split('/')[2]
            print('page_one_key', gallery_key)
            workers.append(
                MyThread(self.readPageOne, (gallery_key, target_dir),
                         self.readPageOne.__name__))

        # Start every worker, then wait for all of them to finish.
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

        print('all end', ctime())