def crawl(self, item, thread):
    match = re.search(r'nhentai\.net/g/\d+', self.url)
    if not match:
        logger.info("url did not match")
        return None
    if not self.url.startswith('http'):
        self.url = 'https://' + self.url
    session = requests.Session()
    session.headers.update({'User-Agent': ua.get_random_ua()})
    session.proxies.update(config.PROXY)
    try:
        logger.info("fetching " + self.url)
        r = session.get(self.url)
        item.cookies = r.cookies
        selector = etree.HTML(r.text)
        # Gallery metadata: h1 holds the English title, h2 the original (usually Japanese) title.
        en_title = selector.xpath('//*[@id="info"]/h1/text()')
        jp_title = selector.xpath('//*[@id="info"]/h2/text()')
        item.titles = jp_title + en_title
        item.author = selector.xpath('//*[@id="tags"]/div[4]/span[1]/a/text()')
        item.tags = selector.xpath('//*[@id="tags"]/div[3]/span/a/text()')
        item.language = selector.xpath('//*[@id="tags"]/div[6]/span/a/text()')
        # Thumbnail data-src attributes are rewritten into full-size image URLs.
        item.image_urls = selector.xpath('//*[@id="thumbnail-container"]/div/a/img/@data-src')
        item.image_urls = list(map(convert_url, item.image_urls))
        item.source = self.url
        thread.progress = 0.05
        return item
    except ConnectionError as e:
        print(e)
        return None
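# convert_url() used above is not shown in this section. A minimal sketch of what it
# likely does, assuming nhentai's usual thumbnail/full-image URL scheme
# (t.nhentai.net/.../<n>t.<ext> -> i.nhentai.net/.../<n>.<ext>). The host names and
# suffix handling here are assumptions, not the project's actual helper.
def convert_url(thumb_url):
    # e.g. 'https://t.nhentai.net/galleries/12345/3t.jpg'
    #   -> 'https://i.nhentai.net/galleries/12345/3.jpg'
    full = thumb_url.replace('//t.', '//i.', 1)
    base, ext = full.rsplit('.', 1)
    if base.endswith('t'):
        base = base[:-1]  # drop the trailing 't' that marks a thumbnail
    return base + '.' + ext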
def crawl(self, item, thread):
    match = re.search(r'www\.wnacg\.com/photos-index-aid-\d+\.html', self.url)
    if not match:
        print('url did not match')
        return None
    session = requests.Session()
    session.headers.update({'User-Agent': ua.get_random_ua()})
    session.proxies.update(config.PROXY)
    try:
        r = session.get(self.url)
        soup = BeautifulSoup(r.text, "html.parser")
        item.titles = [soup.select('.userwrap h2')[0].string]
        item.image_urls = []
        item.image_urls += get_image_url(item, soup)
        # The second-to-last paginator link holds the number of index pages.
        page = int(soup.select('.f_left.paginator a')[-2].string)
        total_images_count = page * len(soup.select('.li.gallary_item'))
        # Walk the remaining index pages and collect their image URLs.
        for i in range(2, page + 1):
            index_url = "http://www.wnacg.org/photos-index-page-%d-aid-%s.html" % (i, item.id)
            r = session.get(index_url)
            soup = BeautifulSoup(r.text, "html.parser")
            item.image_urls += get_image_url(item, soup)
            thread.progress = 0.10 * (len(item.image_urls) / total_images_count)
        return item
    except ConnectionError as e:
        print(e)
        return None
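# get_image_url(item, soup) is referenced above but defined elsewhere. A minimal sketch
# of the shape it likely has: collect the thumbnail <img> of every '.li.gallary_item'
# entry on one index page and make it an absolute URL. Whether the project also rewrites
# the thumbnail path into a full-size path is an assumption not covered here.
def get_image_url(item, soup):
    urls = []
    for li in soup.select('.li.gallary_item'):
        img = li.select_one('img')
        if img is None:
            continue
        src = img.get('src', '')
        if src.startswith('//'):  # protocol-relative src
            src = 'https:' + src
        urls.append(src)
    return urls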
def generate(self, dir, thread, callback=None):
    self.epub = ComicEpub(dir)
    print('start to download image resources:')
    count = len(self.item.image_urls)
    thread.progress = 1 / (count + 1)
    session = requests.Session()
    session.headers.update({'User-Agent': ua.get_random_ua()})
    session.proxies.update(config.PROXY)
    for (index, url) in enumerate(self.item.image_urls):
        print('[%d/%d] %s ' % (index + 1, count, url), end='')
        sys.stdout.flush()
        r = session.get(url)
        if r.ok:
            thread.progress = (index + 2) / (count + 1)
            print('[OK]')
            image_name = url.split('/')[-1]
            is_cover = (index == 0)  # the first image doubles as the cover
            name, ext = os.path.splitext(image_name)
            self.epub.add_comic_page(r.content, ext, is_cover)
        else:
            print('[FAIL]')
            return False
    print('download completed.')
    self.epub.title = (self.item.titles[0], self.item.titles[0])
    self.epub.subjects = list(self.item.tags)
    self.epub.authors = [(self.item.author, self.item.author)]
    self.epub.publisher = ('Comicbook', 'Comicbook')
    if len(self.item.language) > 0:
        # Skip the generic 'translated' marker; any remaining language tag sets the epub language.
        for language in self.item.language:
            if language == 'translated':
                continue
            self.epub.language = get_language_code(language)
    else:
        # No language tag: fall back to Chinese if the title hints at a Chinese translation.
        if len(self.item.titles) > 0 and (
                '漢化' in self.item.titles[0]
                or '汉化' in self.item.titles[0]
                or '翻譯' in self.item.titles[0]):
            self.epub.language = 'zh'
    print('epubify...')
    self.epub.save()
    print('work done.')
    if callback:
        callback(self.item)
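# get_language_code() used above is not part of this section. A plausible sketch,
# assuming it maps site language tags (e.g. nhentai's 'japanese'/'english'/'chinese')
# to two-letter codes; the exact table and the fallback are assumptions.
def get_language_code(language):
    codes = {
        'japanese': 'ja',
        'english': 'en',
        'chinese': 'zh',
        'korean': 'ko',
    }
    return codes.get(language.lower(), 'en')  # default to English when the tag is unknown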
def crawl(self, item, thread):
    match = re.search(r'wnacg\.org', self.url)
    if not match:
        logger.info("url did not match")
        return None
    if not self.url.startswith('http'):
        self.url = 'https://' + self.url
    session = requests.Session()
    session.headers.update({'User-Agent': ua.get_random_ua()})
    session.proxies.update(config.PROXY)
    try:
        r = session.get(self.url)
        selector = etree.HTML(r.text)
        title = selector.xpath('//*[@id="bodywrap"]/h2/text()')[0]
        pages = []
        img_urls = []
        # Start from the first slide page linked on the gallery index.
        page = selector.xpath('//*[@id="bodywrap"]/div[2]/div/ul/li[1]/div[1]/a')[0].get('href')
        pages.append(get_full_url(page))
        # Follow the "next" link on each slide page until it loops back to the first page.
        while len(pages) == 1 or pages[0] != pages[-1]:
            current_page = pages[-1]
            p = session.get(current_page)
            sel = etree.HTML(p.text)
            img_url = sel.xpath('//*[@id="picarea"]')[0].get('src')
            img_urls.append(img_url)
            next_page = sel.xpath('/html/body/div[8]/div/div/a[2]')[0].get('href')
            pages.append(get_full_url(next_page))
        item.titles = [title]
        item.author = 'Unknown Author'
        item.tags = []
        item.image_urls = ['https:' + url for url in img_urls]
        thread.progress = 0.05
        return item
    except ConnectionError as e:
        print(e)
        return None
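# get_full_url() is referenced above but defined elsewhere. A minimal sketch, assuming
# it only makes slide-page hrefs absolute against the wnacg host; the base URL below is
# an assumption.
def get_full_url(href):
    if href.startswith('http'):
        return href
    if href.startswith('//'):
        return 'https:' + href
    return 'https://www.wnacg.org' + href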
def crawl(self, item, thread):
    match = re.search(r'e-hentai\.org/g/\d+/\w+', self.url)
    if not match:
        print('url did not match')
        return None
    if not self.url.startswith('http'):
        self.url = 'https://' + self.url
    # Normalize the gallery URL (strip a trailing slash so ?p= paging works).
    url = self.url
    if url[-1] == '/':
        url = url[:-1]
    data = match.group().split('/')
    gid = data[-2]
    token = data[-1]
    session = requests.Session()
    session.headers.update({'User-Agent': ua.get_random_ua(),
                            'Referer': 'https://e-hentai.org/',
                            'Host': 'e-hentai.org',
                            'authority': 'e-hentai.org'})
    session.proxies.update(config.PROXY)
    try:
        r = session.get(url)
        soup = BeautifulSoup(r.text, "html.parser")
        en_title = soup.select('#gn')[0].string
        jp_title = soup.select('#gj')[0].string
        item.titles = []
        if jp_title != "":
            item.titles.append(jp_title)
        item.titles.append(en_title)
        tags_container = soup.select('#taglist table tbody tr')
        for container in tags_container:
            if container.select('td')[0].string == 'artist:':
                item.author = container.select('td')[1].select('div a')[0].string
            elif container.select('td')[0].string == 'character:':
                pass
        # Every cell of the pager except the two arrow cells is one gallery page.
        nav_container = soup.select('div.gtb table.ptt tr td')
        page_num = len(nav_container) - 2
        item.image_urls = []
        for page_index in range(page_num):
            page_r = session.get(url, params={'p': page_index})
            page_soup = BeautifulSoup(page_r.text, 'html.parser')
            thumb_images_container = page_soup.select('#gdt div[class="gdtm"]')
            total_images_count = len(thumb_images_container)
            for container in thumb_images_container:
                # Each thumbnail links to a viewer page that holds the full-size image.
                images_page_url = container.select('div a')[0].get('href')
                r_images_page = session.get(images_page_url)
                soup_images_page = BeautifulSoup(r_images_page.text, "html.parser")
                imgs = soup_images_page.select('.sni a img')
                for img in imgs:
                    if re.search(r'/h/', img['src']):
                        item.image_urls.append(img['src'])
                        # Rough per-page estimate based on the current page's thumbnail count.
                        thread.progress = 0.15 * (len(item.image_urls) / total_images_count)
                        print(img['src'])
        return item
    except ConnectionError as e:
        print(e)
        return None
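# For reference, the attributes the crawlers above populate on `item` suggest a container
# roughly like the sketch below. The class name and defaults are assumptions; only the
# attribute names are taken from the code in this section.
class Item:
    def __init__(self):
        self.id = None            # gallery id (used by the wnacg index crawler)
        self.titles = []          # preferred title first, e.g. [jp_title, en_title]
        self.author = 'Unknown Author'
        self.tags = []
        self.language = []
        self.image_urls = []      # full-size image URLs, in page order
        self.cookies = None       # session cookies captured from the first request
        self.source = None        # the gallery URL the item was crawled from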