def ParseFeedUrls(self):
    """Fetch the magazine index page and return [(section, title, url, desc), ...].

    desc is always None (articles are fetched separately).
    """
    main = 'http://bbstsg.vip.qikan.com/text/Mag.aspx?issn=ACB37AEA-8FB7-4855-B7CA-D228E972162F'
    urls = []
    opener = URLOpener(self.host, timeout=90)
    result = opener.open(main)
    if result.status_code != 200:
        self.log.warn('fetch webpage failed:%s' % main)
        return []
    if self.feed_encoding:
        try:
            content = result.content.decode(self.feed_encoding)
        except UnicodeDecodeError:
            content = AutoDecoder(False).decode(result.content, opener.realurl, result.headers)
    else:
        content = AutoDecoder(False).decode(result.content, opener.realurl, result.headers)
    soup = BeautifulSoup(content, "lxml")
    # BUG FIX: sectitle was unbound (NameError) when the first <dl> had no
    # <span>, and a missing <a> in a <dd> raised AttributeError.
    sectitle = ''
    for section in soup.find_all('dl'):
        dt = section.find('dt')
        span = dt.find('span') if dt else None
        if span:
            sectitle = string_of_tag(span).strip()
        for dd in section.find_all('dd'):
            a = dd.find('a', href=True)
            if not a:  # skip malformed entries instead of crashing
                continue
            title = string_of_tag(a).strip()
            url = a['href']
            if url.startswith('Article'):
                url = 'http://bbstsg.vip.qikan.com/text/' + url
            urls.append((sectitle, title, url, None))
    if len(urls) == 0:
        self.log.warn('len of urls is zero.')
    return urls
def getImgList(self, url):
    """Return the image URLs embedded in the packed JS of a chapter page."""
    page_decoder = AutoDecoder(isfeed=False)
    fetcher = URLOpener(url)
    resp = fetcher.open(url)
    if resp.status_code != 200 or not resp.content:
        self.log.warn(
            "fetch comic page failed: {} (status code {}, content {})".
            format(url, resp.status_code, resp.content))
        return []
    html = self.AutoDecodeContent(resp.content, page_decoder,
                                  self.feed_encoding, fetcher.realurl,
                                  resp.headers)
    dom = BeautifulSoup(html, "html.parser")
    # The image list lives in the first <script> mentioning "newImgs".
    packed_js = next(
        (s.text for s in dom.findAll("script", {"type": "text/javascript"})
         if "newImgs" in s.text), None)
    if not packed_js:
        self.log.warn("Can't find js")
        return []
    unpacked = decode_packed_codes(packed_js)
    return re.findall("'(.+?)'", unpacked)
def getChapterList(self, comic_id):
    """Return [{cid: chapter_info}, ...] ordered by chapter 'seq' for a qq comic."""
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    chapterList = []
    getChapterListUrl = 'http://m.ac.qq.com/GetData/getChapterList?id={}'.format(comic_id)
    result = opener.open(getChapterListUrl)
    if result.status_code != 200 or not result.content:
        # BUG FIX: original logged the undefined name `url` (NameError);
        # log the URL that was actually requested.
        self.log.warn('fetch comic page failed: %s' % getChapterListUrl)
        return chapterList
    content = result.content
    content = self.AutoDecodeContent(content, decoder, self.page_encoding,
                                     opener.realurl, result.headers)
    contentJson = json.loads(content)
    count = contentJson.get('length', 0)
    if count != 0:
        # Chapters are keyed by cid in the JSON; order them by 'seq'.
        for i in range(count + 1):
            for item in contentJson:
                if isinstance(contentJson[item], dict) and contentJson[item].get('seq') == i:
                    chapterList.append({item: contentJson[item]})
                    break
    else:
        self.log.warn('comic count is zero.')
    return chapterList
def getImgList(self, url):
    """Extract proxied image URLs from a chapter page; [] on any failure."""
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList
    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    # BUG FIX: original called .group() before checking for None, so a page
    # without qTcms_S_m_murl_e raised AttributeError instead of returning [].
    match = re.search(r'var qTcms_S_m_murl_e=".*";', content)
    if match is None:
        self.log.warn('var qTcms_S_m_murl_e is not exist.')
        return imgList
    # The quoted value is a base64-encoded, "$qingtiandy$"-separated list.
    list_encoded = match.group().split('\"')[1]
    lz_decoded = b64decode(list_encoded)
    images = lz_decoded.split("$qingtiandy$")
    if not images:
        self.log.warn('image list is not exist.')
        return imgList
    for img in images:
        imgb64 = b64encode(img.replace("http://www.baidu1.com/", ""))
        img_url = u'http://new.234us.com:8989/img_new.php?data={}'.format(imgb64)
        imgList.append(img_url)
    return imgList
def getImgList(self, chapterJson, comic_id):
    """Resolve the image URLs for one chapter of a qq comic.

    chapterJson is a single-entry {cid: chapter_info} dict as produced by
    getChapterList.
    """
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []
    cid = list(chapterJson.keys())[0]
    getImgListUrl = 'http://ac.qq.com/ComicView/index/id/{0}/cid/{1}'.format(comic_id, cid)
    result = opener.open(getImgListUrl)
    if result.status_code != 200 or not result.content:
        # BUG FIX: original referenced the undefined name `url` here (NameError)
        self.log.warn('fetch comic page failed: %s' % getImgListUrl)
        return imgList
    content = result.content
    cid_page = self.AutoDecodeContent(content, decoder, self.page_encoding,
                                      opener.realurl, result.headers)
    filter_result = re.findall(r"data\s*:\s*'(.+?)'", cid_page)
    if len(filter_result) != 0:
        # First char is noise; the rest is base64-encoded JSON with a
        # 'picture' list of {'url': ...} dicts.
        base64data = filter_result[0][1:]
        img_detail_json = json.loads(base64.decodestring(base64data))
        for img_url in img_detail_json.get('picture', []):
            if 'url' in img_url:
                imgList.append(img_url['url'])
            else:
                self.log.warn('no url in img_url:%s' % img_url)
    return imgList
def getImgList(self, url):
    """Decode the obfuscated manhuagui page script and build the image URL list."""
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList
    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')
    scripts = soup.findAll("script", {"type": "text/javascript"})
    # BUG FIX: raw_content was unbound (NameError) when no non-empty
    # script existed on the page.
    raw_content = None
    for script in scripts:
        if script.text != "":
            raw_content = script.text
            break
    if raw_content is None:
        self.log.warn('can not find packed script in: %s' % url)
        return imgList
    # The script body is window["eval"](...) around an LZString-compressed
    # token list; decompress it, then evaluate the result via node.
    res = re.search(r'window\["\\x65\\x76\\x61\\x6c"\](.*\))', raw_content).group(1)
    lz_encoded = re.search(
        r"'([A-Za-z0-9+/=]+)'\['\\x73\\x70\\x6c\\x69\\x63'\]\('\\x7c'\)", res).group(1)
    lz_decoded = decompressFromBase64(lz_encoded)
    res = re.sub(r"'([A-Za-z0-9+/=]+)'\['\\x73\\x70\\x6c\\x69\\x63'\]\('\\x7c'\)",
                 "'%s'.split('|')" % (lz_decoded), res)
    codes = self.get_node_online(res)
    pages_opts = json.loads(
        re.search(r'^SMH.reader\((.*)\)\.preInit\(\);$', codes).group(1))
    cid = self.getChapterId(url)
    # md5 is the CDN signature query parameter
    md5 = pages_opts["sl"]["md5"]
    images = pages_opts["images"]
    for img in images:
        img_url = u'https://i.hamreus.com{}?cid={}&md5={}'.format(img, cid, md5)
        imgList.append(img_url)
    return imgList
def getChapterList(self, url):
    """Return [(title, url), ...] for every chapter on a tohomh123 page."""
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    chapterList = []
    # The desktop site carries the chapter list; normalize mobile URLs.
    url = url.replace("https://m.tohomh123.com", "https://www.tohomh123.com")
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return chapterList
    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    listRoot = BeautifulSoup(content, 'html.parser').find(
        "ul", {"id": 'detail-list-select-2'})
    if not listRoot:
        self.log.warn('chapterList is not exist.')
        return chapterList
    anchors = listRoot.findAll('a')
    if not anchors:
        self.log.warn('chapterList href is not exist.')
        return chapterList
    for anchor in anchors:
        fullHref = "https://www.tohomh123.com" + anchor.get("href")
        chapterList.append((unicode(anchor.contents[0]), fullHref))
    return chapterList
def getChapterList(self, url):
    """Collect chapter URLs from a 733.so comic page, oldest first."""
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    chapterList = []
    if url.startswith("https://m.733.so"):
        url = url.replace('https://m.733.so', 'https://www.733.so')
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return chapterList
    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    plist = BeautifulSoup(content, 'html.parser').find('div', {"class": "cy_plist"})
    if plist is None:
        self.log.warn('cy_plist is not exist.')
        return chapterList
    anchors = plist.findAll('a')
    if anchors is None:
        self.log.warn('chapterList href is not exist.')
        return chapterList
    # The page lists newest first; emit in oldest-first order.
    for anchor in reversed(anchors):
        chapterList.append("https://www.733.so" + anchor.get("href"))
    return chapterList
def Items(self, opts=None, user=None):
    """Generator yielding one tuple per article or image.

    For HTML: (section, url, title, content, brief, thumbnail)
    For images: (mime, url, filename, content, brief, thumbnail)
    """
    urls = self.ParseFeedUrls()
    readability = self.readability if self.fulltext_by_readability else self.readability_by_soup
    prevsection = ''
    opener = URLOpener(self.host, timeout=self.timeout)
    decoder = AutoDecoder(False)
    for section, ftitle, url, desc in urls:
        if not desc:  # not a full-text RSS entry: fetch the article page
            if section != prevsection or prevsection == '':
                decoder.encoding = ''  # re-detect encoding for each section
                prevsection = section
                opener = URLOpener(self.host, timeout=self.timeout)
                if self.needs_subscription:
                    self.login(opener, decoder)
            article = self.fetcharticle(url, opener, decoder)
            if not article:
                continue
        else:
            article = self.FragToXhtml(desc, ftitle)
        # When the item is an image, `title` holds its mime type instead
        for title, imgurl, imgfn, content, brief, thumbnail in readability(
                article, url, opts, user):
            if title.startswith(r'image/'):  # image item
                yield (title, imgurl, imgfn, content, brief, thumbnail)
            else:
                if not title:
                    title = ftitle
                content = self.postprocess(content)
                yield (section, url, title, content, brief, thumbnail)
def getImgUrlList(self, url):
    """Return viewer-page URLs for every valued <option> on the page, or None."""
    imgUrlList = []
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return None
    content = self.AutoDecodeContent(result.content, decoder, self.page_encoding,
                                     opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')
    sel = soup.find('select')  # page-number row: one <option> per page
    if sel is None:
        self.log.warn('soup select is not exist.')
        return None
    ulist = sel.find_all('option')
    if not ulist:
        self.log.warn('select option is not exist.')
        return None
    # BUG FIX: the original removed value-less options from `ulist` while
    # iterating it, which silently skips the element after each removal.
    # Simply skip entries without a value instead.
    for opt in ulist:
        value = opt.get('value')
        if value is not None:
            imgUrlList.append(self.host + '/comic/' + value)
    return imgUrlList
def getChapterList(self, url):
    """List (title, url) chapter pairs from manhuaren, sorted by chapter id."""
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host)
    chapterList = []
    url = url.replace("http://www.dm5.com", "https://www.manhuaren.com")
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn(
            "fetch comic page failed: {} (status code {}, content {})".
            format(url, result.status_code, result.content))
        return chapterList
    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    soup = BeautifulSoup(content, "html.parser")
    # Chapter ids are embedded in hrefs like ".../m12345/"
    chapter_datas = [{
        "chapter_id": int(re.search("m(\d+)", link.get("href")).group(1)),
        "chapter_title": unicode(link.string),
    } for link in soup.find_all("a", {"class": "chapteritem"})]
    chapter_datas.sort(key=lambda d: d["chapter_id"])
    for chapter in chapter_datas:
        chapter_url = "http://www.manhuaren.com/m{}/".format(chapter["chapter_id"])
        chapterList.append((chapter["chapter_title"], chapter_url))
    return chapterList
def getImgList(self, url):
    """Decode the base64 image list embedded in qTcms_S_m_murl_e; [] on failure."""
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList
    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    # BUG FIX: original called .group() before the None check, raising
    # AttributeError when the variable is missing from the page.
    match = re.search(r'qTcms_S_m_murl_e="(.*)";', content)
    if match is None:
        self.log.warn(content)
        self.log.warn('var qTcms_S_m_murl_e is not exist.')
        return imgList
    # The quoted value is a base64-encoded, "$qingtiandy$"-separated list.
    list_encoded = match.group().split('\"')[1]
    lz_decoded = b64decode(list_encoded)
    images = lz_decoded.split("$qingtiandy$")
    if not images:
        self.log.warn('image list is not exist.')
        return imgList
    for img in images:
        imgList.append(img)
    return imgList
def getChapterList(self, url):
    # Returns a list of chapter URLs (oldest first) for a manhuagui comic.
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    chapterList = []
    # Prefer the mobile site, whose chapter list is simpler to parse.
    if url.startswith("https://www.manhuagui.com"):
        url = url.replace('https://www.manhuagui.com', 'https://m.manhuagui.com')
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return chapterList
    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')
    # NOTE(review): some pages appear to ship the chapter list LZString-
    # compressed inside a hidden __VIEWSTATE input; otherwise it is in the
    # plain chapter-list div — confirm against live pages.
    invisible_input = soup.find("input", {"id": '__VIEWSTATE'})
    if invisible_input:
        lz_encoded = invisible_input.get("value")
        lz_decoded = decompressFromBase64(lz_encoded)
        soup = BeautifulSoup(lz_decoded, 'html.parser')
    else:
        soup = soup.find("div", {"class": 'chapter-list', "id": 'chapterList'})
    lias = soup.findAll('a')
    # The page lists newest first; walk backwards for oldest-first order.
    for aindex in range(len(lias)):
        rindex = len(lias) - 1 - aindex
        href = "https://m.manhuagui.com" + lias[rindex].get("href")
        chapterList.append(href)
    return chapterList
def get_chapter_list_from_mobile_url(self, url):
    """Parse a dmzj mobile page into [(chapter_name, chapter_url), ...].

    Chapters are sorted by numeric id; falls back to the JSON API whenever
    the inline page data cannot be located or is empty.
    """
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(addreferer=False, timeout=60)
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn("fetch comic page failed: %s" % result.status_code)
        return []
    content = self.AutoDecodeContent(result.content, decoder,
                                     self.feed_encoding, opener.realurl,
                                     result.headers)
    if "obj_id" not in content:
        self.log.warn(u"Can't find obj_id form {}".format(url))
        return []
    comic_id = re.search('obj_id = "(\d+)"', content).group(1)
    # initIntroData(...) carries the chapter groups as inline JSON
    data_match = re.search("initIntroData\(([^;]+)\);", content)
    if not data_match:
        return self.get_chapter_list_from_api(comic_id)
    datas = json.loads(data_match.group(1))
    chapter_datas = []
    # Flatten all groups (volumes/specials) into one list
    for data in datas:
        chapter_datas += data["data"]
    if not chapter_datas:
        return self.get_chapter_list_from_api(comic_id)
    chapter_datas.sort(key=lambda d: d["id"])
    chapters = []
    for chapter in chapter_datas:
        chapter_url = "https://m.dmzj.com/view/{comic_id}/{chapter_id}.html".format(
            chapter_id=chapter["id"], comic_id=comic_id)
        chapters.append((chapter["chapter_name"], chapter_url))
    return chapters
def getImgUrl(self, url):
    """Follow one viewer page to the real image URL via redirect, or None."""
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return None
    content = self.AutoDecodeContent(result.content, decoder, self.page_encoding,
                                     opener.realurl, result.headers)
    page = BeautifulSoup(content, 'html.parser')
    imgTag = page.find('img', {'oncontextmenu': 'return false'})
    if imgTag is None:
        self.log.warn('can not find image href.')
        return None
    # The src is a relay link; request it with a Referer header and let
    # the redirect reveal the real image location.
    relayUrl = self.host + "/comic/" + imgTag.get('src')
    result = opener.open(relayUrl, headers={'Referer': url})
    if result.status_code != 200 or opener.realurl == url:
        self.log.warn('can not get real comic url for : %s' % url)
        return None
    return opener.realurl
def getChapterList(self, url):
    """Gather chapter links from every volume table on an index page."""
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    chapterList = []
    # The site redirects to https; rewrite up front.
    if url.startswith("http://"):
        url = url.replace('http://', 'https://')
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return chapterList
    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')
    for table in soup.find_all('table', {'width': '800', 'align': 'center'}):
        for link in table.find_all('a', {'target': '_blank'}):
            chapterList.append(self.urljoin(self.host, link.get('href')))
    return chapterList
def getImgList(self, url):
    """Build proxied image URLs for a 234us/733 chapter page; [] on failure."""
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList
    # cid/pid come from the /mh/<cid>/<pid>.html path segments
    urlpaths = urlparse.urlsplit(url.lower()).path.split("/")
    if (u"mh" in urlpaths) and (urlpaths.index(u"mh") + 2 < len(urlpaths)):
        # tid is a pseudo-random token derived from the current time
        tid = str(time.time()).replace(".", "1")
        if len(tid) == 12:
            tid = tid + "1"
        cid = urlpaths[urlpaths.index(u"mh") + 1]
        pid = urlpaths[urlpaths.index(u"mh") + 2].replace(".html", "")
    else:
        self.log.warn('Can not get cid and pid from URL: {}.'.format(url))
        return imgList
    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    # BUG FIX: original called .group() before the None check, raising
    # AttributeError when qTcms_S_m_murl_e is missing from the page.
    match = re.search(r'var qTcms_S_m_murl_e=".*";', content)
    if match is None:
        self.log.warn('var qTcms_S_m_murl_e is not exist.')
        return imgList
    list_encoded = match.group().split('\"')[1]
    lz_decoded = b64decode(list_encoded)
    images = lz_decoded.split("$qingtiandy$")
    if not images:
        self.log.warn('image list is not exist.')
        return imgList
    for img in images:
        if "http://www.baidu1.com/" in img:
            b64str = img.replace("http://www.baidu1.com/",
                                 "") + '|{}|{}|{}|pc'.format(tid, cid, pid)
            imgb64 = b64encode(b64str)
            img_url = u'http://img_733.234us.com/newfile.php?data={}'.format(imgb64)
        elif "http://ac.tc.qq.com/" in img:
            b64str = img + '|{}|{}|{}|pc'.format(tid, cid, pid)
            imgb64 = b64encode(b64str)
            img_url = u'http://img_733.234us.com/newfile.php?data={}'.format(imgb64)
        else:
            img_url = img
        self.log.info('Ths image herf is: %s' % img_url)
        imgList.append(img_url)
    return imgList
def GetNewComic(self):
    # Scan each subscribed comic's index page and return [(title, num, href), ...]
    # for volumes newer than the last delivered number.
    urls = []
    if not self.feeds:
        return []
    userName = self.UserName()
    decoder = AutoDecoder(isfeed=False)
    for item in self.feeds:
        title, url = item[0], item[1]
        # NOTE(review): the following statement is corrupted in this copy of the
        # source — the LastDelivered filter arguments were masked out ('******')
        # and the `if lastCount ...:` line is missing; reproduced verbatim.
        lastCount = LastDelivered.all().filter( 'username = '******'These is no log in db LastDelivered for name: %s, set to 0' % title) oldNum = 0 else: oldNum = lastCount.num
        opener = URLOpener(self.host, timeout=60)
        result = opener.open(url)
        if result.status_code != 200:
            self.log.warn(
                'fetch index page for %s failed[%s] : %s' %
                (title, URLOpener.CodeMap(result.status_code), url))
            continue
        content = result.content
        content = self.AutoDecodeContent(content, decoder, self.feed_encoding,
                                         opener.realurl, result.headers)
        soup = BeautifulSoup(content, 'lxml')
        allComicTable = soup.find_all('table', {'width': '688'})
        addedForThisComic = False
        for comicTable in allComicTable:
            comicVolumes = comicTable.find_all('a', {'target': '_blank'})
            for volume in comicVolumes:
                # Volume links look like "<name> <number> ..."; take the number
                texts = volume.text.split(' ')
                if len(texts) > 2 and texts[1].isdigit() and volume.get('href'):
                    num = int(texts[1])
                    if num > oldNum:
                        oldNum = num
                        href = self.urljoin(self.host, volume.get('href'))
                        urls.append((title, num, href))
                        addedForThisComic = True
                        break  # deliver only one volume at a time (a volume already holds many images)
            if addedForThisComic:
                break
    return urls
def getChapterList(self, url):
    """Return [(chapter_title, chapter_url), ...] for a qq comic,
    skipping VIP-locked chapters."""
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    chapterList = []
    # Extract the numeric comic id from the /id/<comic_id> path segment.
    urlpaths = urlparse.urlsplit(url.lower()).path.split("/")
    if ((u"id" in urlpaths) and (urlpaths.index(u"id") + 1 < len(urlpaths))):
        comic_id = urlpaths[urlpaths.index(u"id") + 1]
        # NOTE(review): nesting of the next lines inferred from the collapsed
        # source — comic_id is only bound inside this branch.
        if ((not comic_id.isdigit()) or (comic_id == "")):
            self.log.warn('can not get comic id: %s' % url)
            return chapterList
        # Mobile chapter-list endpoint for this comic
        url = 'https://m.ac.qq.com/comic/chapterList/id/{}'.format(comic_id)
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return chapterList
    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')
    # <section class="chapter-list-box list-expanded" data-vip-free="1">
    section = soup.find('section', {'class': 'chapter-list-box list-expanded'})
    if (section is None):
        self.log.warn('chapter-list-box is not exist.')
        return chapterList
    # <ul class="chapter-list normal">
    # <ul class="chapter-list reverse">
    reverse_list = section.find('ul', {'class': 'chapter-list reverse'})
    if (reverse_list is None):
        self.log.warn('chapter-list is not exist.')
        return chapterList
    for item in reverse_list.find_all('a'):
        # <a class="chapter-link lock" data-cid="447" data-seq="360" href="/chapter/index/id/531490/cid/447">360</a>
        # https://m.ac.qq.com/chapter/index/id/511915/cid/1
        href = 'https://m.ac.qq.com' + item.get('href')
        # "lock" in the class list marks a VIP-only chapter
        isVip = "lock" in item.get('class')
        if isVip == True:
            self.log.info(
                "Chapter {} is Vip, waiting for free.".format(href))
            continue
        chapterList.append((item.get_text(), href))
    return chapterList
def ParseFeedUrls(self):
    """Return a list like [(section, title, url, desc), ...]."""
    urls = []
    processed = 0
    for feed in self.feeds:
        feedtitle, url = feed[0], feed[1]
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch webpage failed(%d):%s.' %
                          (result.status_code, url))
            continue
        if self.feed_encoding:
            try:
                content = result.content.decode(self.feed_encoding)
            except UnicodeDecodeError:
                content = AutoDecoder(False).decode(
                    result.content, opener.realurl, result.headers)
        else:
            content = AutoDecoder(False).decode(result.content, opener.realurl,
                                                result.headers)
        soup = BeautifulSoup(content, 'lxml')
        for article in soup.findAll('div', {"class": "text"}):
            if article.find("h2") and article.find("a"):
                title = article.a.contents[0].strip()
                if not title:
                    continue
                urls.append((feedtitle, title,
                             self.url_prefix + article.a['href'], None))
        # Cap the run at five successfully fetched feeds.
        if processed > 3:
            break
        processed += 1
    return urls
def ParseFeedUrls(self): urls = [] #用于返回 newComicUrls = self.GetNewComic() #返回[(title, num, url),...] if not newComicUrls: return [] decoder = AutoDecoder(isfeed=False) for title, num, url in newComicUrls: opener = URLOpener(self.host, timeout=60) result = opener.open(url) if result.status_code != 200 or not result.content: self.log.warn('fetch comic page failed: %s' % url) continue content = result.content content = self.AutoDecodeContent(content, decoder, self.page_encoding, opener.realurl, result.headers) bodySoup = BeautifulSoup(content, 'lxml') sel = bodySoup.find('select') #页码行,要提取所有的页面 ul = sel.find_all('option') if sel else None if not ul: continue for comicPage in ul: href = comicPage.get('value') if href: pageHref = self.urljoin(url, href) result = opener.open(pageHref) if result.status_code != 200: self.log.warn('fetch comic page failed: %s' % pageHref) continue content = result.content content = self.AutoDecodeContent(content, decoder, self.page_encoding, opener.realurl, result.headers) soup = BeautifulSoup(content, 'lxml') comicImgTag = soup.find('img', {'oncontextmenu': 'return false'}) comicSrc = comicImgTag.get('src') if comicImgTag else None if comicSrc: urls.append((title, comicPage.text, comicSrc, None)) self.UpdateLastDelivered(title, num) return urls
def getImgList(self, url):
    # Extract image URLs from a qq comic chapter page. The page embeds a
    # base64 blob (data: '...') whose decoded form contains a JSON
    # "picture" array; the blob's alignment varies, so several base64
    # fragments of the string `"picture"` are probed to find the cut point.
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList
    content = result.content
    cid_page = self.AutoDecodeContent(content, decoder, self.page_encoding,
                                      opener.realurl, result.headers)
    filter_result = re.findall(r"data\s*:\s*'(.+?)'", cid_page)
    # "picture": [{},...{}]}
    if len(filter_result) != 0:
        # "picture" > InBpY3R1cmUi
        # picture": > cGljdHVyZSI6
        # icture":[ > aWN0dXJlIjpb
        if "InBpY3R1cmUi" in filter_result[0]:
            base64data = filter_result[0].split("InBpY3R1cmUi")[1]
            self.log.warn('found flag string: %s' % "InBpY3R1cmUi")
        elif "cGljdHVyZSI6" in filter_result[0]:
            base64data = filter_result[0].split("cGljdHVyZSI6")[1]
            self.log.warn('found flag string: %s' % "cGljdHVyZSI6")
        elif "aWN0dXJlIjpb" in filter_result[0]:
            base64data = filter_result[0].split("aWN0dXJl")[1]
            self.log.warn('found flag string: %s' % "aWN0dXJlIjpb")
        else:
            self.log.warn('can not found flag string in data: %s' %
                          filter_result[0])
            return imgList
        decodeData = base64.decodestring(base64data)
        # Trim to the JSON array between the first '[' and ']'
        startIndex = decodeData.find('[')
        endIndex = decodeData.find(']')
        if startIndex > -1 and endIndex > -1:
            img_detail_json = json.loads(decodeData[startIndex:endIndex + 1])
            for img_url in img_detail_json:
                if ('url' in img_url):
                    imgList.append(img_url['url'])
                else:
                    self.log.warn('no url in img_url:%s' % img_url)
        else:
            self.log.warn('can not found [] in decodeData:%s' % decodeData)
    else:
        self.log.warn('can not fount filter_result with data: .')
    return imgList
def getImgUrl(self, url):
    """Fetch a viewer page (relative path) and return its image src, or None."""
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    pageUrl = self.host + "/comic/" + url
    result = opener.open(pageUrl)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % pageUrl)
        return None
    content = self.AutoDecodeContent(result.content, decoder, self.page_encoding,
                                     opener.realurl, result.headers)
    # The actual comic image disables the context menu
    imgTag = BeautifulSoup(content, 'html.parser').find(
        'img', {'oncontextmenu': 'return false'})
    if imgTag is None:
        return None
    return imgTag.get('src')
def getImgList(self, url):
    """Build image URLs from the chapterPath/pageImage/chapterImages JS vars."""
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList
    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    # var chapterPath = "images/comic/31/61188/";
    pathMatch = re.search(r'(var chapterPath = ")(.*)(";var chapterPrice)', content)
    if pathMatch is None:
        self.log.warn('var chapterPath is not exist.')
        return imgList
    chapterPath = pathMatch.group(2)
    # var pageImage = "https://res.gufengmh.com/gufeng/images/";
    prefixMatch = re.search(r'(var pageImage = ")(.*)(gufeng/images/)', content)
    if prefixMatch is None:
        self.log.warn(
            '"https://res.gufengmh.com/gufeng/images/ is not exist.')
        return imgList
    imgPrefix = prefixMatch.group(2) + "/"
    # var chapterImages = ["",""];
    imagesMatch = re.search(r'(var chapterImages = \[)(.*)(\];)', content)
    if imagesMatch is None:
        self.log.warn('var chapterImages is not exist.')
        return imgList
    for name in imagesMatch.group(2).split(','):
        imgList.append(imgPrefix + chapterPath + name.replace("\"", ""))
    return imgList
def getChapterList(self, url):
    """Return [(chapter_title, chapter_url), ...] for a dmzj comic.

    Mobile URLs are delegated to the mobile parser; otherwise the desktop
    page is scraped, falling back to the JSON API when no chapters are found.
    """
    if url.startswith("https://m.dmzj.com"):
        return self.get_chapter_list_from_mobile_url(url)
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(addreferer=False, timeout=60)
    chapterList = []
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn("fetch comic page failed: %s" % result.status_code)
        return chapterList
    content = self.AutoDecodeContent(result.content, decoder,
                                     self.feed_encoding, opener.realurl,
                                     result.headers)
    comic_id = re.search('g_comic_id = "([^"]+)', content).group(1)
    # try get chapters from html
    soup = BeautifulSoup(content, "html.parser")
    chapter_datas = []
    # Regular and "other" (specials) chapter containers
    for comic_classname in [
            "cartoon_online_border", "cartoon_online_border_other"
    ]:
        divs = soup.find_all("div", attrs={"class": comic_classname})
        if not divs:
            continue
        for div in divs:
            for link in div.find_all("a"):
                chapter_datas.append({
                    "chapter_id":
                    int(re.search("\/(\d+)\.shtml", link.get("href")).group(1)),
                    "chapter_title":
                    unicode(link.string),
                })
    if chapter_datas:
        chapter_datas.sort(key=lambda d: d["chapter_id"])
        for chapter in chapter_datas:
            chapter_url = "https://m.dmzj.com/view/{comic_id}/{chapter_id}.html".format(
                chapter_id=chapter["chapter_id"], comic_id=comic_id)
            chapterList.append((chapter["chapter_title"], chapter_url))
        return chapterList
    else:
        return self.get_chapter_list_from_api(comic_id)
def getImgList(self, url):
    """Return the image URL for every page of a cartoonmad chapter.

    First tries to infer sequentially numbered URLs from the first page's
    image and validates the guess against the last page; otherwise falls
    back to resolving every page individually via getImgUrl.
    """
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList
    content = self.AutoDecodeContent(result.content, decoder, self.page_encoding,
                                     opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')
    sel = soup.find('select')  # page-number row: one <option> per page
    if sel is None:
        self.log.warn('soup select is not exist.')
        return imgList
    options = sel.find_all('option')
    if not options:
        self.log.warn('select option is not exist.')
        return imgList
    # BUG FIX: the original removed value-less options from the list while
    # iterating it (skips elements); build a filtered list instead.
    ulist = [opt for opt in options if opt.get('value') is not None]
    if not ulist:
        self.log.warn('select option is not exist.')
        return imgList
    listLen = len(ulist)
    firstPageTag = soup.find('img', {'oncontextmenu': 'return false'})
    firstPage = firstPageTag.get('src') if firstPageTag else None
    if firstPage is not None:
        firstPage = "https://www.cartoonmad.com/{}".format(firstPage)
        # Guess page N's URL from the first page's numbering pattern.
        base, length, imgType = self.getImgStr(firstPage)
        for index in range(listLen):
            imgList.append("{}{}.{}".format(base,
                                            str(index + 1).zfill(length),
                                            imgType))
        if (imgList[0] == firstPage and imgList[listLen - 1] ==
                self.getImgUrl(ulist[listLen - 1].get('value'))):
            return imgList
    # BUG FIX: original dereferenced unbound locals (base/length/type) when
    # the first-page image was missing; now always falls back cleanly to
    # resolving each page one by one.
    imgList = []
    for opt in ulist:
        imgList.append("https://www.cartoonmad.com/{}".format(
            self.getImgUrl(opt.get('value'))))
    return imgList
def ParseFeedUrls(self):
    # Return [(title, chapter_title, url, ''), ...] for chapters newer than
    # the last delivered one, up to self.limit per run.
    urls = []
    userName = self.UserName()
    decoder = AutoDecoder(isfeed=False)
    # NOTE(review): the following statement is corrupted in this copy of the
    # source — the LastDelivered filter arguments were masked out ('******')
    # and the `if lastCount ...:` branch is missing; reproduced verbatim.
    lastCount = LastDelivered.all().filter('username = '******'' else: oldNum = lastCount.num oldChapterTitle = lastCount.record
    opener = URLOpener(self.host, timeout=60)
    result = opener.open(self.feeds)
    if result.status_code != 200:
        self.log.warn('fetch index page for %s failed[%s] : %s' %
                      (self.title, URLOpener.CodeMap(
                          result.status_code), self.feeds))
        return []
    # Fetch the chapter list from the index page
    content = self.AutoDecodeContent(result.content, decoder,
                                     self.feed_encoding, opener.realurl,
                                     result.headers)
    soup = BeautifulSoup(content, 'lxml')
    chapterList = self.GetChapterList(soup)
    chapterNum = 0
    for chapter in chapterList:
        if chapterNum >= self.limit:
            break
        url = chapter.get('href')
        num = self.GetChapterNum(url)
        if num > oldNum:
            oldNum = num
            oldChapterTitle = chapter.text
            chapterNum += 1
            urls.append((self.title, oldChapterTitle,
                         self.urljoin(self.host, url), ''))
    self.UpdateLastDelivered(self.title, oldNum, oldChapterTitle)
    return urls
def getImgList(self, url):
    """Decode manhuagui's obfuscated page script into an image URL list."""
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList
    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')
    scripts = soup.findAll("script", {"type": "text/javascript"})
    # BUG FIX: raw_content was never initialized, so the `is None` check
    # below raised NameError when no matching script was found.
    raw_content = None
    for script in scripts:
        # chained comparison: contains the marker AND text is non-empty
        if "window[\"\\x65\\x76\\x61\\x6c\"]" in script.text != "":
            raw_content = script.text
            break
    if raw_content is None:
        self.log.warn('raw_content href is not exist.')
        return imgList
    # Unwrap window["eval"](...), decompress the LZString token list, then
    # evaluate the reconstructed script via node to get the reader options.
    res = re.search(r'window\["\\x65\\x76\\x61\\x6c"\](.*\))', raw_content).group(1)
    lz_encoded = re.search(
        r"'([A-Za-z0-9+/=]+)'\['\\x73\\x70\\x6c\\x69\\x63'\]\('\\x7c'\)", res).group(1)
    lz_decoded = decompressFromBase64(lz_encoded)
    res = re.sub(r"'([A-Za-z0-9+/=]+)'\['\\x73\\x70\\x6c\\x69\\x63'\]\('\\x7c'\)",
                 "'%s'.split('|')" % (lz_decoded), res)
    codes = self.get_node_online(res)
    pages_opts = json.loads(
        re.search(r'^SMH.reader\((.*)\)\.preInit\(\);$', codes).group(1))
    # cid = self.getChapterId(url)
    # e/m are the CDN's expiry and signature query parameters
    m = pages_opts["sl"]["m"]
    e = pages_opts["sl"]["e"]
    images = pages_opts["images"]
    if images is None:
        self.log.warn('image list is not exist.')
        return imgList
    for img in images:
        # e.g. https://i.hamreus.com/ps3/p/.../1_7684.jpg.webp?e=...&m=...
        img_url = u'https://i.hamreus.com{}?e={}&m={}'.format(img, e, m)
        imgList.append(img_url)
    return imgList
def getImgList(self, url):
    """Unpack the page's base64-packed photosr script and return image URLs."""
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList
    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    try:
        # function base64decode(str){*};
        func = re.search(r'function\ base64decode\(str\){.*};', content).group()
        func = func.split('base64decode')[1].replace('};', '}')
        # packed="*";
        packed = re.search(r'packed=".*";', content).group()
        packed = packed.split('\"')[1]
    except AttributeError:
        # BUG FIX: narrowed from a bare `except:` — only a failed re.search
        # (None.group() -> AttributeError) is expected here, and a bare
        # except would also swallow KeyboardInterrupt/SystemExit.
        self.log.warn('var photosr is not exist.')
        return imgList
    # eval(function(str){*}("*").slice(4))
    lz_input = "eval(function{}(\"{}\").slice(4))".format(func, packed)
    lz_nodejs = self.get_node_online(lz_input)
    if lz_nodejs is None:
        self.log.warn('image list is not exist.')
        return imgList
    # photosr[1]="images/2019/11/08/09/19904f5d64.jpg/0";...photosr[98]="images/2019/11/08/09/22abc96bd2.jpg/0";
    images = lz_nodejs.split("\"")
    # http://res.img.220012.net/2017/08/22/13/343135d67f.jpg
    for img in images:
        if ".jpg" in img:
            imgList.append(self.urljoin("http://res.img.220012.net", img))
    return imgList
def getImgList(self, url):
    """Evaluate the packed photosr script via node and return image URLs."""
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList
    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    # (removed an unused BeautifulSoup parse of `content`)
    try:
        func = re.search(r'function\ base64decode\(str\){.*};', content).group()
        packed = re.search(r'packed=".*";', content).group()
    except AttributeError:
        # BUG FIX: narrowed from a bare `except:` — only a failed re.search
        # (None.group() -> AttributeError) is expected here.
        self.log.warn('var photosr is not exist in {}.'.format(url))
        return imgList
    # eval(function(str){*}("*").slice(4))
    lz_input = "{}var photosr = new Array();{}console.log(eval(base64decode(packed).slice(4)));".format(
        func, packed)
    lz_nodejs = self.get_node_online(lz_input)
    if lz_nodejs is None:
        self.log.warn('image list is not exist.')
        return imgList
    images = lz_nodejs.split("\"")
    self.log.info(images)
    for img in images:
        # photosr[1]="images/2020/05/03/17/516bbfddb4.jpg/0";...photosr[98]="images/2019/11/08/09/22abc96bd2.jpg/0";
        # http://res.img.fffimage.com/images/2020/05/03/17/516bbfddb4.jpg/0
        # photosr[1]="images/2020/04/21/09/3706a024c8.png/0";...photosr[12]="images/2020/04/21/09/3732355905.png/0";
        # http://res.img.fffimage.com/images/2020/04/21/09/3706a024c8.png/0
        if ".jpg" in img or ".png" in img:
            imgList.append(self.urljoin("http://res.img.fffimage.com/", img))
    return imgList