def fetcharticle(self, url, decoder):
    opener = URLOpener(self.host, timeout=self.timeout)
    result = opener.open(url)
    status_code, content = result.status_code, result.content
    if status_code != 200 or not content:
        self.log.warn('fetch article failed(%d):%s.' % (status_code, url))
        return None

    if self.page_encoding:
        content = content.decode(self.page_encoding)
    else:
        content = decoder.decode(content, url)

    m = re.search(r'<iframe.*?src="(.*?)".*?>', content)
    if m:
        newurl = m.group(1)
        result = opener.open(newurl)
        status_code, content = result.status_code, result.content
        if status_code != 200 or not content:
            self.log.warn('fetch article failed(%d):%s.' % (status_code, newurl))
            return None

        if self.page_encoding:
            content = content.decode(self.page_encoding)
        else:
            content = decoder.decode(content, newurl)

    return content

def getImgUrl(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return None

    content = self.AutoDecodeContent(result.content, decoder, self.page_encoding,
                                     opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')

    comicImgTag = soup.find('img', {'oncontextmenu': 'return false'})
    if comicImgTag is None:
        self.log.warn('can not find image href.')
        return None

    imgUrl = self.host + "/comic/" + comicImgTag.get('src')
    headers = {'Referer': url}
    result = opener.open(imgUrl, headers=headers)
    if result.status_code != 200 or opener.realurl == url:
        self.log.warn('can not get real comic url for : %s' % url)
        return None

    return opener.realurl

def get_chapter_list_from_api(self, comic_id):
    opener = URLOpener(addreferer=False, timeout=60)
    json_result = opener.open(
        "http://v3api.dmzj.com/comic/{comic_id}.json".format(comic_id=comic_id))

    if json_result.status_code != 200 or not json_result.content:
        self.log.info("fetch v3 chapter list failed: %s, try v2" % json_result.status_code)
        json_result = opener.open(
            "http://v2.api.dmzj.com/comic/{comic_id}.json?channel=Android&version=2.6.004"
            .format(comic_id=comic_id))
        if json_result.status_code != 200 or not json_result.content:
            self.log.warn("fetch v2 chapter list failed: %s" % json_result.status_code)
            return []

    data = json.loads(json_result.content)
    chapter_datas = []
    for chapters_data in data["chapters"]:
        chapter_datas += chapters_data["data"]
    chapter_datas.sort(key=lambda d: d["chapter_id"])

    chapters = []
    for chapter in chapter_datas:
        chapter_url = "https://m.dmzj.com/view/{comic_id}/{chapter_id}.html".format(
            chapter_id=chapter["chapter_id"], comic_id=comic_id)
        chapters.append((chapter["chapter_title"], chapter_url))
    return chapters

def ParseFeedUrls(self):
    urls = []  # list of article tuples to return
    newComicUrls = self.GetNewComic()  # returns [(title, num, url), ...]
    if not newComicUrls:
        return []

    decoder = AutoDecoder(isfeed=False)
    for title, num, url in newComicUrls:
        opener = URLOpener(self.host, timeout=60)
        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            continue

        content = result.content
        content = self.AutoDecodeContent(content, decoder, self.page_encoding,
                                         opener.realurl, result.headers)
        bodySoup = BeautifulSoup(content, 'lxml')
        sel = bodySoup.find('select')  # page-number row, extract all pages from it
        ul = sel.find_all('option') if sel else None
        if not ul:
            continue

        for comicPage in ul:
            href = comicPage.get('value')
            if href:
                pageHref = self.urljoin(url, href)
                result = opener.open(pageHref)
                if result.status_code != 200:
                    self.log.warn('fetch comic page failed: %s' % pageHref)
                    continue

                content = result.content
                content = self.AutoDecodeContent(content, decoder, self.page_encoding,
                                                 opener.realurl, result.headers)
                soup = BeautifulSoup(content, 'lxml')
                comicImgTag = soup.find('img', {'oncontextmenu': 'return false'})
                comicSrc = comicImgTag.get('src') if comicImgTag else None
                if comicSrc:
                    urls.append((title, comicPage.text, comicSrc, None))

        self.UpdateLastDelivered(title, num)

    return urls

def getChapterList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    chapterList = []

    if url.startswith("https://m.733.so"):
        url = url.replace('https://m.733.so', 'https://www.733.so')

    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return chapterList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')
    soup = soup.find('div', {"class": "cy_plist"})
    if soup is None:
        self.log.warn('cy_plist does not exist.')
        return chapterList

    lias = soup.findAll('a')
    if lias is None:
        self.log.warn('chapterList href does not exist.')
        return chapterList

    for aindex in range(len(lias)):
        rindex = len(lias) - 1 - aindex
        href = "https://www.733.so" + lias[rindex].get("href")
        chapterList.append(href)

    return chapterList

def getImgList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []

    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    match = re.search(r'var qTcms_S_m_murl_e=".*";', content)
    if match is None:
        self.log.warn('var qTcms_S_m_murl_e does not exist.')
        return imgList

    list_encoded = match.group().split('\"')[1]
    lz_decoded = b64decode(list_encoded)
    images = lz_decoded.split("$qingtiandy$")
    if not images:
        self.log.warn('image list does not exist.')
        return imgList

    for img in images:
        imgb64 = b64encode(img.replace("http://www.baidu1.com/", ""))
        img_url = u'http://new.234us.com:8989/img_new.php?data={}'.format(imgb64)
        imgList.append(img_url)

    return imgList

def getImgList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(url)
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn(
            "fetch comic page failed: {} (status code {}, content {})".format(
                url, result.status_code, result.content))
        return []

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    soup = BeautifulSoup(content, "html.parser")
    scripts = soup.findAll("script", {"type": "text/javascript"})
    packed_js = None
    for script in scripts:
        if "newImgs" in script.text:
            packed_js = script.text
            break
    if not packed_js:
        self.log.warn("Can't find js")
        return []

    codes = decode_packed_codes(packed_js)
    return re.findall("'(.+?)'", codes)

def getChapterList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host)
    chapterList = []

    url = url.replace("http://www.dm5.com", "https://www.manhuaren.com")
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn(
            "fetch comic page failed: {} (status code {}, content {})".format(
                url, result.status_code, result.content))
        return chapterList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    soup = BeautifulSoup(content, "html.parser")

    chapter_datas = []
    for link in soup.find_all("a", {"class": "chapteritem"}):
        chapter_datas.append({
            "chapter_id": int(re.search("m(\d+)", link.get("href")).group(1)),
            "chapter_title": unicode(link.string),
        })
    chapter_datas.sort(key=lambda d: d["chapter_id"])

    for chapter in chapter_datas:
        chapter_url = "http://www.manhuaren.com/m{}/".format(chapter["chapter_id"])
        chapterList.append((chapter["chapter_title"], chapter_url))

    return chapterList

def ParseFeedUrls(self):
    urls = []  # empty list to hold the article tuples
    # loop over the topic pages defined in feeds
    for feed in self.feeds:
        # unpack the topic name and link from the tuple
        topic, url = feed[0], feed[1]
        # request the topic link and fetch its content
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        # if the request succeeded and the page is not empty
        if result.status_code == 200 and result.content:
            # turn the page content into a BeautifulSoup object
            soup = BeautifulSoup(result.content, 'lxml')
            # find all article entries in the list on the current page
            items = soup.find('div', class_='grid').find_all(name='div', class_='content')
            # process each article entry
            for item in items:
                title = item.span.string     # article title
                link = item.a.get('href')    # article link
                link = BaseFeedBook.urljoin(url, link)  # build the absolute article link
                if self.OutTimeRange(item):
                    continue
                urls.append((topic, title, link, None))  # add the article tuple to the list
        # otherwise report the failure to the log
        else:
            self.log.warn('Fetch article failed(%s):%s' %
                          (URLOpener.CodeMap(result.status_code), url))
    # return all extracted articles
    return urls

def getChapterList(self, comic_id):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    chapterList = []

    getChapterListUrl = 'http://m.ac.qq.com/GetData/getChapterList?id={}'.format(comic_id)
    result = opener.open(getChapterListUrl)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % getChapterListUrl)
        return chapterList

    content = result.content
    content = self.AutoDecodeContent(content, decoder, self.page_encoding,
                                     opener.realurl, result.headers)
    contentJson = json.loads(content)
    count = contentJson.get('length', 0)
    if count != 0:
        for i in range(count + 1):
            for item in contentJson:
                if isinstance(contentJson[item], dict) and contentJson[item].get('seq') == i:
                    chapterList.append({item: contentJson[item]})
                    break
    else:
        self.log.warn('comic count is zero.')

    return chapterList

def getImgList(self, chapterJson, comic_id):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []

    cid = list(chapterJson.keys())[0]
    getImgListUrl = 'http://ac.qq.com/ComicView/index/id/{0}/cid/{1}'.format(comic_id, cid)
    result = opener.open(getImgListUrl)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % getImgListUrl)
        return imgList

    content = result.content
    cid_page = self.AutoDecodeContent(content, decoder, self.page_encoding,
                                      opener.realurl, result.headers)
    filter_result = re.findall(r"data\s*:\s*'(.+?)'", cid_page)
    if len(filter_result) != 0:
        base64data = filter_result[0][1:]
        img_detail_json = json.loads(base64.decodestring(base64data))
        for img_url in img_detail_json.get('picture', []):
            if 'url' in img_url:
                imgList.append(img_url['url'])
            else:
                self.log.warn('no url in img_url:%s' % img_url)

    return imgList

def getChapterList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    chapterList = []

    if url.startswith("http://"):
        url = url.replace('http://', 'https://')

    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return chapterList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')
    allComicTable = soup.find_all('table', {'width': '800', 'align': 'center'})
    for comicTable in allComicTable:
        comicVolumes = comicTable.find_all('a', {'target': '_blank'})
        for volume in comicVolumes:
            href = self.urljoin(self.host, volume.get('href'))
            chapterList.append(href)

    return chapterList

def ParseFeedUrls(self): """ return list like [(section,title,url,desc),..] """ urls = [] for feed in self.feeds: section, url = feed[0], feed[1] isfulltext = feed[2] if len(feed) > 2 else False timeout = self.timeout+10 if isfulltext else self.timeout opener = URLOpener(self.host, timeout=timeout) result = opener.open(url) if result.status_code == 200 and result.content: if self.feed_encoding: feed = feedparser.parse(result.content.decode(self.feed_encoding)) else: feed = feedparser.parse(AutoDecoder().decode(result.content)) urladded = set() # 防止部分RSS产生重复文章 for e in feed['entries'][:self.max_articles_per_feed]: #支持HTTPS urlfeed = e.link.replace('http://','https://') if url.startswith('https://') else e.link if urlfeed not in urladded: desc = None if isfulltext: if hasattr(e, 'content') and e.content[0].value: desc = e.content[0].value elif hasattr(e, 'summary'): desc = e.summary else: self.log.warn('feed item invalid,link to webpage for article.(%s)'%e.title) urls.append((section, e.title, urlfeed, desc)) urladded.add(urlfeed) else: self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url)) return urls
def ParseFeedUrls(self): """ return list like [(section,title,url,desc),..] """ urls = [] url = r'http://cctv.cntv.cn/lm/jiaodianfangtan/index.shtml' opener = URLOpener(self.host, timeout=self.timeout) result = opener.open(url) if result.status_code != 200 or not result.content: self.log.warn('fetch webpage failed(%d):%s.' % (result.status_code, url)) return [] if self.feed_encoding: try: content = result.content.decode(self.feed_encoding) except UnicodeDecodeError: content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers) else: content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers) soup = BeautifulSoup(content, 'lxml') file_name_search=re.compile(r'\d{4}/\d{2}/\d{2}').search tnow = datetime.datetime.utcnow()+datetime.timedelta(hours=8) for li in soup.find_all('div', attrs={'class':'text'}): a=li.find('a') href = a['href'] try: pubdate = datetime.datetime.strptime(file_name_search(href).group(0), '%Y/%m/%d') except Exception as e: continue delta = tnow - pubdate if self.oldest_article > 0 and delta.days > self.oldest_article: continue urls.append((u'焦点访谈',a.string,href,None)) return urls
def get_chapter_list_from_mobile_url(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(addreferer=False, timeout=60)
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn("fetch comic page failed: %s" % result.status_code)
        return []

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    if "obj_id" not in content:
        self.log.warn(u"Can't find obj_id from {}".format(url))
        return []

    comic_id = re.search('obj_id = "(\d+)"', content).group(1)
    data_match = re.search("initIntroData\(([^;]+)\);", content)
    if not data_match:
        return self.get_chapter_list_from_api(comic_id)

    datas = json.loads(data_match.group(1))
    chapter_datas = []
    for data in datas:
        chapter_datas += data["data"]
    if not chapter_datas:
        return self.get_chapter_list_from_api(comic_id)

    chapter_datas.sort(key=lambda d: d["id"])
    chapters = []
    for chapter in chapter_datas:
        chapter_url = "https://m.dmzj.com/view/{comic_id}/{chapter_id}.html".format(
            chapter_id=chapter["id"], comic_id=comic_id)
        chapters.append((chapter["chapter_name"], chapter_url))
    return chapters

def POST(self):
    user = self.getcurrentuser(forAjax=True)
    web.header('Content-Type', 'application/json')
    webInput = web.input()
    category = webInput.get('category', '')
    title = webInput.get('title')
    feedUrl = webInput.get("url")
    isfulltext = bool(webInput.get('isfulltext', '').lower() == 'true')
    creator = webInput.get('creator', '')

    if not title or not feedUrl:
        return json.dumps({'status': _("Title or Url is empty!")})

    opener = URLOpener()
    srvUrl = urlparse.urljoin('http://kindleear.appspot.com/',
                              SharedLibrarykindleearAppspotCom.__url__)
    data = {'category': category, 'title': title, 'url': feedUrl, 'creator': creator,
            'isfulltext': 'true' if isfulltext else 'false', 'key': 'kindleear.lucky!'}
    result = opener.open(srvUrl, data)
    if result.status_code == 200 and result.content:
        return result.content
    else:
        return json.dumps({'status': _('Cannot submit data to kindleear.appspot.com, status: %s' %
                                       URLOpener.CodeMap(result.status_code))})

def ParseFeedUrls(self): """ return list like [(section,title,url,desc),..] """ urls = [] urladded = set() url = self.url4forwarder(self.feeds[0][1]) opener = URLOpener(self.host, timeout=self.timeout) result = opener.open(url) if result.status_code == 200 and result.content: feed = json.loads(result.content.decode(self.feed_encoding)) for partition,section in self.partitions: for item in feed[partition]: urlfeed = item['share_url'] if urlfeed in urladded: self.log.info('duplicated, skipped %s' % urlfeed) continue urls.append((section, item['title'], self.url4forwarder(urlfeed), None)) urladded.add(urlfeed) else: self.log.warn('fetch rss failed(%d):%s'%(result.status_code, url)) return urls #def fetcharticle(self, url, opener, decoder): # result = opener.open(self.url4forwarder(url)) # status_code, content = result.status_code, result.content # if status_code != 200 or not content: # self.log.warn('fetch article failed(%d):%s.' % (status_code,url)) # return None # # if self.page_encoding: # return content.decode(self.page_encoding) # else: # return decoder.decode(content,url,result.headers)
def ParseFeedUrls(self): """ return list like [(section,title,url),..] """ urls = [] for feed in self.feeds: section, url = feed[0], feed[1] isfulltext = feed[2] if len(feed) > 2 else False timeout = CONNECTION_TIMEOUT+15 if isfulltext else CONNECTION_TIMEOUT opener = URLOpener(self.host, timeout=timeout) result = opener.open(url) if result.status_code == 200 and result.content: if self.feed_encoding: feed = feedparser.parse(result.content.decode(self.feed_encoding)) else: feed = feedparser.parse(AutoDecoder().decode(result.content)) urladded = [] # 防止部分RSS产生重复文章 for e in feed['entries'][:self.max_articles_per_feed]: url = e.link if url not in urladded: if isfulltext: desc = e.content[0].value if hasattr(e, 'content') and e.content[0].value else e.summary urls.append((section, e.title, url, desc if desc else u'Has no summary, is it fulltext feed?')) else: urls.append((section, e.title, url, None)) urladded.append(url) else: self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url)) return urls
def getChapterList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    chapterList = []

    if url.startswith("https://m.733.so"):
        url = url.replace('https://m.733.so', 'https://www.733.so')

    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return chapterList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')
    soup = soup.find('div', {"class": "cy_plist"})
    if soup is None:
        self.log.warn('cy_plist does not exist.')
        return chapterList

    lias = soup.findAll('a')
    if lias is None:
        self.log.warn('chapterList href does not exist.')
        return chapterList

    for aindex in range(len(lias)):
        rindex = len(lias) - 1 - aindex
        href = "https://www.733.so" + lias[rindex].get("href")
        chapterList.append((lias[rindex].get_text(), href))

    return chapterList

def SaveToInstapaper(self, user, action, orgUrl):
    web.header('Content-type', "text/html; charset=utf-8")
    T_INFO = u"""<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
    <title>%s</title></head><body><p style="text-align:center;font-size:1.5em;">%s</p></body></html>"""
    if not user.instapaper_username or not user.instapaper_password:
        info = T_INFO % ('No authorize info',
                         'Instapaper username and password have to be provided first!<br/>Please fill them in your KindleEar application.')
        return info.encode('utf-8')

    title = web.input().get('t', '')
    name = web.input().get("n", '')
    if user.instapaper_username != name:
        info = T_INFO % ('Action rejected',
                         'Username does not match!<br/>KindleEar refuses to execute your command.')
        return info.encode('utf-8')

    opener = URLOpener()
    password = ke_decrypt(user.instapaper_password, user.secret_key or '')
    apiParameters = {'username': user.instapaper_username, 'password': password,
                     'title': title.encode('utf-8'), 'selection': 'KindleEar', 'url': orgUrl}
    ret = opener.open(INSTAPAPER_API_ADD_URL, data=apiParameters)
    if ret.status_code in (200, 201):
        info = _("'%s'<br/><br/>Saved to your Instapaper account.") % title
        info += '<br/><p style="text-align:right;color:red;">by KindleEar </p>'
        info = T_INFO % ('Saved to Instapaper', info)
    elif ret.status_code == 403:
        info = _("Failed save to Instapaper<br/>'%s'<br/><br/>Reason : Invalid username or password.") % title
        info += '<br/><p style="text-align:right;color:red;">by KindleEar </p>'
        info = T_INFO % ('Failed to save', info)
    else:
        info = _("Failed save to Instapaper<br/>'%s'<br/><br/>Reason : Unknown(%d).") % (title, ret.status_code)
        info += '<br/><p style="text-align:right;color:red;">by KindleEar </p>'
        info = T_INFO % ('Failed to save', info)

    return info.encode('utf-8')

def ParseFeedUrls(self): """ return list like [(section,title,url,desc),..] """ urls = [] url = r'http://cctv.cntv.cn/lm/xinwenyijiayi/video/index.shtml' opener = URLOpener(self.host, timeout=self.timeout) result = opener.open(url) if result.status_code != 200 or not result.content: self.log.warn('fetch webpage failed(%d):%s.' % (result.status_code, url)) return [] if self.feed_encoding: try: content = result.content.decode(self.feed_encoding) except UnicodeDecodeError: content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers) else: content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers) list_pattern=re.compile(r'{\'title\':\'.*?\'<!--VIDEOSTR-->\'}', re.S) file_name_search=re.compile(r'\d{4}/\d{2}/\d{2}').search l=re.findall(list_pattern,content) tnow = datetime.datetime.utcnow()+datetime.timedelta(hours=8) for i in l[:5]: item=eval(i) try: pubdate = datetime.datetime.strptime(file_name_search(item["link_add"]).group(0), '%Y/%m/%d') except Exception as e: continue delta = tnow - pubdate if self.oldest_article > 0 and delta.days > self.oldest_article: continue urls.append((u'新闻1+1',item['title'],item['link_add'],None)) return urls
def getChapterList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    chapterList = []

    url = url.replace("https://m.tohomh123.com", "https://www.tohomh123.com")
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return chapterList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')
    soup = soup.find("ul", {"id": 'detail-list-select-2'})
    if not soup:
        self.log.warn('chapterList does not exist.')
        return chapterList

    lias = soup.findAll('a')
    if not lias:
        self.log.warn('chapterList href does not exist.')
        return chapterList

    for a in lias:
        href = "https://www.tohomh123.com" + a.get("href")
        chapterList.append((unicode(a.contents[0]), href))

    return chapterList

def postprocess(self, content):
    pn = re.compile(ur'<a href="(\S*?)">本话题在雪球有.*?条讨论,点击查看。</a>', re.I)
    comment = ''
    mt = pn.search(content)
    url = mt.group(1) if mt else None
    if url:
        opener = URLOpener(url, timeout=self.timeout)
        result = opener.open(url)
        if result.status_code == 200 and result.content:
            if self.feed_encoding:
                try:
                    comment = result.content.decode(self.feed_encoding)
                except UnicodeDecodeError:
                    return content

            pn = re.compile(r'SNB.data.goodComments\ =\ ({.*?});', re.S | re.I)
            mt = pn.search(comment)
            comment_json = mt.group(1) if mt else None
            if comment_json:
                j = json.loads(comment_json)
                soup = BeautifulSoup(content, "lxml")
                for c in j['comments']:
                    u = c['user']['screen_name']
                    t = BeautifulSoup('<p>@%s:%s</p>' % (u, c['text']))
                    for img in t.find_all('img', alt=True):
                        img.replace_with(t.new_string(img['alt']))
                    soup.html.body.append(t.p)
                content = unicode(soup)
    return content

def getImgUrlList(self, url):
    imgUrlList = []
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return None

    content = self.AutoDecodeContent(result.content, decoder, self.page_encoding,
                                     opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')
    sel = soup.find('select')  # page-number row, extract all pages from it
    if sel is None:
        self.log.warn('soup select does not exist.')
        return None

    ulist = sel.find_all('option') if sel else None
    if not ulist:
        self.log.warn('select option does not exist.')
        return None

    for ul in ulist:
        if ul.get('value') is None:
            continue  # skip options without a value
        href = self.host + '/comic/' + ul.get('value')
        imgUrlList.append(href)

    return imgUrlList

def ParseFeedUrls(self): """ return list like [(section,title,url,desc),..] """ urls = [] url = r'http://opinion.people.com.cn/GB/40604/index.html' opener = URLOpener(self.host, timeout=self.timeout) result = opener.open(url) if result.status_code != 200 or not result.content: self.log.warn('fetch webpage failed(%d):%s.' % (result.status_code, url)) return [] if self.feed_encoding: try: content = result.content.decode(self.feed_encoding) except UnicodeDecodeError: content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers) else: content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers) soup = BeautifulSoup(content, 'lxml') box=soup.find('div', attrs={'class':'p2j_list'}) for li in box.find_all('li'): a=li.find('a') # print a['href'],a.string title = a.string if u'人民日报' in title: urls.append((u'人民日报',a.string,r'%s%s'%(r'http://opinion.people.com.cn',a['href']),None)) else: urls.append((u'人民网',a.string,r'%s%s'%(r'http://opinion.people.com.cn',a['href']),None)) return urls
def ParseFeedUrls(self): """ return list like [(section,title,url,desc),..] """ urls = [] urladded = set() url = self.url4forwarder( 'http://news.at.zhihu.com/api/1.1/news/latest') opener = URLOpener(self.host, timeout=self.timeout) result = opener.open(url) if result.status_code == 200 and result.content: feed = json.loads(result.content.decode(self.feed_encoding)) for partition in self.partitions: for item in feed[partition]: for e in item['items']: urlfeed = e['share_url'] if urlfeed in urladded: self.log.warn('skipped %s' % urlfeed) continue urls.append((self.partitions[partition], e['title'], urlfeed, None)) urladded.add(urlfeed) else: self.log.warn('fetch rss failed(%d):%s' % (result.status_code, url)) #self.log.warn('%s' % json.dumps(urls)) return urls
def ParseFeedUrls(self): """ return list like [(section,title,url,desc),..] """ urls = [] urladded = set() url = self.url4forwarder(self.feeds[0][1]) opener = URLOpener(self.host, timeout=self.timeout) result = opener.open(url) if result.status_code == 200 and result.content: feed = json.loads(result.content.decode(self.feed_encoding)) for partition,section in self.partitions: for item in feed[partition]: urlfeed = item['share_url'] if urlfeed in urladded: self.log.info('duplicated, skipped %s' % urlfeed) continue urls.append((section, item['title'], self.url4forwarder(urlfeed), None)) urladded.add(urlfeed) else: self.log.warn('fetch rss failed(%s):%s' % (URLOpener.CodeMap(result.status_code), url)) return urls #def fetcharticle(self, url, opener, decoder): # result = opener.open(self.url4forwarder(url)) # status_code, content = result.status_code, result.content # if status_code != 200 or not content: # self.log.warn('fetch article failed(%s):%s.' % (URLOpener.CodeMap(status_code),url)) # return None # # if self.page_encoding: # return content.decode(self.page_encoding) # else: # return decoder.decode(content,url,result.headers)
def POST(self, verType):
    INSTAPAPER_API_AUTH_URL = "https://www.instapaper.com/api/authenticate"
    web.header('Content-Type', 'application/json')
    respDict = {'status': 'ok', 'correct': 0}
    if verType.lower() != 'instapaper':
        respDict['status'] = _('Request type[%s] unsupported') % verType
        return json.dumps(respDict)

    user = self.getcurrentuser()
    username = web.input().get('username', '')
    password = web.input().get('password', '')

    opener = URLOpener()
    apiParameters = {'username': username, 'password': password}
    ret = opener.open(INSTAPAPER_API_AUTH_URL, data=apiParameters)
    if ret.status_code in (200, 201):
        respDict['correct'] = 1
    elif ret.status_code == 403:
        respDict['correct'] = 0
    else:
        respDict['status'] = _("The Instapaper service encountered an error. Please try again later.")

    return json.dumps(respDict)

def getChapterList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    chapterList = []

    if url.startswith("https://www.manhuagui.com"):
        url = url.replace('https://www.manhuagui.com', 'https://m.manhuagui.com')

    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return chapterList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')

    invisible_input = soup.find("input", {"id": '__VIEWSTATE'})
    if invisible_input:
        lz_encoded = invisible_input.get("value")
        lz_decoded = decompressFromBase64(lz_encoded)
        soup = BeautifulSoup(lz_decoded, 'html.parser')
    else:
        soup = soup.find("div", {"class": 'chapter-list', "id": 'chapterList'})

    lias = soup.findAll('a')
    for aindex in range(len(lias)):
        rindex = len(lias) - 1 - aindex
        href = "https://m.manhuagui.com" + lias[rindex].get("href")
        chapterList.append(href)

    return chapterList

def getImgList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []

    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')
    scripts = soup.findAll("script", {"type": "text/javascript"})
    for script in scripts:
        if script.text != "":
            raw_content = script.text
            break

    res = re.search(r'window\["\\x65\\x76\\x61\\x6c"\](.*\))', raw_content).group(1)
    lz_encoded = re.search(r"'([A-Za-z0-9+/=]+)'\['\\x73\\x70\\x6c\\x69\\x63'\]\('\\x7c'\)", res).group(1)
    lz_decoded = decompressFromBase64(lz_encoded)
    res = re.sub(r"'([A-Za-z0-9+/=]+)'\['\\x73\\x70\\x6c\\x69\\x63'\]\('\\x7c'\)",
                 "'%s'.split('|')" % (lz_decoded), res)
    codes = self.get_node_online(res)
    pages_opts = json.loads(re.search(r'^SMH.reader\((.*)\)\.preInit\(\);$', codes).group(1))

    cid = self.getChapterId(url)
    md5 = pages_opts["sl"]["md5"]
    images = pages_opts["images"]
    for img in images:
        img_url = u'https://i.hamreus.com{}?cid={}&md5={}'.format(img, cid, md5)
        imgList.append(img_url)

    return imgList

def ParseFeedUrls(self): """ return list like [(section,title,url,desc),..] """ urls = [] tnow = datetime.utcnow() urladded = set() for feed in self.feeds: section, url = feed[0], feed[1] partition = feed[2] timeout = 30 opener = URLOpener(self.host, timeout=timeout) result = opener.open(url) if result.status_code == 200 and result.content: feed = json.loads(result.content.decode(self.feed_encoding)) for item in feed[partition]: for e in item['items']: #支持HTTPS urlfeed = e['share_url'].replace('http://','https://') if url.startswith('https://') else e['share_url'] if urlfeed in urladded: self.log.warn('skipped %s' % urlfeed) continue desc = None urls.append((section, e['title'], urlfeed, desc)) urladded.add(urlfeed) else: self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url)) #self.log.warn('%s' % json.dumps(urls)) return urls
def BaiduPanHandler(url):
    import json
    o = urlparse.urlparse(url)
    if not o[1] or not o[1].endswith(('pan.baidu.com', 'yun.baidu.com')):
        return None

    # For simplicity, use a third-party site to resolve the real download link.
    # To drop this dependency later, the code from
    # https://github.com/banbanchs/pan-baidu-download
    # and https://github.com/xuanqinanhai/bleed-baidu-white
    # could be integrated here.
    url = 'http://daimajia.duapp.com/baidu/?url=%s' % url
    opener = URLOpener()
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        return None

    linkinfo = json.loads(result.content.decode('utf-8'))
    filename = linkinfo.get('name')
    if '\u' in filename:
        try:
            filename = filename.decode('unicode-escape')
        except:
            pass
    link = linkinfo.get('download')
    return (filename, link) if link else None

def getChapterList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    chapterList = []

    if url.startswith("http://"):
        url = url.replace('http://', 'https://')

    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return chapterList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')
    allComicTable = soup.find_all('table', {'width': '800', 'align': 'center'})
    if not allComicTable:
        self.log.warn('allComicTable does not exist.')
        return chapterList

    for comicTable in allComicTable:
        comicVolumes = comicTable.find_all('a', {'target': '_blank'})
        if not comicVolumes:
            self.log.warn('comicVolumes does not exist.')
            return chapterList
        for volume in comicVolumes:
            href = self.urljoin(self.host, volume.get("href"))
            chapterList.append((unicode(volume.string), href))

    return chapterList

def fetcharticle(self, url, decoder): """链接网页获取一篇文章""" if self.fulltext_by_instapaper and not self.fulltext_by_readability: url = "http://www.instapaper.com/m?u=%s" % self.url_unescape(url) opener = URLOpener(self.host, timeout=self.timeout) result = opener.open(url) status_code, content = result.status_code, result.content if status_code != 200 or not content: self.log.warn('fetch article failed(%d):%s.' % (status_code, url)) return None if 0: #有些网站封锁GAE,将GAE获取的网页发送到自己邮箱调试 from google.appengine.api import mail mail.send_mail(SRC_EMAIL, SRC_EMAIL, "KindleEar Debug", "KindlerEar", attachments=[ ("Page.html", content), ]) if self.page_encoding: try: return content.decode(self.page_encoding) except UnicodeDecodeError: return decoder.decode(content, opener.realurl) else: return decoder.decode(content, opener.realurl)
def postprocess(self, content):
    pn = re.compile(ur'<a href="(\S*?)">本话题在雪球有.*?条讨论,点击查看。</a>', re.I)
    comment = ''
    mt = pn.search(content)
    url = mt.group(1) if mt else None
    if url:
        opener = URLOpener(url, timeout=self.timeout)
        result = opener.open(url)
        if result.status_code == 200 and result.content:
            if self.feed_encoding:
                try:
                    comment = result.content.decode(self.feed_encoding)
                except UnicodeDecodeError:
                    return content

            pn = re.compile(r'SNB.data.goodComments\ =\ ({.*?});', re.S | re.I)
            mt = pn.search(comment)
            if mt:
                comment_json = mt.group(1)
                j = json.loads(comment_json)
                soup = BeautifulSoup(content, "lxml")
                for c in j['comments']:
                    u = c['user']['screen_name']
                    t = BeautifulSoup('<p>@%s:%s</p>' % (u, c['text']))
                    for img in t.find_all('img', alt=True):
                        img.replace_with(t.new_string(img['alt']))
                    soup.html.body.append(t.p)
                content = unicode(soup)
    return content

def getImgList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []

    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    match = re.search(r'qTcms_S_m_murl_e="(.*)";', content)
    if match is None:
        self.log.warn(content)
        self.log.warn('var qTcms_S_m_murl_e does not exist.')
        return imgList

    list_encoded = match.group().split('\"')[1]
    lz_decoded = b64decode(list_encoded)
    images = lz_decoded.split("$qingtiandy$")
    if not images:
        self.log.warn('image list does not exist.')
        return imgList

    for img in images:
        imgList.append(img)

    return imgList

def ParseFeedUrls(self): """ return list like [(section,title,url,desc),..] """ urls = [] tnow = datetime.datetime.utcnow() urladded = set() for feed in self.feeds: section, url = feed[0], feed[1].replace('gzh', 'gzhjs') isfulltext = feed[2] if len(feed) > 2 else False timeout = self.timeout+10 if isfulltext else self.timeout opener = URLOpener(self.host, timeout=timeout) result = opener.open(url) if result.status_code == 200 and result.content: if self.feed_encoding: try: content = result.content.decode(self.feed_encoding) except UnicodeDecodeError: content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers) else: content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers) content = content[content.index('{'):content.index('}')+1] content = json.loads(content) for e in content['items'][:self.max_articles_per_feed]: e = feedparser.parse(e)['entries'][0] updated = None if hasattr(e, 'lastmodified') and e.lastmodified: updated = float(e.lastmodified) if self.oldest_article > 0 and updated: updated = datetime.datetime.utcfromtimestamp(updated) delta = tnow - updated if self.oldest_article > 365: threshold = self.oldest_article #以秒为单位 else: threshold = 86400*self.oldest_article #以天为单位 if delta.days*86400+delta.seconds > threshold: self.log.info("Skip old article(%s): %s" % (updated.strftime('%Y-%m-%d %H:%M:%S'),e.href)) continue #支持HTTPS if hasattr(e, 'href'): if url.startswith('https://'): urlfeed = e.href.replace('http://','https://') else: urlfeed = e.href if urlfeed in urladded: continue else: urlfeed = '' desc = None urls.append((section, e.title, urlfeed, desc)) urladded.add(urlfeed) else: self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url)) return urls
def getImgList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []

    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList

    urlpaths = urlparse.urlsplit(url.lower()).path.split("/")
    if (u"mh" in urlpaths) and (urlpaths.index(u"mh") + 2 < len(urlpaths)):
        tid = str(time.time()).replace(".", "1")
        if len(tid) == 12:
            tid = tid + "1"
        cid = urlpaths[urlpaths.index(u"mh") + 1]
        pid = urlpaths[urlpaths.index(u"mh") + 2].replace(".html", "")
    else:
        self.log.warn('Can not get cid and pid from URL: {}.'.format(url))
        return imgList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    match = re.search(r'var qTcms_S_m_murl_e=".*";', content)
    if match is None:
        self.log.warn('var qTcms_S_m_murl_e does not exist.')
        return imgList

    list_encoded = match.group().split('\"')[1]
    lz_decoded = b64decode(list_encoded)
    images = lz_decoded.split("$qingtiandy$")
    if not images:
        self.log.warn('image list does not exist.')
        return imgList

    for img in images:
        if "http://www.baidu1.com/" in img:
            b64str = img.replace("http://www.baidu1.com/", "") + '|{}|{}|{}|pc'.format(tid, cid, pid)
            imgb64 = b64encode(b64str)
            img_url = u'http://img_733.234us.com/newfile.php?data={}'.format(imgb64)
        elif "http://ac.tc.qq.com/" in img:
            b64str = img + '|{}|{}|{}|pc'.format(tid, cid, pid)
            imgb64 = b64encode(b64str)
            img_url = u'http://img_733.234us.com/newfile.php?data={}'.format(imgb64)
        else:
            img_url = img
        self.log.info('The image href is: %s' % img_url)
        imgList.append(img_url)

    return imgList

def SendNewSubscription(self, title, url):
    opener = URLOpener()
    path = SharedLibraryMgrkindleearAppspotCom.__url__.split('/')
    path[-1] = 'subscribedfromshared'
    srvUrl = urlparse.urljoin('http://kindleear.appspot.com/', '/'.join(path))
    data = {'title': title, 'url': url}
    result = opener.open(srvUrl, data)  # fire and forget, no need to check whether it succeeded

def ParseFeedUrls(self): """ return list like [(section,title,url,desc),..] """ urls = [] tnow = datetime.datetime.utcnow() urladded = set() for feed in self.feeds: section, url = feed[0], feed[1] isfulltext = feed[2] if len(feed) > 2 else False timeout = self.timeout+10 if isfulltext else self.timeout opener = URLOpener(self.host, timeout=timeout) result = opener.open(url) if result.status_code == 200 and result.content: if self.feed_encoding: try: content = result.content.decode(self.feed_encoding) except UnicodeDecodeError: content = AutoDecoder(True).decode(result.content,url) else: content = AutoDecoder(True).decode(result.content,url) feed = feedparser.parse(content) for e in feed['entries'][:self.max_articles_per_feed]: updated = None if hasattr(e, 'updated_parsed') and e.updated_parsed: updated = e.updated_parsed elif hasattr(e, 'published_parsed') and e.published_parsed: updated = e.published_parsed elif hasattr(e, 'created_parsed'): updated = e.created_parsed if self.oldest_article > 0 and updated: delta = tnow - datetime.datetime(*(updated[0:6])) if delta.days*86400+delta.seconds > 86400*self.oldest_article: self.log.info("Skip old article: %s" % e.link) continue #支持HTTPS urlfeed = e.link.replace('http://','https://') if url.startswith('https://') else e.link if urlfeed in urladded: continue desc = None if isfulltext: if hasattr(e, 'description'): desc = e.description elif hasattr(e, 'content') and e.content[0]['value']: desc = e.content[0]['value'] else: self.log.warn('fulltext feed item no has desc,link to webpage for article.(%s)'%e.title) urls.append((section, e.title, urlfeed, desc)) urladded.add(urlfeed) else: self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url)) return urls
def GetNewComic(self):
    urls = []
    if not self.feeds:
        return []

    userName = self.UserName()
    decoder = AutoDecoder(isfeed=False)
    for item in self.feeds:
        title, url = item[0], item[1]

        lastCount = LastDelivered.all().filter('username = ', userName).filter('bookname = ', title).get()
        if not lastCount:
            self.log.info('There is no log in db LastDelivered for name: %s, set to 0' % title)
            oldNum = 0
        else:
            oldNum = lastCount.num

        opener = URLOpener(self.host, timeout=60)
        result = opener.open(url)
        if result.status_code != 200:
            self.log.warn('fetch index page for %s failed[%s] : %s' %
                          (title, URLOpener.CodeMap(result.status_code), url))
            continue
        content = result.content
        content = self.AutoDecodeContent(content, decoder, self.feed_encoding,
                                         opener.realurl, result.headers)

        soup = BeautifulSoup(content, 'lxml')
        allComicTable = soup.find_all('table', {'width': '688'})
        addedForThisComic = False
        for comicTable in allComicTable:
            comicVolumes = comicTable.find_all('a', {'target': '_blank'})
            for volume in comicVolumes:
                texts = volume.text.split(' ')
                if len(texts) > 2 and texts[1].isdigit() and volume.get('href'):
                    num = int(texts[1])
                    if num > oldNum:
                        oldNum = num
                        href = self.urljoin(self.host, volume.get('href'))
                        urls.append((title, num, href))
                        addedForThisComic = True
                        break  # deliver only one volume at a time (a single volume can already contain many images)
            if addedForThisComic:
                break

    return urls

class Pocket(object):
    def __init__(self, consumer_key, redirect_uri=None):
        self.consumer_key = str(consumer_key)
        self.redirect_uri = redirect_uri
        self.access_token = None
        self.opener = URLOpener(headers=POCKET_HEADERS)

    def _post(self, method_url, **kw):
        ret = self.opener.open(method_url, data=json.dumps(kw))
        if ret.status_code != 200 or not ret.content:
            raise APIError(ret.status_code, ret.headers.get("X-Error-Code", ""),
                           ret.headers.get("X-Error", ""), "Get access token")
        return json.loads(ret.content)

    def _authenticated_post(self, method_url, **kw):
        kw["consumer_key"] = self.consumer_key
        kw["access_token"] = self.access_token
        return self._post(method_url, **kw)

    def get_request_token(self):
        # This step only makes one HTTP request to obtain a request_token (code);
        # Pocket does not call back the redirect_uri at this point.
        ret = self._post(REQUEST_TOKEN_URL, consumer_key=self.consumer_key,
                         redirect_uri=self.redirect_uri)
        return ret.get("code", "")

    def get_authorize_url(self, code):
        if not self.redirect_uri:
            raise APIError(400, "140", "Missing redirect url.", "Get access token")
        url = AUTH_TOKEN_URL % {"request_token": code, "redirect_uri": self.redirect_uri}
        return url

    def get_access_token(self, code):
        # access token response looks like:
        # {"access_token":"dcba4321-dcba-4321-dcba-4321dc","username":"******"}
        ret = self._post(ACCESS_TOKEN_URL, consumer_key=self.consumer_key, code=code)
        self.access_token = ret.get("access_token", "")
        return ret

    def set_access_token(self, access_token):
        self.access_token = str(access_token)

    def add(self, **kw):
        # Expected parameters:
        #   url      : the URL to save, preferably urlencoded beforehand
        #   title    : optional, only needed when saving an image or PDF
        #   tags     : optional, comma-separated list of tags
        #   tweet_id : optional, tweet id when saving from a tweet
        # Returns a dict; the possible keys are documented at
        # https://getpocket.com/developer/docs/v3/add
        return self._authenticated_post("https://getpocket.com/v3/add", **kw)

    def get(self, **kw):
        return self._authenticated_post("https://getpocket.com/v3/get", **kw)

    def modify(self, **kw):
        pass

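# The comments in the Pocket class above describe its request token -> user
# authorization -> access token -> add flow. A minimal usage sketch of that
# flow follows; it assumes REQUEST_TOKEN_URL, AUTH_TOKEN_URL, ACCESS_TOKEN_URL
# and POCKET_HEADERS are defined as in this module, and the consumer key and
# redirect URI below are placeholders, not values from this project.
#
# pocket = Pocket('my_consumer_key', redirect_uri='http://example.com/pocket/callback')
#
# # Step 1: obtain a request token (code) with a single HTTP call.
# code = pocket.get_request_token()
#
# # Step 2: send the user to the authorization page built from that code;
# # after the user approves, Pocket redirects back to redirect_uri.
# print(pocket.get_authorize_url(code))
#
# # Step 3: exchange the approved code for an access token.
# pocket.get_access_token(code)
#
# # Step 4: save an article; only 'url' is required, 'title' mainly helps
# # when saving images or PDFs.
# pocket.add(url='http://example.com/some-article', tags='kindleear')
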
def get_image_list_from_api(self, url):
    comic_id, chapter_id = re.search(r"(\d+)/(\d+)\.html", url).groups()
    opener = URLOpener(addreferer=False, timeout=60)
    result = opener.open(
        "http://v3api.dmzj.com/chapter/{comic_id}/{chapter_id}.json".format(
            comic_id=comic_id, chapter_id=chapter_id))

    if result.status_code != 200:
        self.log.info("fetch v3 api json failed: %s, try v2" % result.status_code)
        result = opener.open(
            "http://v2.api.dmzj.com/chapter/{comic_id}/{chapter_id}.json?channel=Android&version=2.6.004".format(
                comic_id=comic_id, chapter_id=chapter_id))
        if result.status_code != 200:
            self.log.warn("fetch v2 api json failed: %s" % result.status_code)
            return []

    data = json.loads(result.content)
    return data["page_url"]

def fetcharticle(self, url, decoder):
    opener = URLOpener(self.host, timeout=self.timeout)
    result = opener.open(self.url4forwarder(url))
    status_code, content = result.status_code, result.content
    if status_code != 200 or not content:
        self.log.warn('fetch article failed(%d):%s.' % (status_code, url))
        return None

    if self.page_encoding:
        return content.decode(self.page_encoding)
    else:
        return decoder.decode(content, url)