Example #1
def BaiduPanHandler(url):
    import json
    import urlparse
    o = urlparse.urlparse(url)
    if not o[1] or not o[1].endswith(('pan.baidu.com','yun.baidu.com')):
        return None
    
    #For simplicity, use a community-made website to resolve the real download link.
    #To reduce external dependencies later, code from
    #https://github.com/banbanchs/pan-baidu-download
    #and https://github.com/xuanqinanhai/bleed-baidu-white
    #could be integrated here instead.
    url = 'http://daimajia.duapp.com/baidu/?url=%s' % url
    opener = URLOpener()
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        return None
    linkinfo = json.loads(result.content.decode('utf-8'))
    filename = linkinfo.get('name')
    if '\u' in filename:
        try:
            filename = filename.decode('unicode-escape')
        except:
            pass
    link = linkinfo.get('download')
    
    return (filename,link) if link else None
Example #2
 def ParseFeedUrls(self):
     urls = [] # empty list to collect the article tuples
     # loop over the two topic pages defined in self.feeds
     for feed in self.feeds:
         # unpack the topic name and link from the tuple
         topic, url = feed[0], feed[1]
         # request the topic link and fetch the response
         opener = URLOpener(self.host, timeout=self.timeout)
         result = opener.open(url)
         # if the request succeeded and the page content is not empty
         if result.status_code == 200 and result.content:
             # parse the page content into a BeautifulSoup object
             soup = BeautifulSoup(result.content, 'lxml')
             # find every article entry in the current page's article list
             items = soup.find_all(name='div', id='list')
             # process each article entry
             for item in items:
                 title = item.a.string # article title
                 link = item.a.get('href') # article link
                 link = BaseFeedBook.urljoin(url, link) # build the absolute article link
                 urls.append((topic, title, link, None)) # add the article tuple to the list
         # if the request failed, report it to the log
         else:
             self.log.warn('Fetch article failed(%s):%s' % \
                 (URLOpener.CodeMap(result.status_code), url))
     # return all extracted articles
     return urls
Example #3
    def postprocess(self, content):
        pn = re.compile(ur'<a href="(\S*?)">本话题在纽约时报有.*?条讨论,点击查看。</a>', re.I)
        comment = ''
        mt = pn.search(content)
        url = mt.group(1) if mt else None
        if url:
            opener = URLOpener(url, timeout=self.timeout)
            result = opener.open(url)
            if result.status_code == 200 and result.content:
                if self.feed_encoding:
                    try:
                        comment = result.content.decode(self.feed_encoding)
                    except UnicodeDecodeError:
                        return content

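        # extract the goodComments JSON object embedded in the fetched page's inline SNB JavaScript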
        pn = re.compile(r'SNB.data.goodComments\ =\ ({.*?});', re.S | re.I)
        mt = pn.search(comment)
        if mt:
            comment_json = mt.group(1)
            j = json.loads(comment_json)
            soup = BeautifulSoup(content, "lxml")
            for c in j['comments']:
                u = c['user']['screen_name']
                t = BeautifulSoup('<p>@%s:%s</p>' % (u, c['text']))
                for img in t.find_all('img', alt=True):
                    img.replace_with(t.new_string(img['alt']))
                soup.html.body.append(t.p)

            content = unicode(soup)
        return content
Example #4
 def fetcharticle(self, url, decoder):
     opener = URLOpener(self.host, timeout=self.timeout)
     result = opener.open(url)
     status_code, content = result.status_code, result.content
     if status_code != 200 or not content:
         self.log.warn('fetch article failed(%d):%s.' % (status_code,url))
         return None
     
     if self.page_encoding:
         content = content.decode(self.page_encoding)
     else:
         content = decoder.decode(content,url)
     
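     # some article pages only embed the real content in an iframe; if so, follow the iframe src and fetch that page instead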
     m = re.search(r'<iframe.*?src="(.*?)".*?>', content)
     if m:
         newurl = m.group(1)
         result = opener.open(newurl)
         status_code, content = result.status_code, result.content
         if status_code != 200 or not content:
             self.log.warn('fetch article failed(%d):%s.' % (status_code,newurl))
             return None
         
         if self.page_encoding:
             content = content.decode(self.page_encoding)
         else:
             content = decoder.decode(content,newurl)
     
     return content
Example #5
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        tnow = datetime.datetime.utcnow()
        urladded = set()

        for feed in self.feeds:
            section, url = feed[0], feed[1].replace('gzh', 'gzhjs')
            isfulltext = feed[2] if len(feed) > 2 else False
            timeout = self.timeout+10 if isfulltext else self.timeout
            opener = URLOpener(self.host, timeout=timeout)
            result = opener.open(url)
            if result.status_code == 200 and result.content:
                if self.feed_encoding:
                    try:
                        content = result.content.decode(self.feed_encoding)
                    except UnicodeDecodeError:
                        content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers)
                else:
                    content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers)
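                # the gzhjs endpoint wraps the data in a JSONP-style callback; keep only the JSON object between the outer braces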
                content = content[content.index('{'):content.rindex('}')+1]
                content = json.loads(content)

                for e in content['items'][:self.max_articles_per_feed]:
                    e = feedparser.parse(e)['entries'][0]
                    updated = None
                    if hasattr(e, 'lastmodified') and e.lastmodified:
                        updated = float(e.lastmodified)

                    if self.oldest_article > 0 and updated:
                        updated = datetime.datetime.utcfromtimestamp(updated)
                        delta = tnow - updated
                        if self.oldest_article > 365:
                            threshold = self.oldest_article #treat the value as seconds
                        else:
                            threshold = 86400*self.oldest_article #treat the value as days

                        if delta.days*86400+delta.seconds > threshold:
                            self.log.info("Skip old article(%s): %s" % (updated.strftime('%Y-%m-%d %H:%M:%S'),e.href))
                            continue

                    #support HTTPS
                    if hasattr(e, 'href'):
                        if url.startswith('https://'):
                            urlfeed = e.href.replace('http://','https://')
                        else:
                            urlfeed = e.href

                        if urlfeed in urladded:
                            continue
                    else:
                        urlfeed = ''

                    desc = None
                    urls.append((section, e.title, urlfeed, desc))
                    urladded.add(urlfeed)
            else:
                self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url))

        return urls
Example #6
 def ParseFeedUrls(self):
     """ return list like [(section,title,url,desc),..] """
     urls = []
     url = r'http://cctv.cntv.cn/lm/xinwenyijiayi/video/index.shtml'
     opener = URLOpener(self.host, timeout=self.timeout)
     result = opener.open(url)
     if result.status_code != 200 or not result.content:
         self.log.warn('fetch webpage failed(%d):%s.' % (result.status_code, url))
         return []
         
     if self.feed_encoding:
         try:
             content = result.content.decode(self.feed_encoding)
         except UnicodeDecodeError:
             content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)
     else:
         content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)
     
     list_pattern=re.compile(r'{\'title\':\'.*?\'<!--VIDEOSTR-->\'}', re.S)
     file_name_search=re.compile(r'\d{4}/\d{2}/\d{2}').search
     l=re.findall(list_pattern,content)
     tnow = datetime.datetime.utcnow()+datetime.timedelta(hours=8)
     for i in l[:5]:
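         # each match is a {'title':..., 'link_add':...} literal scraped from the page; eval() turns it into a dict (ast.literal_eval would be safer)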
         item=eval(i)
         try:
             pubdate = datetime.datetime.strptime(file_name_search(item["link_add"]).group(0), '%Y/%m/%d')
         except Exception as e:
             continue
         delta = tnow - pubdate
         if self.oldest_article > 0 and delta.days > self.oldest_article:
               continue
         urls.append((u'新闻1+1',item['title'],item['link_add'],None))
     return urls
     
Example #7
    def getChapterList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        chapterList = []

        if url.startswith( "http://" ):
            url = url.replace('http://', 'https://')

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return chapterList

        content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)

        soup = BeautifulSoup(content, 'html.parser')
        allComicTable = soup.find_all('table', {'width': '800', 'align': 'center'})

        if (allComicTable is None):
            self.log.warn('allComicTable is not exist.')
            return chapterList

        for comicTable in allComicTable:
            comicVolumes = comicTable.find_all('a', {'target': '_blank'})
            if (comicVolumes is None):
                self.log.warn('comicVolumes is not exist.')
                return chapterList

            for volume in comicVolumes:
                href = self.urljoin(self.host, volume.get("href"))
                chapterList.append((unicode(volume.string), href))

        return chapterList
Example #8
 def SaveToInstapaper(self, user, action, orgUrl):
     web.header('Content-type', "text/html; charset=utf-8")
     
     T_INFO = u"""<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
         <title>%s</title></head><body><p style="text-align:center;font-size:1.5em;">%s</p></body></html>"""
     if not user.instapaper_username or not user.instapaper_password:
         info = T_INFO % ('No authorize info', 'Instapaper username and password have to be provided first!<br/>Please fill them in your KindleEar application.')
         return info.encode('utf-8')
     
     title = web.input().get('t', '')
     name = web.input().get("n", '')
     if user.instapaper_username != name:
         info = T_INFO % ('Action rejected', 'Username does not match!<br/>KindleEar refuses to execute your command.')
         return info.encode('utf-8')
         
     opener = URLOpener()
     password = ke_decrypt(user.instapaper_password, user.secret_key or '')
     apiParameters = {'username': user.instapaper_username, 'password':password, 'title':title.encode('utf-8'), 
                     'selection':'KindleEar', 'url':orgUrl}
     ret = opener.open(INSTAPAPER_API_ADD_URL, data=apiParameters)
     if ret.status_code in (200, 201):
         info = _("'%s'<br/><br/>Saved to your Instapaper account.") % title
         info += '<br/><p style="text-align:right;color:red;">by KindleEar &nbsp;</p>'
         info = T_INFO % ('Saved to Instapaper', info)
     elif ret.status_code == 403:
         info = _("Failed save to Instapaper<br/>'%s'<br/><br/>Reason : Invalid username or password.") % title
         info += '<br/><p style="text-align:right;color:red;">by KindleEar &nbsp;</p>'
         info = T_INFO % ('Failed to save', info)
     else:
         info = _("Failed save to Instapaper<br/>'%s'<br/><br/>Reason : Unknown(%d).") % (title, ret.status_code)
         info += '<br/><p style="text-align:right;color:red;">by KindleEar &nbsp;</p>'
         info = T_INFO % ('Failed to save', info)
     
     return info.encode('utf-8')
     
Example #9
    def POST(self):
        user = self.getcurrentuser(forAjax=True)
        web.header('Content-Type', 'application/json')
        webInput = web.input()
        category = webInput.get('category', '')
        title = webInput.get('title')
        feedUrl = webInput.get("url")
        isfulltext = bool(webInput.get('isfulltext', '').lower() == 'true')
        creator = webInput.get('creator', '')

        if not title or not feedUrl:
            return json.dumps({'status': _("Title or Url is empty!")})

        opener = URLOpener()
        srvUrl = urlparse.urljoin('http://kindleear.appspot.com/',
                                  SharedLibrarykindleearAppspotCom.__url__)
        data = {
            'category': category,
            'title': title,
            'url': feedUrl,
            'creator': creator,
            'isfulltext': 'true' if isfulltext else 'false',
            'key': 'kindleear.lucky!'
        }
        result = opener.open(srvUrl, data)
        if result.status_code == 200 and result.content:
            return result.content
        else:
            return json.dumps({
                'status':
                _('Cannot submit data to kindleear.appspot.com, status: %s' %
                  URLOpener.CodeMap(result.status_code))
            })
Example #10
    def getChapterList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        chapterList = []

        if url.startswith( "https://www.manhuagui.com" ):
            url = url.replace('https://www.manhuagui.com', 'https://m.manhuagui.com')

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return chapterList

        content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)

        soup = BeautifulSoup(content, 'html.parser')
        invisible_input = soup.find("input", {"id":'__VIEWSTATE'})
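        # when present, the hidden __VIEWSTATE input carries the chapter list as an LZ-string/base64-compressed HTML fragment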
        if invisible_input:
            lz_encoded=invisible_input.get("value")
            lz_decoded = decompressFromBase64(lz_encoded)
            soup = BeautifulSoup(lz_decoded, 'html.parser')
        else:
            soup = soup.find("div", {"class": 'chapter-list', "id":'chapterList'})

        lias = soup.findAll('a')
        for aindex in range(len(lias)):
            rindex = len(lias)-1-aindex
            href = "https://m.manhuagui.com" + lias[rindex].get("href")
            chapterList.append(href)

        return chapterList
Example #11
 def ParseFeedUrls(self):
     """ return list like [(section,title,url),..] """
     urls = []
     for feed in self.feeds:
         section, url = feed[0], feed[1]
         isfulltext = feed[2] if len(feed) > 2 else False
         timeout = CONNECTION_TIMEOUT+15 if isfulltext else CONNECTION_TIMEOUT
         opener = URLOpener(self.host, timeout=timeout)
         result = opener.open(url)
         if result.status_code == 200 and result.content:
             if self.feed_encoding:
                 feed = feedparser.parse(result.content.decode(self.feed_encoding))
             else:
                 feed = feedparser.parse(AutoDecoder().decode(result.content))
             
             urladded = [] # guard against duplicate articles from some RSS feeds
             for e in feed['entries'][:self.max_articles_per_feed]:
                 url = e.link
                 if url not in urladded:
                     if isfulltext:
                         desc = e.content[0].value if hasattr(e, 'content') and e.content[0].value else e.summary
                         urls.append((section, e.title, url, desc if desc else u'Has no summary, is it fulltext feed?'))
                     else:
                         urls.append((section, e.title, url, None))
                     urladded.append(url)
         else:
             self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url))
     return urls
Example #12
    def get_chapter_list_from_mobile_url(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(addreferer=False, timeout=60)

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn("fetch comic page failed: %s" % result.status_code)
            return []

        content = self.AutoDecodeContent(
            result.content, decoder, self.feed_encoding, opener.realurl, result.headers
        )

        if "obj_id" not in content:
            self.log.warn(u"Can't find obj_id form {}".format(url))
            return []

        comic_id = re.search('obj_id = "(\d+)"', content).group(1)
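        # the mobile page embeds the chapter groups as a JSON array passed to initIntroData(...)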
        data_match = re.search("initIntroData\(([^;]+)\);", content)
        if not data_match:
            return self.get_chapter_list_from_api(comic_id)
        datas = json.loads(data_match.group(1))
        chapter_datas = []
        for data in datas:
            chapter_datas += data["data"]
        if not chapter_datas:
            return self.get_chapter_list_from_api(comic_id)
        chapter_datas.sort(key=lambda d: d["id"])
        chapters = []
        for chapter in chapter_datas:
            chapter_url = "https://m.dmzj.com/view/{comic_id}/{chapter_id}.html".format(
                chapter_id=chapter["id"], comic_id=comic_id
            )
            chapters.append((chapter["chapter_name"], chapter_url))
        return chapters
Example #13
    def getChapterList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host)
        chapterList = []

        url = url.replace("http://www.dm5.com", "https://www.manhuaren.com")

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn(
                "fetch comic page failed: {} (status code {}, content {})".format(
                    url, result.status_code, result.content
                )
            )
            return chapterList

        content = self.AutoDecodeContent(
            result.content, decoder, self.feed_encoding, opener.realurl, result.headers
        )

        soup = BeautifulSoup(content, "html.parser")

        chapter_datas = []
        for link in soup.find_all("a", {"class": "chapteritem"}):
            chapter_datas.append(
                {
                    "chapter_id": int(re.search("m(\d+)", link.get("href")).group(1)),
                    "chapter_title": unicode(link.string),
                }
            )
        chapter_datas.sort(key=lambda d: d["chapter_id"])
        for chapter in chapter_datas:
            chapter_url = "http://www.manhuaren.com/m{}/".format(chapter["chapter_id"])
            chapterList.append((chapter["chapter_title"], chapter_url))
        return chapterList
Example #14
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(url)

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn(
                "fetch comic page failed: {} (status code {}, content {})".format(
                    url, result.status_code, result.content
                )
            )
            return []

        content = self.AutoDecodeContent(
            result.content, decoder, self.feed_encoding, opener.realurl, result.headers
        )
        soup = BeautifulSoup(content, "html.parser")
        scripts = soup.findAll("script", {"type": "text/javascript"})
        packed_js = None
        for script in scripts:
            if "newImgs" in script.text:
                packed_js = script.text
                break
        if not packed_js:
            self.log.warn("Can't find js")
            return []
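        # unpack the obfuscated JavaScript and collect every single-quoted string, which are the image URLs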
        codes = decode_packed_codes(packed_js)
        return re.findall("'(.+?)'", codes)
Example #15
    def getImgList(self, chapterJson, comic_id):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        cid = list(chapterJson.keys())[0]
        getImgListUrl = 'http://ac.qq.com/ComicView/index/id/{0}/cid/{1}'.format(comic_id, cid)
        result = opener.open(getImgListUrl)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % getImgListUrl)
            return imgList

        content = result.content
        cid_page = self.AutoDecodeContent(content, decoder, self.page_encoding, opener.realurl, result.headers)
        filter_result = re.findall(r"data\s*:\s*'(.+?)'", cid_page)
        if len(filter_result) != 0:
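            # the captured data string (minus its first character) is base64-encoded JSON whose 'picture' list holds the image URLs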
            base64data = filter_result[0][1:]
            img_detail_json = json.loads(base64.decodestring(base64data))
            for img_url in img_detail_json.get('picture', []):
                if ( 'url' in img_url ):
                    imgList.append(img_url['url'])
                else:
                    self.log.warn('no url in img_url:%s' % img_url)

        return imgList
Example #16
    def getChapterList(self, comic_id):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        chapterList = []

        getChapterListUrl = 'http://m.ac.qq.com/GetData/getChapterList?id={}'.format(comic_id)
        result = opener.open(getChapterListUrl)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % getChapterListUrl)
            return chapterList

        content = result.content
        content = self.AutoDecodeContent(content, decoder, self.page_encoding, opener.realurl, result.headers)

        contentJson = json.loads(content)
        count = contentJson.get('length', 0)
        if (count != 0):
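            # the JSON maps chapter keys to info dicts; rebuild the chapter list ordered by each entry's 'seq' field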
            for i in range(count + 1):
                for item in contentJson:
                    if isinstance(contentJson[item], dict) and contentJson[item].get('seq') == i:
                        chapterList.append({item: contentJson[item]})
                        break
        else:
            self.log.warn('comic count is zero.')

        return chapterList
Example #17
 def ParseFeedUrls(self):
     """ return list like [(section,title,url,desc),..] """
     urls = []
     for feed in self.feeds:
         section, url = feed[0], feed[1]
         isfulltext = feed[2] if len(feed) > 2 else False
         timeout = self.timeout+10 if isfulltext else self.timeout
         opener = URLOpener(self.host, timeout=timeout)
         result = opener.open(url)
         if result.status_code == 200 and result.content:
             if self.feed_encoding:
                 feed = feedparser.parse(result.content.decode(self.feed_encoding))
             else:
                 feed = feedparser.parse(AutoDecoder().decode(result.content))
             
             urladded = set() # guard against duplicate articles from some RSS feeds
             for e in feed['entries'][:self.max_articles_per_feed]:
                 #support HTTPS
                 urlfeed = e.link.replace('http://','https://') if url.startswith('https://') else e.link
                 if urlfeed not in urladded:
                     desc = None
                     if isfulltext:
                         if hasattr(e, 'content') and e.content[0].value:
                             desc = e.content[0].value
                         elif hasattr(e, 'summary'):
                             desc = e.summary
                         else:
                             self.log.warn('feed item invalid, link to webpage for article (%s)'%e.title)
                     urls.append((section, e.title, urlfeed, desc))
                     urladded.add(urlfeed)
         else:
             self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url))
     return urls
Example #18
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        url = r'http://cctv.cntv.cn/lm/jiaodianfangtan/index.shtml'
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch webpage failed(%d):%s.' % (result.status_code, url))
            return []
            
        if self.feed_encoding:
            try:
                content = result.content.decode(self.feed_encoding)
            except UnicodeDecodeError:
                content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)
        else:
            content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)

        soup = BeautifulSoup(content, 'lxml')
        file_name_search=re.compile(r'\d{4}/\d{2}/\d{2}').search
        tnow = datetime.datetime.utcnow()+datetime.timedelta(hours=8)
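        # each article link embeds its publish date as /YYYY/MM/DD/ in the URL; items older than oldest_article (Beijing time) are skipped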
        for li in soup.find_all('div', attrs={'class':'text'}):
            a=li.find('a')
            href = a['href']
            try:
                pubdate = datetime.datetime.strptime(file_name_search(href).group(0), '%Y/%m/%d')
            except Exception as e:
                continue
            delta = tnow - pubdate
            if self.oldest_article > 0 and delta.days > self.oldest_article:
                  continue
            urls.append((u'焦点访谈',a.string,href,None))
        return urls
        
Example #19
    def getImgUrlList(self, url):
        imgUrlList = []
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return None

        content = self.AutoDecodeContent(result.content, decoder, self.page_encoding, opener.realurl, result.headers)
        soup = BeautifulSoup(content, 'html.parser')

        sel = soup.find('select') #page selector row; extract all the pages from it
        if (sel is None):
            self.log.warn('soup select is not exist.')
            return None

        ulist = sel.find_all('option') if sel else None
        if not ulist:
            self.log.warn('select option is not exist.')
            return None

        for ul in ulist:
            if ul.get('value') is None:
                continue # skip placeholder options without a value; removing while iterating would skip elements
            href = self.host + '/comic/' + ul.get('value')
            imgUrlList.append(href)

        return imgUrlList
Example #20
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        tnow = datetime.utcnow()
        urladded = set()
        for feed in self.feeds:
            section, url = feed[0], feed[1]
            partition = feed[2]
            timeout = 30
            opener = URLOpener(self.host, timeout=timeout)
            result = opener.open(url)
            if result.status_code == 200 and result.content:
                feed = json.loads(result.content.decode(self.feed_encoding))

                for item in feed[partition]:
                    for e in item['items']:
                        #support HTTPS
                        urlfeed = e['share_url'].replace('http://','https://') if url.startswith('https://') else e['share_url']
                        if urlfeed in urladded:
                            self.log.warn('skipped %s' % urlfeed)
                            continue
                            
                        desc = None
                        urls.append((section, e['title'], urlfeed, desc))
                        urladded.add(urlfeed)
            else:
                self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url))
        #self.log.warn('%s' % json.dumps(urls))
        return urls
Example #21
    def getChapterList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        chapterList = []

        if url.startswith( "https://m.733.so" ):
            url = url.replace('https://m.733.so', 'https://www.733.so')

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return chapterList

        content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)

        soup = BeautifulSoup(content, 'html.parser')
        soup = soup.find('div', {"class": "cy_plist"})
        if (soup is None):
            self.log.warn('cy_plist is not exist.')
            return chapterList

        lias = soup.findAll('a')
        if (lias is None):
            self.log.warn('chapterList href is not exist.')
            return chapterList

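        # append the chapter links in reverse document order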
        for aindex in range(len(lias)):
            rindex = len(lias)-1-aindex
            href = "https://www.733.so" + lias[rindex].get("href")
            chapterList.append((lias[rindex].get_text(), href))

        return chapterList
Example #22
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        urladded = set()
        url = self.url4forwarder(self.feeds[0][1])
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        if result.status_code == 200 and result.content:
            feed = json.loads(result.content.decode(self.feed_encoding))
            
            for partition,section in self.partitions:
                for item in feed[partition]:
                    urlfeed = item['share_url']
                    if urlfeed in urladded:
                        self.log.info('duplicated, skipped %s' % urlfeed)
                        continue
                        
                    urls.append((section, item['title'], self.url4forwarder(urlfeed), None))
                    urladded.add(urlfeed)
        else:
            self.log.warn('fetch rss failed(%s):%s' % (URLOpener.CodeMap(result.status_code), url))
        return urls

    #def fetcharticle(self, url, opener, decoder):
    #    result = opener.open(self.url4forwarder(url))
    #    status_code, content = result.status_code, result.content
    #    if status_code != 200 or not content:
    #        self.log.warn('fetch article failed(%s):%s.' % (URLOpener.CodeMap(status_code),url))
    #        return None
    #    
    #    if self.page_encoding:
    #        return content.decode(self.page_encoding)
    #    else:
    #        return decoder.decode(content,url,result.headers)
            
Example #23
File: Adv.py  Project: Geekerui/KindleEar
 def POST(self, verType):
     INSTAPAPER_API_AUTH_URL = "https://www.instapaper.com/api/authenticate"
     web.header('Content-Type', 'application/json')
     
     respDict = {'status':'ok', 'correct':0}
     if verType.lower() != 'instapaper':
         respDict['status'] = _('Request type[%s] unsupported') % verType
         return json.dumps(respDict)
     
     user = self.getcurrentuser()
     
     username = web.input().get('username', '')
     password = web.input().get('password', '')
     opener = URLOpener()
     apiParameters = {'username': username, 'password':password}
     ret = opener.open(INSTAPAPER_API_AUTH_URL, data=apiParameters)
     if ret.status_code in (200, 201):
         respDict['correct'] = 1
     elif ret.status_code == 403:
         respDict['correct'] = 0
     else:
         respDict['status'] = _("The Instapaper service encountered an error. Please try again later.")
     
     return json.dumps(respDict)
     
Example #24
    def get_chapter_list_from_api(self, comic_id):
        opener = URLOpener(addreferer=False, timeout=60)
        json_result = opener.open(
            "http://v3api.dmzj.com/comic/{comic_id}.json".format(comic_id=comic_id)
        )

        if json_result.status_code != 200 or not json_result.content:
            self.log.info(
                "fetch v3 chapter list failed: %s, try v2" % json_result.status_code
            )
            json_result = opener.open(
                "http://v2.api.dmzj.com/comic/{comic_id}.json?channel=Android&version=2.6.004".format(
                    comic_id=comic_id
                )
            )
            if json_result.status_code != 200 or not json_result.content:
                self.log.warn(
                    "fetch v2 chapter list failed: %s" % json_result.status_code
                )
                return []

        data = json.loads(json_result.content)
        chapter_datas = []
        for chapters_data in data["chapters"]:
            chapter_datas += chapters_data["data"]
        chapter_datas.sort(key=lambda d: d["chapter_id"])
        chapters = []
        for chapter in chapter_datas:
            chapter_url = "https://m.dmzj.com/view/{comic_id}/{chapter_id}.html".format(
                chapter_id=chapter["chapter_id"], comic_id=comic_id
            )
            chapters.append((chapter["chapter_title"], chapter_url))
        return chapters
Example #25
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return imgList

        content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
        soup = BeautifulSoup(content, 'html.parser')
        scripts = soup.findAll("script", {"type": "text/javascript"})
        raw_content = ""
        for script in scripts:
            if script.text != "":
                raw_content = script.text
                break

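        # '\x65\x76\x61\x6c' decodes to 'eval' and '\x73\x70\x6c\x69\x63' to 'splic': the page hides the reader
        # config as an LZ-string/base64 blob that is decompressed, split on '|' ('\x7c') and fed to SMH.reader()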
        res = re.search(r'window\["\\x65\\x76\\x61\\x6c"\](.*\))', raw_content).group(1)
        lz_encoded = re.search(r"'([A-Za-z0-9+/=]+)'\['\\x73\\x70\\x6c\\x69\\x63'\]\('\\x7c'\)", res).group(1)
        lz_decoded = decompressFromBase64(lz_encoded)
        res = re.sub(r"'([A-Za-z0-9+/=]+)'\['\\x73\\x70\\x6c\\x69\\x63'\]\('\\x7c'\)", "'%s'.split('|')"%(lz_decoded), res)
        codes = self.get_node_online(res)
        pages_opts = json.loads(re.search(r'^SMH.reader\((.*)\)\.preInit\(\);$', codes).group(1))

        cid = self.getChapterId(url)
        md5 = pages_opts["sl"]["md5"]
        images = pages_opts["images"]
        for img in images:
            img_url = u'https://i.hamreus.com{}?cid={}&md5={}'.format(img, cid, md5)
            imgList.append(img_url)

        return imgList
Example #26
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        url = r'http://opinion.people.com.cn/GB/40604/index.html'
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch webpage failed(%d):%s.' % (result.status_code, url))
            return []
            
        if self.feed_encoding:
            try:
                content = result.content.decode(self.feed_encoding)
            except UnicodeDecodeError:
                content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)
        else:
            content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)
            
        soup = BeautifulSoup(content, 'lxml')

        box=soup.find('div', attrs={'class':'p2j_list'})
        for li in box.find_all('li'):
            a=li.find('a')
            # print a['href'],a.string
            title = a.string
            if u'人民日报' in title:
                urls.append((u'人民日报',a.string,r'%s%s'%(r'http://opinion.people.com.cn',a['href']),None))
            else:
                urls.append((u'人民网',a.string,r'%s%s'%(r'http://opinion.people.com.cn',a['href']),None))
        return urls
Example #27
File: Xueqiu.py  Project: Iam42/KindleEar
    def postprocess(self, content):
        pn = re.compile(ur'<a href="(\S*?)">本话题在雪球有.*?条讨论,点击查看。</a>',
                        re.I)
        comment = ''
        mt = pn.search(content)
        url = mt.group(1) if mt else None
        if url:
            opener = URLOpener(url, timeout=self.timeout)
            result = opener.open(url)
            if result.status_code == 200 and result.content:
                if self.feed_encoding:
                    try:
                        comment = result.content.decode(self.feed_encoding)
                    except UnicodeDecodeError:
                        return content

        pn = re.compile(r'SNB.data.goodComments\ =\ ({.*?});', re.S | re.I)
        mt = pn.search(comment)
        if not mt:
            # no comment JSON found, return the article unchanged
            return content
        j = json.loads(mt.group(1))
        soup = BeautifulSoup(content, "lxml")
        for c in j['comments']:
            u = c['user']['screen_name']
            t = BeautifulSoup('<p>@%s:%s</p>' % (u, c['text']))
            for img in t.find_all('img', alt=True):
                img.replace_with(t.new_string(img['alt']))
            soup.html.body.append(t.p)

        content = unicode(soup)
        return content
Example #28
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return imgList

        urlpaths = urlparse.urlsplit(url.lower()).path.split("/")
        if ((u"mh" in urlpaths)
                and (urlpaths.index(u"mh") + 2 < len(urlpaths))):
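            # build the token triple used below: a 13-digit pseudo-timestamp (tid) plus the comic id (cid)
            # and page id (pid) taken from the /mh/<cid>/<pid>.html URL path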
            tid = str(time.time()).replace(".", "1")
            if len(tid) == 12:
                tid = tid + "1"
            cid = urlpaths[urlpaths.index(u"mh") + 1]
            pid = urlpaths[urlpaths.index(u"mh") + 2].replace(".html", "")
        else:
            self.log.warn('Can not get cid and pid from URL: {}.'.format(url))
            return imgList

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)

        res = re.search(r'var qTcms_S_m_murl_e=".*";', content).group()
        if (res is None):
            self.log.warn('var qTcms_S_m_murl_e is not exist.')
            return imgList

        list_encoded = res.split('\"')[1]
        lz_decoded = b64decode(list_encoded)
        images = lz_decoded.split("$qingtiandy$")

        if (images is None):
            self.log.warn('image list is not exist.')
            return imgList

        for img in images:
            if "http://www.baidu1.com/" in img:
                b64str = img.replace("http://www.baidu1.com/",
                                     "") + '|{}|{}|{}|pc'.format(
                                         tid, cid, pid)
                imgb64 = b64encode(b64str)
                img_url = u'http://img_733.234us.com/newfile.php?data={}'.format(
                    imgb64)
            elif "http://ac.tc.qq.com/" in img:
                b64str = img + '|{}|{}|{}|pc'.format(tid, cid, pid)
                imgb64 = b64encode(b64str)
                img_url = u'http://img_733.234us.com/newfile.php?data={}'.format(
                    imgb64)
            else:
                img_url = img

            self.log.info('The image href is: %s' % img_url)
            imgList.append(img_url)

        return imgList
Example #29
 def SendNewSubscription(self, title, url):
     opener = URLOpener()
     path = SharedLibraryMgrkindleearAppspotCom.__url__.split('/')
     path[-1] = 'subscribedfromshared'
     srvUrl = urlparse.urljoin('http://kindleear.appspot.com/',
                               '/'.join(path))
     data = {'title': title, 'url': url}
     result = opener.open(srvUrl, data)  #fire and forget, no need to check whether it succeeded
Example #30
 def ParseFeedUrls(self):
     """ return list like [(section,title,url,desc),..] """
     urls = []
     tnow = datetime.datetime.utcnow()
     urladded = set()
     
     for feed in self.feeds:
         section, url = feed[0], feed[1]
         isfulltext = feed[2] if len(feed) > 2 else False
         timeout = self.timeout+10 if isfulltext else self.timeout
         opener = URLOpener(self.host, timeout=timeout)
         result = opener.open(url)
         if result.status_code == 200 and result.content:
             if self.feed_encoding:
                 try:
                     content = result.content.decode(self.feed_encoding)
                 except UnicodeDecodeError:
                     content = AutoDecoder(True).decode(result.content,url)
             else:
                 content = AutoDecoder(True).decode(result.content,url)
             feed = feedparser.parse(content)
             
             for e in feed['entries'][:self.max_articles_per_feed]:
                 updated = None
                 if hasattr(e, 'updated_parsed') and e.updated_parsed:
                     updated = e.updated_parsed
                 elif hasattr(e, 'published_parsed') and e.published_parsed:
                     updated = e.published_parsed
                 elif hasattr(e, 'created_parsed'):
                     updated = e.created_parsed
                     
                 if self.oldest_article > 0 and updated:
                     delta = tnow - datetime.datetime(*(updated[0:6]))
                     if delta.days*86400+delta.seconds > 86400*self.oldest_article:
                         self.log.info("Skip old article: %s" % e.link)
                         continue
                 
                  #support HTTPS
                 urlfeed = e.link.replace('http://','https://') if url.startswith('https://') else e.link
                 if urlfeed in urladded:
                     continue
                     
                 desc = None
                 if isfulltext:
                     if hasattr(e, 'description'):
                         desc = e.description
                     elif hasattr(e, 'content') and e.content[0]['value']:
                         desc = e.content[0]['value']
                     else:
                          self.log.warn('fulltext feed item has no desc, link to webpage for article (%s)'%e.title)
                 urls.append((section, e.title, urlfeed, desc))
                 urladded.add(urlfeed)
         else:
             self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url))
     
     return urls
Example #31
    def fetcharticle(self, url, decoder):
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(self.url4forwarder(url))
        status_code, content = result.status_code, result.content
        if status_code != 200 or not content:
            self.log.warn('fetch article failed(%d):%s.' % (status_code, url))
            return None

        if self.page_encoding:
            return content.decode(self.page_encoding)
        else:
            return decoder.decode(content, url)
Example #32
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        tnow = datetime.utcnow()
        urladded = set()

        for feed in self.feeds:
            section, url = feed[0], feed[1]
            isfulltext = feed[2] if len(feed) > 2 else False
            timeout = self.timeout + 10 if isfulltext else self.timeout
            opener = URLOpener(self.host, timeout=timeout)
            result = opener.open(url)
            if result.status_code == 200 and result.content:
                if self.feed_encoding:
                    content = result.content.decode(self.feed_encoding)
                else:
                    content = AutoDecoder(True).decode(result.content, url)
                feed = feedparser.parse(content)

                for e in feed['entries'][:self.max_articles_per_feed]:
                    if self.oldest_article > 0 and hasattr(
                            e, 'updated_parsed'):
                        updated = e.updated_parsed
                        if updated:
                            delta = tnow - datetime(*(updated[0:6]))
                            if delta.days * 86400 + delta.seconds > 86400 * self.oldest_article:
                                self.log.debug("article '%s' is too old" %
                                               e.title)
                                continue
                    #support HTTPS
                    urlfeed = e.link.replace(
                        'http://',
                        'https://') if url.startswith('https://') else e.link
                    if urlfeed in urladded:
                        continue

                    desc = None
                    if isfulltext:
                        if hasattr(e, 'content') and e.content[0]['value']:
                            desc = e.content[0]['value']
                        elif hasattr(e, 'description'):
                            desc = e.description
                        else:
                            self.log.warn(
                                'fulltext feed item has no desc, link to webpage for article (%s)'
                                % e.title)
                    urls.append((section, e.title, urlfeed, desc))
                    urladded.add(urlfeed)
            else:
                self.log.warn('fetch rss failed(%d):%s' %
                              (result.status_code, url))

        return urls
Example #33
 def getRealUrl (self, url, try_count = 1):
     if try_count > 3:
         return url
     try:
         opener = URLOpener(self.host, timeout=self.timeout)
         result = opener.open(url, None, self.http_headers)
         if result.status_code > 400:
             return self.getRealUrl(url, try_count + 1)
         else:
             return opener.realurl
     except:
         return self.getRealUrl(url, try_count + 1)
Example #34
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return imgList

        urlpaths = urlparse.urlsplit(url.lower()).path.split("/")
        if ( (u"mh" in urlpaths) and (urlpaths.index(u"mh")+2 < len(urlpaths)) ):
            tid = str(time.time()).replace(".", "1")
            if len(tid) == 12:
                tid = tid + "1"
            cid = urlpaths[urlpaths.index(u"mh")+1]
            pid = urlpaths[urlpaths.index(u"mh")+2].replace(".html", "")
        else:
            self.log.warn('Can not get cid and pid from URL: {}.'.format(url))
            return imgList

        content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)

        res = re.search(r'var qTcms_S_m_murl_e=".*";', content).group()
        if (res is None):
            self.log.warn('var qTcms_S_m_murl_e is not exist.')
            return imgList

        list_encoded = res.split('\"')[1]
        lz_decoded = b64decode(list_encoded)
        images = lz_decoded.split("$qingtiandy$")

        if (images is None):
            self.log.warn('image list is not exist.')
            return imgList

        for img in images:
            if "http://www.baidu1.com/" in img:
                b64str = img.replace("http://www.baidu1.com/", "") + '|{}|{}|{}|pc'.format(tid, cid, pid)
                imgb64 = b64encode(b64str)
                img_url = u'http://img_733.234us.com/newfile.php?data={}'.format(imgb64)
            elif "http://ac.tc.qq.com/" in img:
                b64str = img + '|{}|{}|{}|pc'.format(tid, cid, pid)
                imgb64 = b64encode(b64str)
                img_url = u'http://img_733.234us.com/newfile.php?data={}'.format(imgb64)
            else:
                img_url = img

            self.log.info('The image href is: %s' % img_url)
            imgList.append(img_url)

        return imgList
Example #35
    def getChapterList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        chapterList = []

        comic_id = ""  # default so the check below fails cleanly when the URL has no /id/<n>/ segment
        urlpaths = urlparse.urlsplit(url.lower()).path.split("/")
        if ((u"id" in urlpaths)
                and (urlpaths.index(u"id") + 1 < len(urlpaths))):
            comic_id = urlpaths[urlpaths.index(u"id") + 1]

        if ((not comic_id.isdigit()) or (comic_id == "")):
            self.log.warn('can not get comic id: %s' % url)
            return chapterList

        url = 'https://m.ac.qq.com/comic/chapterList/id/{}'.format(comic_id)
        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return chapterList

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)

        soup = BeautifulSoup(content, 'html.parser')
        # <section class="chapter-list-box list-expanded" data-vip-free="1">
        section = soup.find('section',
                            {'class': 'chapter-list-box list-expanded'})
        if (section is None):
            self.log.warn('chapter-list-box is not exist.')
            return chapterList

        # <ul class="chapter-list normal">
        # <ul class="chapter-list reverse">
        reverse_list = section.find('ul', {'class': 'chapter-list reverse'})
        if (reverse_list is None):
            self.log.warn('chapter-list is not exist.')
            return chapterList

        for item in reverse_list.find_all('a'):
            # <a class="chapter-link lock" data-cid="447" data-seq="360" href="/chapter/index/id/531490/cid/447">360</a>
            # https://m.ac.qq.com/chapter/index/id/511915/cid/1
            href = 'https://m.ac.qq.com' + item.get('href')
            isVip = "lock" in item.get('class')
            if isVip == True:
                self.log.info(
                    "Chapter {} is Vip, waiting for free.".format(href))
                continue

            chapterList.append((item.get_text(), href))

        return chapterList
Example #36
 def fetcharticle(self, url, decoder):
     opener = URLOpener(self.host, timeout=self.timeout)
     result = opener.open(self.url4forwarder(url))
     status_code, content = result.status_code, result.content
     if status_code != 200 or not content:
         self.log.warn('fetch article failed(%d):%s.' % (status_code,url))
         return None
     
     if self.page_encoding:
         return content.decode(self.page_encoding)
     else:
         return decoder.decode(content,url)
         
Example #37
    def getImgUrl(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)

        url = self.host + "/comic/" + url
        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return None

        content = self.AutoDecodeContent(result.content, decoder, self.page_encoding, opener.realurl, result.headers)
        soup = BeautifulSoup(content, 'html.parser')
        comicImgTag = soup.find('img', {'oncontextmenu': 'return false'})
        return comicImgTag.get('src') if comicImgTag else None
Example #38
    def getImgUrl(self, url):
        opener = URLOpener(self.host, timeout=60)
        headers = {
            'Host':
            "img_733.234us.com",
            'User-Agent':
            'Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25'
        }
        result = opener.open(url, headers=headers)
        if result.status_code != 200 or opener.realurl == url:
            self.log.warn('can not get real comic url for : %s' % url)
            return None

        return opener.realurl
Example #39
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return imgList

        content = result.content
        cid_page = self.AutoDecodeContent(content, decoder, self.page_encoding,
                                          opener.realurl, result.headers)
        filter_result = re.findall(r"data\s*:\s*'(.+?)'", cid_page)
        # "picture": [{},...{}]}
        if len(filter_result) != 0:
            # "picture" > InBpY3R1cmUi
            # picture": > cGljdHVyZSI6
            # icture":[ > aWN0dXJlIjpb
            if "InBpY3R1cmUi" in filter_result[0]:
                base64data = filter_result[0].split("InBpY3R1cmUi")[1]
                self.log.warn('found flag string: %s' % "InBpY3R1cmUi")
            elif "cGljdHVyZSI6" in filter_result[0]:
                base64data = filter_result[0].split("cGljdHVyZSI6")[1]
                self.log.warn('found flag string: %s' % "cGljdHVyZSI6")
            elif "aWN0dXJlIjpb" in filter_result[0]:
                base64data = filter_result[0].split("aWN0dXJl")[1]
                self.log.warn('found flag string: %s' % "aWN0dXJlIjpb")
            else:
                self.log.warn('can not found flag string in data: %s' %
                              filter_result[0])
                return imgList
            decodeData = base64.decodestring(base64data)
            startIndex = decodeData.find('[')
            endIndex = decodeData.find(']')

            if startIndex > -1 and endIndex > -1:
                img_detail_json = json.loads(decodeData[startIndex:endIndex +
                                                        1])
                for img_url in img_detail_json:
                    if ('url' in img_url):
                        imgList.append(img_url['url'])
                    else:
                        self.log.warn('no url in img_url:%s' % img_url)
            else:
                self.log.warn('can not found [] in decodeData:%s' % decodeData)
        else:
            self.log.warn('can not find filter_result in data.')

        return imgList
Example #40
    def getImgUrl(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)

        url = self.host + "/comic/" + url
        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return None

        content = self.AutoDecodeContent(result.content, decoder, self.page_encoding, opener.realurl, result.headers)
        soup = BeautifulSoup(content, 'html.parser')
        comicImgTag = soup.find('img', {'oncontextmenu': 'return false'})
        return comicImgTag.get('src') if comicImgTag else None
Example #41
 def getPageContent(u_url):
     opener = URLOpener(self.host, timeout=self.timeout)
     result = opener.open(u_url)
     if result.status_code != 200 or not result.content:
         self.log.warn('fetch webpage failed(%d):%s.' % (result.status_code, u_url))
         return ''
         
     if self.feed_encoding:
         try:
             content = result.content.decode(self.feed_encoding)
         except UnicodeDecodeError:
             content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)
     else:
         content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)
     return content
Example #42
    def fetcharticle(self, url, decoder):
        #fetch one article synchronously
        if self.fulltext_by_instapaper and not self.fulltext_by_readability:
            url = "http://www.instapaper.com/m?u=%s" % self.url_unescape(url)

        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        status_code, content = result.status_code, result.content
        if status_code != 200 or not content:
            self.log.warn('fetch article failed(%d):%s.' % (status_code, url))
            return None

        if self.page_encoding:
            return content.decode(self.page_encoding)
        else:
            return decoder.decode(content, url)
Example #43
 def fetcharticle(self, url, decoder):
     #fetch one article synchronously
     if self.fulltext_by_instapaper and not self.fulltext_by_readability:
         url = "http://www.instapaper.com/m?u=%s" % self.url_unescape(url)
     
     opener = URLOpener(self.host, timeout=self.timeout)
     result = opener.open(url)
     status_code, content = result.status_code, result.content
     if status_code != 200 or not content:
         self.log.warn('fetch article failed(%d):%s.' % (status_code,url))
         return None
     
     if self.page_encoding:
         return content.decode(self.page_encoding)
     else:
         return decoder.decode(content,url)
Example #44
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        url = self.feeds[0][1]
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch webpage failed(%d):%s.' %
                          (result.status_code, url))
            return []

        if self.feed_encoding:
            try:
                content = result.content.decode(self.feed_encoding)
            except UnicodeDecodeError:
                content = AutoDecoder(False).decode(result.content,
                                                    opener.realurl,
                                                    result.headers)
        else:
            content = AutoDecoder(False).decode(result.content, opener.realurl,
                                                result.headers)

        soup = BeautifulSoup(content, 'lxml')
        for article in soup.find_all('div', attrs={'class': 'post'}):
            title = article.find('a', attrs={'class': 'title'})
            if not title or not title.string.startswith(u'安邦'):
                continue

            #Get the publication date
            pubdate = article.find('span', attrs={'class': 'date'})
            if not pubdate:
                continue
            mt = re.match(ur'(\d{4})年(\d{1,2})月(\d{1,2})日', pubdate.string)
            if not mt:
                continue
            pubdate = datetime.datetime(int(mt.group(1)), int(mt.group(2)),
                                        int(mt.group(3)))

            #Decide whether to push this article; the timezone is fixed to Beijing time (UTC+8)
            tnow = datetime.datetime.utcnow() + datetime.timedelta(hours=8)
            delta = tnow - pubdate
            if self.oldest_article > 0 and delta.days > self.oldest_article:
                continue

            urls.append((u'安邦咨询', title.string, title['href'], None))

        return urls
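A standalone restatement of the freshness check used in this parser: parse the 'YYYY年M月D日' date and compare it against current Beijing time. The function name and the Python 3 style raw-string pattern are illustrative; the logic mirrors the loop above:

import datetime
import re

DATE_RE = re.compile(r'(\d{4})年(\d{1,2})月(\d{1,2})日')

def is_recent(date_text, oldest_article):
    #Return True if the article is still inside the push window,
    #comparing against Beijing time (UTC+8) as the parser above does.
    mt = DATE_RE.match(date_text)
    if not mt:
        return False
    pubdate = datetime.datetime(int(mt.group(1)), int(mt.group(2)), int(mt.group(3)))
    tnow = datetime.datetime.utcnow() + datetime.timedelta(hours=8)
    delta = tnow - pubdate
    return not (oldest_article > 0 and delta.days > oldest_article)

#is_recent(u'2024年5月1日', 7) becomes False once the article is more than a week old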
示例#45
0
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return imgList

        content = self.AutoDecodeContent(result.content, decoder, self.page_encoding, opener.realurl, result.headers)
        soup = BeautifulSoup(content, 'html.parser')
        sel = soup.find('select') #page selector; every page of this chapter is listed here
        if sel is None:
            self.log.warn('page select element does not exist.')
            return imgList

        ulist = sel.find_all('option')
        if not ulist:
            self.log.warn('select options do not exist.')
            return imgList

        #drop placeholder options without a page value (avoid mutating the list while iterating over it)
        ulist = [ul for ul in ulist if ul.get('value') is not None]
        listLen = len(ulist)

        firstPageTag = soup.find('img', {'oncontextmenu': 'return false'})
        firstPage = firstPageTag.get('src') if firstPageTag else None

        if firstPage is not None:
            firstPage = "https://www.cartoonmad.com/{}".format(firstPage)
            base, length, imgType = self.getImgStr(firstPage)
            for index in range(listLen):
                imgUrl = "{}{}.{}".format(base, str(index + 1).zfill(length), imgType)
                imgList.append(imgUrl)

        #If the guessed numeric sequence matches the first and last real pages, use it;
        #otherwise fall back to fetching every page individually.
        if imgList and imgList[0] == firstPage and imgList[listLen - 1] == self.getImgUrl(ulist[listLen - 1].get('value')):
            return imgList

        imgList = []
        for ul in ulist:
            imgList.append("https://www.cartoonmad.com/{}".format(self.getImgUrl(ul.get('value'))))
        return imgList
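The sequence guessing above depends on getImgStr, which is not shown; it is assumed here to split the first page's URL such as '.../001.jpg' into a prefix, the digit width, and the extension. A sketch of that assumption plus the zero-padded URL generation (split_first_page and build_page_urls are illustrative names):

import os

def split_first_page(first_page_url):
    #Assumed stand-in for getImgStr: '.../001.jpg' -> ('.../', 3, 'jpg').
    path, filename = first_page_url.rsplit('/', 1)
    name, ext = os.path.splitext(filename)
    return path + '/', len(name), ext.lstrip('.')

def build_page_urls(first_page_url, page_count):
    base, width, ext = split_first_page(first_page_url)
    return ["{}{}.{}".format(base, str(i + 1).zfill(width), ext)
            for i in range(page_count)]

#build_page_urls("https://www.cartoonmad.com/comic/xyz/001.jpg", 3)
#-> ['.../001.jpg', '.../002.jpg', '.../003.jpg']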
示例#46
0
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return imgList

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)

        #var chapterPath = "images/comic/31/61188/";
        chapterPath = re.search(
            r'(var chapterPath = ")(.*)(";var chapterPrice)', content)
        if chapterPath is None:
            self.log.warn('var chapterPath does not exist.')
            return imgList
        else:
            chapterPath = chapterPath.group(2)

        #var pageImage = "https://res.gufengmh.com/gufeng/images/";
        imgPrefix = re.search(r'(var pageImage = ")(.*)(gufeng/images/)',
                              content)
        if imgPrefix is None:
            self.log.warn('var pageImage prefix "https://res.gufengmh.com/gufeng/images/" does not exist.')
            return imgList
        else:
            imgPrefix = imgPrefix.group(2) + "/"

        #var chapterImages = ["",""];
        images = re.search(r'(var chapterImages = \[)(.*)(\];)', content)
        if images is None:
            self.log.warn('var chapterImages does not exist.')
            return imgList
        else:
            images = images.group(2).split(',')

        for img in images:
            img_url = imgPrefix + chapterPath + img.replace("\"", "")
            imgList.append(img_url)

        return imgList
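The three regular expressions above pull the chapter path, the image host, and the page list out of the page's inline JavaScript. A runnable illustration against a trimmed, made-up sample of that script (the values and the resulting URL shape are illustrative; the joining rule mirrors the code above):

import re

sample_js = ('var chapterPath = "images/comic/31/61188/";var chapterPrice = 0;'
             'var pageImage = "https://res.gufengmh.com/gufeng/images/";'
             'var chapterImages = ["001.jpg","002.jpg"];')

chapter_path = re.search(r'var chapterPath = "(.*?)";', sample_js).group(1)
img_prefix = re.search(r'var pageImage = "(.*?)gufeng/images/', sample_js).group(1)
images = re.search(r'var chapterImages = \[(.*?)\];', sample_js).group(1).split(',')

img_list = [img_prefix + chapter_path + img.strip('"') for img in images]
#-> ['https://res.gufengmh.com/images/comic/31/61188/001.jpg',
#    'https://res.gufengmh.com/images/comic/31/61188/002.jpg']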
示例#47
0
    def getChapterList(self, url):
        if url.startswith("https://m.dmzj.com"):
            return self.get_chapter_list_from_mobile_url(url)
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(addreferer=False, timeout=60)
        chapterList = []

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn("fetch comic page failed: %s" % result.status_code)
            return chapterList

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)

        comic_id_match = re.search(r'g_comic_id = "([^"]+)', content)
        if not comic_id_match:
            self.log.warn("can not find g_comic_id in: %s" % url)
            return chapterList
        comic_id = comic_id_match.group(1)

        # try get chapters from html
        soup = BeautifulSoup(content, "html.parser")
        chapter_datas = []
        for comic_classname in [
                "cartoon_online_border", "cartoon_online_border_other"
        ]:
            divs = soup.find_all("div", attrs={"class": comic_classname})
            if not divs:
                continue
            for div in divs:
                for link in div.find_all("a"):
                    chapter_datas.append({
                        "chapter_id":
                        int(
                            re.search("\/(\d+)\.shtml",
                                      link.get("href")).group(1)),
                        "chapter_title":
                        unicode(link.string),
                    })
        if chapter_datas:
            chapter_datas.sort(key=lambda d: d["chapter_id"])
            for chapter in chapter_datas:
                chapter_url = "https://m.dmzj.com/view/{comic_id}/{chapter_id}.html".format(
                    chapter_id=chapter["chapter_id"], comic_id=comic_id)
                chapterList.append((chapter["chapter_title"], chapter_url))
            return chapterList
        else:
            return self.get_chapter_list_from_api(comic_id)
示例#48
0
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        for feed in self.feeds:
            feedtitle,url = feed[0],feed[1]
            opener = URLOpener(self.host, timeout=self.timeout)
            result = opener.open(url)
            if result.status_code != 200 or not result.content:
                self.log.warn('fetch webpage failed(%d):%s.' % (result.status_code, url))
                continue

            if self.feed_encoding:
                try:
                    content = result.content.decode(self.feed_encoding)
                except UnicodeDecodeError:
                    content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)
            else:
                content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)

            soup = BeautifulSoup(content, 'lxml')
            for article in soup.find_all('div', attrs={'class':'feed_item_question'}):
                title = article.find('a', attrs={'class':'question_link'})
                if not title:
                    continue

                #Get the publication date
                pubdate = article.find('span',attrs={'class':'timestamp'})
                if not pubdate:
                    continue
                try:
                    pubdate = datetime.datetime.strptime(pubdate.string, '%Y-%m-%d')
                except Exception as e:
                    self.log.warn('parse pubdate failed for [%s] : %s'%(url,str(e)))
                    continue

                #Decide whether to push this article; the timezone is fixed to Beijing time (UTC+8)
                tnow = datetime.datetime.utcnow()+datetime.timedelta(hours=8)
                delta = tnow - pubdate
                if self.oldest_article > 0 and delta.days > self.oldest_article:
                    continue

                href = title['href'] if title['href'].startswith('http') else self.urljoin(url,title['href'])

                urls.append((feedtitle,string_of_tag(title),href,None))

        return urls
示例#49
0
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return imgList

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)

        try:
            # function base64decode(str){*};
            func = re.search(r'function\ base64decode\(str\){.*};',
                             content).group()
            func = func.split('base64decode')[1].replace('};', '}')

            # packed="*";
            packed = re.search(r'packed=".*";', content).group()
            packed = packed.split('\"')[1]
        except AttributeError:  #re.search returned None: the packed script is missing
            self.log.warn('var photosr does not exist.')
            return imgList

        # eval(function(str){*}("*").slice(4))
        lz_input = "eval(function{}(\"{}\").slice(4))".format(func, packed)
        lz_nodejs = self.get_node_online(lz_input)

        if lz_nodejs is None:
            self.log.warn('image list does not exist.')
            return imgList

        # photosr[1]="images/2019/11/08/09/19904f5d64.jpg/0";...photosr[98]="images/2019/11/08/09/22abc96bd2.jpg/0";
        images = lz_nodejs.split("\"")
        # http://res.img.220012.net/2017/08/22/13/343135d67f.jpg
        for img in images:
            if ".jpg" in img:
                img_url = self.urljoin("http://res.img.220012.net", img)
                imgList.append(img_url)

        return imgList
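Once get_node_online has evaluated the packed script, the result is a run of photosr[n]="..." assignments; the loop above splits it on double quotes and keeps the image paths. A runnable illustration with a hard-coded stand-in for that decoded output (the sample paths come from the comment above; the host join is simplified to plain string concatenation):

#Illustrative stand-in for the string returned by get_node_online() above.
decoded = ('photosr[1]="images/2019/11/08/09/19904f5d64.jpg/0";'
           'photosr[2]="images/2019/11/08/09/22abc96bd2.jpg/0";')

img_list = []
for part in decoded.split('"'):
    if '.jpg' in part or '.png' in part:
        img_list.append('http://res.img.220012.net/' + part)
#-> ['http://res.img.220012.net/images/2019/11/08/09/19904f5d64.jpg/0', ...]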
示例#50
0
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return imgList

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)
        soup = BeautifulSoup(content, 'html.parser')

        try:
            func = re.search(r'function\ base64decode\(str\){.*};',
                             content).group()
            packed = re.search(r'packed=".*";', content).group()
        except AttributeError:  #re.search returned None: the packed script is missing
            self.log.warn('var photosr does not exist in {}.'.format(url))
            return imgList

        # eval(function(str){*}("*").slice(4))
        lz_input = "{}var photosr = new Array();{}console.log(eval(base64decode(packed).slice(4)));".format(
            func, packed)
        lz_nodejs = self.get_node_online(lz_input)
        if lz_nodejs is None:
            self.log.warn('image list does not exist.')
            return imgList

        images = lz_nodejs.split("\"")
        self.log.info(images)
        for img in images:
            # photosr[1]="images/2020/05/03/17/516bbfddb4.jpg/0";...photosr[98]="images/2019/11/08/09/22abc96bd2.jpg/0";
            # http://res.img.fffimage.com/images/2020/05/03/17/516bbfddb4.jpg/0

            # photosr[1]="images/2020/04/21/09/3706a024c8.png/0";...photosr[12]="images/2020/04/21/09/3732355905.png/0";
            # http://res.img.fffimage.com/images/2020/04/21/09/3706a024c8.png/0
            if ".jpg" in img or ".png" in img:
                img_url = self.urljoin("http://res.img.fffimage.com/", img)
                imgList.append(img_url)

        return imgList
示例#51
0
    def getChapterList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        chapterList = []

        if url.startswith("https://www.gufengmh.com"):
            url = url.replace('https://www.gufengmh.com',
                              'https://m.gufengmh.com')

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return chapterList

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)

        soup = BeautifulSoup(content, 'html.parser')
        #<ul class="Drama autoHeight" data-sort="asc" id="chapter-list-1">
        soup = soup.find('ul', {
            "class": "Drama autoHeight",
            "id": "chapter-list-1"
        })
        if soup is None:
            self.log.warn('chapter-list does not exist.')
            return chapterList

        lias = soup.find_all('a')
        if not lias:  #find_all returns an empty list, never None
            self.log.warn('chapter list links do not exist.')
            return chapterList

        for index, a in enumerate(lias):
            href = self.urljoin("https://m.gufengmh.com", a.get('href', ''))
            span = a.find("span")
            if span is None:
                chapterList.append((u'第%d话' % (index + 1), href))
            else:
                chapterList.append((unicode(span.contents[0]), href))

        return chapterList
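The same extraction can be exercised offline against a trimmed, made-up fragment of the mobile chapter list (the hrefs and titles are illustrative; the selector and the URL join mirror the method above):

from bs4 import BeautifulSoup

sample_html = '''
<ul class="Drama autoHeight" data-sort="asc" id="chapter-list-1">
  <li><a href="/manhua/foo/1001.html"><span>第1话</span></a></li>
  <li><a href="/manhua/foo/1002.html"><span>第2话</span></a></li>
</ul>
'''

soup = BeautifulSoup(sample_html, 'html.parser')
ul = soup.find('ul', {'id': 'chapter-list-1'})
chapterList = [(a.span.get_text(), 'https://m.gufengmh.com' + a['href'])
               for a in ul.find_all('a')]
#-> [('第1话', 'https://m.gufengmh.com/manhua/foo/1001.html'), ...]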
示例#52
0
    def getChapterList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        chapterList = []

        url = url.replace("https://www.manhuagui.com", "https://m.manhuagui.com")
        url = url.replace("https://tw.manhuagui.com", "https://m.manhuagui.com")

        for njRetry in range(10):
            try:
                result = opener.open(url)
                if result.status_code == 200 and result.content:
                    break
            except Exception, e:
                if njRetry < 5:
                    self.log.warn('fetch comic page failed: %s, retry' % url)
                    continue
                else:
                    self.log.warn('fetch comic page failed: %s' % url)
                    return chapterList