Example #1
    def getChapterList(self, comic_id):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        chapterList = []

        getChapterListUrl = 'http://m.ac.qq.com/GetData/getChapterList?id={}'.format(comic_id)
        result = opener.open(getChapterListUrl)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % getChapterListUrl)
            return chapterList

        content = result.content
        content = self.AutoDecodeContent(content, decoder, self.page_encoding, opener.realurl, result.headers)

        contentJson = json.loads(content)
        count = contentJson.get('length', 0)
        if (count != 0):
            for i in range(count + 1):
                for item in contentJson:
                    if isinstance(contentJson[item], dict) and contentJson[item].get('seq') == i:
                        chapterList.append({item: contentJson[item]})
                        break
        else:
            self.log.warn('comic count is zero.')

        return chapterList
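The nested loops above effectively order the chapter entries of the JSON response by their 'seq' field. A minimal standalone sketch of that ordering step, using a made-up content_json dict shaped like the getChapterList response rather than real API data:

import json

# Hypothetical response shape: numeric-string keys map to chapter dicts that carry a 'seq' field.
content_json = json.loads('''{
    "length": 2,
    "1001": {"seq": 2, "t": "Chapter 2"},
    "1000": {"seq": 1, "t": "Chapter 1"}
}''')

# Collect every dict entry that has a 'seq' key, then sort once instead of scanning per index.
chapter_items = [(k, v) for k, v in content_json.items()
                 if isinstance(v, dict) and 'seq' in v]
chapter_items.sort(key=lambda kv: kv[1]['seq'])
chapter_list = [{k: v} for k, v in chapter_items]
print(chapter_list)  # chapters ordered by seq: "1000" first, then "1001"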
Example #2
    def getChapterList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host)
        chapterList = []

        url = url.replace("http://www.dm5.com", "https://www.manhuaren.com")

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn(
                "fetch comic page failed: {} (status code {}, content {})".format(
                    url, result.status_code, result.content
                )
            )
            return chapterList

        content = self.AutoDecodeContent(
            result.content, decoder, self.feed_encoding, opener.realurl, result.headers
        )

        soup = BeautifulSoup(content, "html.parser")

        chapter_datas = []
        for link in soup.find_all("a", {"class": "chapteritem"}):
            chapter_datas.append(
                {
                    "chapter_id": int(re.search("m(\d+)", link.get("href")).group(1)),
                    "chapter_title": unicode(link.string),
                }
            )
        chapter_datas.sort(key=lambda d: d["chapter_id"])
        for chapter in chapter_datas:
            chapter_url = "http://www.manhuaren.com/m{}/".format(chapter["chapter_id"])
            chapterList.append((chapter["chapter_title"], chapter_url))
        return chapterList
Example #3
    def getImgList(self, chapterJson, comic_id):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        cid = list(chapterJson.keys())[0]
        getImgListUrl = 'http://ac.qq.com/ComicView/index/id/{0}/cid/{1}'.format(comic_id, cid)
        result = opener.open(getImgListUrl)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % getImgListUrl)
            return imgList

        content = result.content
        cid_page = self.AutoDecodeContent(content, decoder, self.page_encoding, opener.realurl, result.headers)
        filter_result = re.findall(r"data\s*:\s*'(.+?)'", cid_page)
        if len(filter_result) != 0:
            base64data = filter_result[0][1:]
            img_detail_json = json.loads(base64.decodestring(base64data))
            for img_url in img_detail_json.get('picture', []):
                if ( 'url' in img_url ):
                    imgList.append(img_url['url'])
                else:
                    self.log.warn('no url in img_url:%s' % img_url)

        return imgList
Example #4
 def ParseFeedUrls(self):
     """ return list like [(section,title,url,desc),..] """
     urls = []
     for feed in self.feeds:
         section, url = feed[0], feed[1]
         isfulltext = feed[2] if len(feed) > 2 else False
         timeout = self.timeout+10 if isfulltext else self.timeout
         opener = URLOpener(self.host, timeout=timeout)
         result = opener.open(url)
         if result.status_code == 200 and result.content:
             if self.feed_encoding:
                 feed = feedparser.parse(result.content.decode(self.feed_encoding))
             else:
                 feed = feedparser.parse(AutoDecoder().decode(result.content))
             
             urladded = set() # guard against duplicate articles from some RSS feeds
             for e in feed['entries'][:self.max_articles_per_feed]:
                 #support HTTPS
                 urlfeed = e.link.replace('http://','https://') if url.startswith('https://') else e.link
                 if urlfeed not in urladded:
                     desc = None
                     if isfulltext:
                         if hasattr(e, 'content') and e.content[0].value:
                             desc = e.content[0].value
                         elif hasattr(e, 'summary'):
                             desc = e.summary
                         else:
                         self.log.warn('feed item invalid, link to webpage for article. (%s)' % e.title)
                     urls.append((section, e.title, urlfeed, desc))
                     urladded.add(urlfeed)
         else:
             self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url))
     return urls
Example #5
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        url = r'http://cctv.cntv.cn/lm/jiaodianfangtan/index.shtml'
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch webpage failed(%d):%s.' % (result.status_code, url))
            return []
            
        if self.feed_encoding:
            try:
                content = result.content.decode(self.feed_encoding)
            except UnicodeDecodeError:
                content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)
        else:
            content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)

        soup = BeautifulSoup(content, 'lxml')
        file_name_search=re.compile(r'\d{4}/\d{2}/\d{2}').search
        tnow = datetime.datetime.utcnow()+datetime.timedelta(hours=8)
        for li in soup.find_all('div', attrs={'class':'text'}):
            a=li.find('a')
            href = a['href']
            try:
                pubdate = datetime.datetime.strptime(file_name_search(href).group(0), '%Y/%m/%d')
            except Exception as e:
                continue
            delta = tnow - pubdate
            if self.oldest_article > 0 and delta.days > self.oldest_article:
                continue
            urls.append((u'焦点访谈',a.string,href,None))
        return urls
        
Example #6
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(url)

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn(
                "fetch comic page failed: {} (status code {}, content {})".format(
                    url, result.status_code, result.content
                )
            )
            return []

        content = self.AutoDecodeContent(
            result.content, decoder, self.feed_encoding, opener.realurl, result.headers
        )
        soup = BeautifulSoup(content, "html.parser")
        scripts = soup.findAll("script", {"type": "text/javascript"})
        packed_js = None
        for script in scripts:
            if "newImgs" in script.text:
                packed_js = script.text
                break
        if not packed_js:
            self.log.warn("Can't find js")
            return []
        codes = decode_packed_codes(packed_js)
        return re.findall("'(.+?)'", codes)
Example #7
    def getChapterList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        chapterList = []

        if url.startswith( "https://m.733.so" ):
            url = url.replace('https://m.733.so', 'https://www.733.so')

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return chapterList

        content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)

        soup = BeautifulSoup(content, 'html.parser')
        soup = soup.find('div', {"class": "cy_plist"})
        if soup is None:
            self.log.warn('cy_plist does not exist.')
            return chapterList

        lias = soup.findAll('a')
        if not lias:
            self.log.warn('chapterList href does not exist.')
            return chapterList

        for a in reversed(lias):
            href = "https://www.733.so" + a.get("href")
            chapterList.append((a.get_text(), href))

        return chapterList
Example #8
def BaiduPanHandler(url):
    import json
    o = urlparse.urlparse(url)
    if not o[1] or not o[1].endswith(('pan.baidu.com','yun.baidu.com')):
        return None
    
    #For simplicity, use a community-built site here to resolve the real download link
    #To drop this dependency later, the code from
    #https://github.com/banbanchs/pan-baidu-download
    #and https://github.com/xuanqinanhai/bleed-baidu-white
    #could be integrated directly
    url = 'http://daimajia.duapp.com/baidu/?url=%s' % url
    opener = URLOpener()
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        return None
    linkinfo = json.loads(result.content.decode('utf-8'))
    filename = linkinfo.get('name')
    if '\u' in filename:
        try:
            filename = filename.decode('unicode-escape')
        except:
            pass
    link = linkinfo.get('download')
    
    return (filename,link) if link else None
Example #9
File: Xueqiu.py  Project: Iam42/KindleEar
    def postprocess(self, content):
        pn = re.compile(ur'<a href="(\S*?)">本话题在雪球有.*?条讨论,点击查看。</a>', re.I)
        mt = pn.search(content)
        url = mt.group(1) if mt else None
        if not url:
            return content

        opener = URLOpener(url, timeout=self.timeout)
        result = opener.open(url)
        if result.status_code != 200 or not result.content or not self.feed_encoding:
            return content

        try:
            comment = result.content.decode(self.feed_encoding)
        except UnicodeDecodeError:
            return content

        pn = re.compile(r'SNB.data.goodComments\ =\ ({.*?});', re.S | re.I)
        mt = pn.search(comment)
        if not mt:
            return content

        j = json.loads(mt.group(1))
        soup = BeautifulSoup(content, "lxml")
        for c in j['comments']:
            u = c['user']['screen_name']
            t = BeautifulSoup('<p>@%s:%s</p>' % (u, c['text']), "lxml")
            for img in t.find_all('img', alt=True):
                img.replace_with(t.new_string(img['alt']))
            soup.html.body.append(t.p)

        return unicode(soup)
Example #10
    def get_chapter_list_from_mobile_url(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(addreferer=False, timeout=60)

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn("fetch comic page failed: %s" % result.status_code)
            return []

        content = self.AutoDecodeContent(
            result.content, decoder, self.feed_encoding, opener.realurl, result.headers
        )

        if "obj_id" not in content:
            self.log.warn(u"Can't find obj_id form {}".format(url))
            return []

        comic_id = re.search(r'obj_id = "(\d+)"', content).group(1)
        data_match = re.search(r"initIntroData\(([^;]+)\);", content)
        if not data_match:
            return self.get_chapter_list_from_api(comic_id)
        datas = json.loads(data_match.group(1))
        chapter_datas = []
        for data in datas:
            chapter_datas += data["data"]
        if not chapter_datas:
            return self.get_chapter_list_from_api(comic_id)
        chapter_datas.sort(key=lambda d: d["id"])
        chapters = []
        for chapter in chapter_datas:
            chapter_url = "https://m.dmzj.com/view/{comic_id}/{chapter_id}.html".format(
                chapter_id=chapter["id"], comic_id=comic_id
            )
            chapters.append((chapter["chapter_name"], chapter_url))
        return chapters
Example #11
    def get_chapter_list_from_api(self, comic_id):
        opener = URLOpener(addreferer=False, timeout=60)
        json_result = opener.open(
            "http://v3api.dmzj.com/comic/{comic_id}.json".format(comic_id=comic_id)
        )

        if json_result.status_code != 200 or not json_result.content:
            self.log.info(
                "fetch v3 chapter list failed: %s, try v2" % json_result.status_code
            )
            json_result = opener.open(
                "http://v2.api.dmzj.com/comic/{comic_id}.json?channel=Android&version=2.6.004".format(
                    comic_id=comic_id
                )
            )
            if json_result.status_code != 200 or not json_result.content:
                self.log.warn(
                    "fetch v2 chapter list failed: %s" % json_result.status_code
                )
                return []

        data = json.loads(json_result.content)
        chapter_datas = []
        for chapters_data in data["chapters"]:
            chapter_datas += chapters_data["data"]
        chapter_datas.sort(key=lambda d: d["chapter_id"])
        chapters = []
        for chapter in chapter_datas:
            chapter_url = "https://m.dmzj.com/view/{comic_id}/{chapter_id}.html".format(
                chapter_id=chapter["chapter_id"], comic_id=comic_id
            )
            chapters.append((chapter["chapter_title"], chapter_url))
        return chapters
Example #12
 def ParseFeedUrls(self):
     """ return list like [(section,title,url),..] """
     urls = []
     for feed in self.feeds:
         section, url = feed[0], feed[1]
         isfulltext = feed[2] if len(feed) > 2 else False
         timeout = CONNECTION_TIMEOUT+15 if isfulltext else CONNECTION_TIMEOUT
         opener = URLOpener(self.host, timeout=timeout)
         result = opener.open(url)
         if result.status_code == 200 and result.content:
             if self.feed_encoding:
                 feed = feedparser.parse(result.content.decode(self.feed_encoding))
             else:
                 feed = feedparser.parse(AutoDecoder().decode(result.content))
             
             urladded = [] # guard against duplicate articles from some RSS feeds
             for e in feed['entries'][:self.max_articles_per_feed]:
                 url = e.link
                 if url not in urladded:
                     if isfulltext:
                         desc = e.content[0].value if hasattr(e, 'content') and e.content[0].value else e.summary
                         urls.append((section, e.title, url, desc if desc else u'Has no summary, is it fulltext feed?'))
                     else:
                         urls.append((section, e.title, url, None))
                     urladded.append(url)
         else:
             self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url))
     return urls
Example #13
 def ParseFeedUrls(self):
     """ return list like [(section,title,url,desc),..] """
     urls = []
     url = r'http://cctv.cntv.cn/lm/xinwenyijiayi/video/index.shtml'
     opener = URLOpener(self.host, timeout=self.timeout)
     result = opener.open(url)
     if result.status_code != 200 or not result.content:
         self.log.warn('fetch webpage failed(%d):%s.' % (result.status_code, url))
         return []
         
     if self.feed_encoding:
         try:
             content = result.content.decode(self.feed_encoding)
         except UnicodeDecodeError:
             content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)
     else:
         content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)
     
     list_pattern=re.compile(r'{\'title\':\'.*?\'<!--VIDEOSTR-->\'}', re.S)
     file_name_search=re.compile(r'\d{4}/\d{2}/\d{2}').search
     l=re.findall(list_pattern,content)
     tnow = datetime.datetime.utcnow()+datetime.timedelta(hours=8)
     for i in l[:5]:
         item=eval(i)
         try:
             pubdate = datetime.datetime.strptime(file_name_search(item["link_add"]).group(0), '%Y/%m/%d')
         except Exception as e:
             continue
         delta = tnow - pubdate
         if self.oldest_article > 0 and delta.days > self.oldest_article:
             continue
         urls.append((u'新闻1+1',item['title'],item['link_add'],None))
     return urls
     
Example #14
    def getChapterList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        chapterList = []

        if url.startswith( "http://" ):
            url = url.replace('http://', 'https://')

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return chapterList

        content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)

        soup = BeautifulSoup(content, 'html.parser')
        allComicTable = soup.find_all('table', {'width': '800', 'align': 'center'})

        if not allComicTable:
            self.log.warn('allComicTable does not exist.')
            return chapterList

        for comicTable in allComicTable:
            comicVolumes = comicTable.find_all('a', {'target': '_blank'})
            if not comicVolumes:
                self.log.warn('comicVolumes does not exist.')
                return chapterList

            for volume in comicVolumes:
                href = self.urljoin(self.host, volume.get("href"))
                chapterList.append((unicode(volume.string), href))

        return chapterList
Example #15
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        url = r'http://opinion.people.com.cn/GB/40604/index.html'
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch webpage failed(%d):%s.' % (result.status_code, url))
            return []
            
        if self.feed_encoding:
            try:
                content = result.content.decode(self.feed_encoding)
            except UnicodeDecodeError:
                content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)
        else:
            content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)
            
        soup = BeautifulSoup(content, 'lxml')

        box=soup.find('div', attrs={'class':'p2j_list'})
        for li in box.find_all('li'):
            a=li.find('a')
            # print a['href'],a.string
            title = a.string
            if u'人民日报' in title:
                urls.append((u'人民日报',a.string,r'%s%s'%(r'http://opinion.people.com.cn',a['href']),None))
            else:
                urls.append((u'人民网',a.string,r'%s%s'%(r'http://opinion.people.com.cn',a['href']),None))
        return urls
Example #16
 def SaveToInstapaper(self, user, action, orgUrl):
     web.header('Content-type', "text/html; charset=utf-8")
     
     T_INFO = u"""<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
         <title>%s</title></head><body><p style="text-align:center;font-size:1.5em;">%s</p></body></html>"""
     if not user.instapaper_username or not user.instapaper_password:
         info = T_INFO % ('No authorization info', 'Instapaper username and password have to be provided first!<br/>Please fill them in your KindleEar application.')
         return info.encode('utf-8')
     
     title = web.input().get('t', '')
     name = web.input().get("n", '')
     if user.instapaper_username != name:
         info = T_INFO % ('Action rejected', 'Username does not match!<br/>KindleEar refuses to execute your command.')
         return info.encode('utf-8')
         
     opener = URLOpener()
     password = ke_decrypt(user.instapaper_password, user.secret_key or '')
     apiParameters = {'username': user.instapaper_username, 'password':password, 'title':title.encode('utf-8'), 
                     'selection':'KindleEar', 'url':orgUrl}
     ret = opener.open(INSTAPAPER_API_ADD_URL, data=apiParameters)
     if ret.status_code in (200, 201):
         info = _("'%s'<br/><br/>Saved to your Instapaper account.") % title
         info += '<br/><p style="text-align:right;color:red;">by KindleEar &nbsp;</p>'
         info = T_INFO % ('Saved to Instapaper', info)
     elif ret.status_code == 403:
         info = _("Failed save to Instapaper<br/>'%s'<br/><br/>Reason : Invalid username or password.") % title
         info += '<br/><p style="text-align:right;color:red;">by KindleEar &nbsp;</p>'
         info = T_INFO % ('Failed to save', info)
     else:
         info = _("Failed save to Instapaper<br/>'%s'<br/><br/>Reason : Unknown(%d).") % (title, ret.status_code)
         info += '<br/><p style="text-align:right;color:red;">by KindleEar &nbsp;</p>'
         info = T_INFO % ('Failed to save', info)
     
     return info.encode('utf-8')
     
Example #17
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return imgList

        content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
        soup = BeautifulSoup(content, 'html.parser')
        scripts = soup.findAll("script", {"type": "text/javascript"})
        raw_content = None
        for script in scripts:
            if script.text != "":
                raw_content = script.text
                break
        if raw_content is None:
            self.log.warn("Can't find the inline javascript in the page.")
            return imgList

        res = re.search(r'window\["\\x65\\x76\\x61\\x6c"\](.*\))', raw_content).group(1)
        lz_encoded = re.search(r"'([A-Za-z0-9+/=]+)'\['\\x73\\x70\\x6c\\x69\\x63'\]\('\\x7c'\)", res).group(1)
        lz_decoded = decompressFromBase64(lz_encoded)
        res = re.sub(r"'([A-Za-z0-9+/=]+)'\['\\x73\\x70\\x6c\\x69\\x63'\]\('\\x7c'\)", "'%s'.split('|')"%(lz_decoded), res)
        codes = self.get_node_online(res)
        pages_opts = json.loads(re.search(r'^SMH.reader\((.*)\)\.preInit\(\);$', codes).group(1))

        cid = self.getChapterId(url)
        md5 = pages_opts["sl"]["md5"]
        images = pages_opts["images"]
        for img in images:
            img_url = u'https://i.hamreus.com{}?cid={}&md5={}'.format(img, cid, md5)
            imgList.append(img_url)

        return imgList
Example #18
File: Adv.py  Project: Geekerui/KindleEar
 def POST(self, verType):
     INSTAPAPER_API_AUTH_URL = "https://www.instapaper.com/api/authenticate"
     web.header('Content-Type', 'application/json')
     
     respDict = {'status':'ok', 'correct':0}
     if verType.lower() != 'instapaper':
         respDict['status'] = _('Request type[%s] unsupported') % verType
         return json.dumps(respDict)
     
     user = self.getcurrentuser()
     
     username = web.input().get('username', '')
     password = web.input().get('password', '')
     opener = URLOpener()
     apiParameters = {'username': username, 'password':password}
     ret = opener.open(INSTAPAPER_API_AUTH_URL, data=apiParameters)
     if ret.status_code in (200, 201):
         respDict['correct'] = 1
     elif ret.status_code == 403:
         respDict['correct'] = 0
     else:
         respDict['status'] = _("The Instapaper service encountered an error. Please try again later.")
     
     return json.dumps(respDict)
     
Example #19
 def fetcharticle(self, url, decoder):
     opener = URLOpener(self.host, timeout=self.timeout)
     result = opener.open(url)
     status_code, content = result.status_code, result.content
     if status_code != 200 or not content:
         self.log.warn('fetch article failed(%d):%s.' % (status_code,url))
         return None
     
     if self.page_encoding:
         content = content.decode(self.page_encoding)
     else:
         content = decoder.decode(content,url)
     
     m = re.search(r'<iframe.*?src="(.*?)".*?>', content)
     if m:
         newurl = m.group(1)
         result = opener.open(newurl)
         status_code, content = result.status_code, result.content
         if status_code != 200 or not content:
             self.log.warn('fetch article failed(%d):%s.' % (status_code,newurl))
             return None
         
         if self.page_encoding:
             content = content.decode(self.page_encoding)
         else:
             content = decoder.decode(content,newurl)
     
     return content
Example #20
    def getChapterList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        chapterList = []

        if url.startswith( "https://www.manhuagui.com" ):
            url = url.replace('https://www.manhuagui.com', 'https://m.manhuagui.com')

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return chapterList

        content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)

        soup = BeautifulSoup(content, 'html.parser')
        invisible_input = soup.find("input", {"id":'__VIEWSTATE'})
        if invisible_input:
            lz_encoded=invisible_input.get("value")
            lz_decoded = decompressFromBase64(lz_encoded)
            soup = BeautifulSoup(lz_decoded, 'html.parser')
        else:
            soup = soup.find("div", {"class": 'chapter-list', "id": 'chapterList'})
            if soup is None:
                self.log.warn('chapterList does not exist.')
                return chapterList

        lias = soup.findAll('a')
        for a in reversed(lias):
            chapterList.append("https://m.manhuagui.com" + a.get("href"))

        return chapterList
Example #21
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        urladded = set()
        url = self.url4forwarder(self.feeds[0][1])
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        if result.status_code == 200 and result.content:
            feed = json.loads(result.content.decode(self.feed_encoding))
            
            for partition,section in self.partitions:
                for item in feed[partition]:
                    urlfeed = item['share_url']
                    if urlfeed in urladded:
                        self.log.info('duplicated, skipped %s' % urlfeed)
                        continue
                        
                    urls.append((section, item['title'], self.url4forwarder(urlfeed), None))
                    urladded.add(urlfeed)
        else:
            self.log.warn('fetch rss failed(%s):%s' % (URLOpener.CodeMap(result.status_code), url))
        return urls

    #def fetcharticle(self, url, opener, decoder):
    #    result = opener.open(self.url4forwarder(url))
    #    status_code, content = result.status_code, result.content
    #    if status_code != 200 or not content:
    #        self.log.warn('fetch article failed(%s):%s.' % (URLOpener.CodeMap(status_code),url))
    #        return None
    #    
    #    if self.page_encoding:
    #        return content.decode(self.page_encoding)
    #    else:
    #        return decoder.decode(content,url,result.headers)
            
Example #22
    def getImgUrlList(self, url):
        imgUrlList = []
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return None

        content = self.AutoDecodeContent(result.content, decoder, self.page_encoding, opener.realurl, result.headers)
        soup = BeautifulSoup(content, 'html.parser')

        sel = soup.find('select') # the page-number <select>; extract every page from it
        if (sel is None):
            self.log.warn('soup select is not exist.')
            return None

        ulist = sel.find_all('option') if sel else None
        if not ulist:
            self.log.warn('select option is not exist.')
            return None

        for ul in ulist:
            if ul.get('value') is None:
                continue
            href = self.host + '/comic/' + ul.get('value')
            imgUrlList.append(href)

        return imgUrlList
Example #23
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        tnow = datetime.utcnow()
        urladded = set()
        for feed in self.feeds:
            section, url = feed[0], feed[1]
            partition = feed[2]
            timeout = 30
            opener = URLOpener(self.host, timeout=timeout)
            result = opener.open(url)
            if result.status_code == 200 and result.content:
                feed = json.loads(result.content.decode(self.feed_encoding))

                for item in feed[partition]:
                    for e in item['items']:
                        #support HTTPS
                        urlfeed = e['share_url'].replace('http://','https://') if url.startswith('https://') else e['share_url']
                        if urlfeed in urladded:
                            self.log.warn('skipped %s' % urlfeed)
                            continue
                            
                        desc = None
                        urls.append((section, e['title'], urlfeed, desc))
                        urladded.add(urlfeed)
            else:
                self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url))
        #self.log.warn('%s' % json.dumps(urls))
        return urls
Example #24
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        tnow = datetime.datetime.utcnow()
        urladded = set()

        for feed in self.feeds:
            section, url = feed[0], feed[1].replace('gzh', 'gzhjs')
            isfulltext = feed[2] if len(feed) > 2 else False
            timeout = self.timeout+10 if isfulltext else self.timeout
            opener = URLOpener(self.host, timeout=timeout)
            result = opener.open(url)
            if result.status_code == 200 and result.content:
                if self.feed_encoding:
                    try:
                        content = result.content.decode(self.feed_encoding)
                    except UnicodeDecodeError:
                        content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers)
                else:
                    content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers)
                content = content[content.index('{'):content.index('}')+1]
                content = json.loads(content)

                for e in content['items'][:self.max_articles_per_feed]:
                    e = feedparser.parse(e)['entries'][0]
                    updated = None
                    if hasattr(e, 'lastmodified') and e.lastmodified:
                        updated = float(e.lastmodified)

                    if self.oldest_article > 0 and updated:
                        updated = datetime.datetime.utcfromtimestamp(updated)
                        delta = tnow - updated
                        if self.oldest_article > 365:
                            threshold = self.oldest_article #interpreted as seconds
                        else:
                            threshold = 86400*self.oldest_article #interpreted as days

                        if delta.days*86400+delta.seconds > threshold:
                            self.log.info("Skip old article(%s): %s" % (updated.strftime('%Y-%m-%d %H:%M:%S'),e.href))
                            continue

                    #support HTTPS
                    if hasattr(e, 'href'):
                        if url.startswith('https://'):
                            urlfeed = e.href.replace('http://','https://')
                        else:
                            urlfeed = e.href

                        if urlfeed in urladded:
                            continue
                    else:
                        urlfeed = ''

                    desc = None
                    urls.append((section, e.title, urlfeed, desc))
                    urladded.add(urlfeed)
            else:
                self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url))

        return urls
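For reference, the age filter above reads oldest_article as a number of seconds when it is greater than 365 and as a number of days otherwise. A small self-contained sketch of the same arithmetic; the function and variable names here are illustrative and not part of KindleEar:

import datetime

def is_too_old(updated, oldest_article, now=None):
    # Mirrors the rule above: values > 365 are treated as seconds, smaller values as days.
    now = now or datetime.datetime.utcnow()
    threshold = oldest_article if oldest_article > 365 else 86400 * oldest_article
    delta = now - updated
    return delta.days * 86400 + delta.seconds > threshold

now = datetime.datetime.utcnow()
print(is_too_old(now - datetime.timedelta(days=3), 2, now))      # True: 3 days old, limit 2 days
print(is_too_old(now - datetime.timedelta(hours=1), 7200, now))  # False: 1 hour old, limit 7200 seconds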
Example #25
 def ParseFeedUrls(self):
     """ return list like [(section,title,url,desc),..] """
     urls = []
     tnow = datetime.datetime.utcnow()
     urladded = set()
     
     for feed in self.feeds:
         section, url = feed[0], feed[1]
         isfulltext = feed[2] if len(feed) > 2 else False
         timeout = self.timeout+10 if isfulltext else self.timeout
         opener = URLOpener(self.host, timeout=timeout)
         result = opener.open(url)
         if result.status_code == 200 and result.content:
             if self.feed_encoding:
                 try:
                     content = result.content.decode(self.feed_encoding)
                 except UnicodeDecodeError:
                     content = AutoDecoder(True).decode(result.content,url)
             else:
                 content = AutoDecoder(True).decode(result.content,url)
             feed = feedparser.parse(content)
             
             for e in feed['entries'][:self.max_articles_per_feed]:
                 updated = None
                 if hasattr(e, 'updated_parsed') and e.updated_parsed:
                     updated = e.updated_parsed
                 elif hasattr(e, 'published_parsed') and e.published_parsed:
                     updated = e.published_parsed
                 elif hasattr(e, 'created_parsed'):
                     updated = e.created_parsed
                     
                 if self.oldest_article > 0 and updated:
                     delta = tnow - datetime.datetime(*(updated[0:6]))
                     if delta.days*86400+delta.seconds > 86400*self.oldest_article:
                         self.log.info("Skip old article: %s" % e.link)
                         continue
                 
                 #support HTTPS
                 urlfeed = e.link.replace('http://','https://') if url.startswith('https://') else e.link
                 if urlfeed in urladded:
                     continue
                     
                 desc = None
                 if isfulltext:
                     if hasattr(e, 'description'):
                         desc = e.description
                     elif hasattr(e, 'content') and e.content[0]['value']:
                         desc = e.content[0]['value']
                     else:
                         self.log.warn('fulltext feed item has no desc, link to webpage for article. (%s)' % e.title)
                 urls.append((section, e.title, urlfeed, desc))
                 urladded.add(urlfeed)
         else:
             self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url))
     
     return urls
Example #26
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return imgList

        urlpaths = urlparse.urlsplit(url.lower()).path.split("/")
        if ( (u"mh" in urlpaths) and (urlpaths.index(u"mh")+2 < len(urlpaths)) ):
            tid = str(time.time()).replace(".", "1")
            if len(tid) == 12:
                tid = tid + "1"
            cid = urlpaths[urlpaths.index(u"mh")+1]
            pid = urlpaths[urlpaths.index(u"mh")+2].replace(".html", "")
        else:
            self.log.warn('Can not get cid and pid from URL: {}.'.format(url))
            return imgList

        content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)

        m = re.search(r'var qTcms_S_m_murl_e=".*";', content)
        if m is None:
            self.log.warn('var qTcms_S_m_murl_e does not exist.')
            return imgList
        res = m.group()

        list_encoded = res.split('\"')[1]
        lz_decoded = b64decode(list_encoded)
        images = lz_decoded.split("$qingtiandy$")

        if not images:
            self.log.warn('image list does not exist.')
            return imgList

        for img in images:
            if "http://www.baidu1.com/" in img:
                b64str = img.replace("http://www.baidu1.com/", "") + '|{}|{}|{}|pc'.format(tid, cid, pid)
                imgb64 = b64encode(b64str)
                img_url = u'http://img_733.234us.com/newfile.php?data={}'.format(imgb64)
            elif "http://ac.tc.qq.com/" in img:
                b64str = img + '|{}|{}|{}|pc'.format(tid, cid, pid)
                imgb64 = b64encode(b64str)
                img_url = u'http://img_733.234us.com/newfile.php?data={}'.format(imgb64)
            else:
                img_url = img

            self.log.info('The image href is: %s' % img_url)
            imgList.append(img_url)

        return imgList
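Both proxied branches above build the final image URL by base64-encoding a payload of the form '<image>|<tid>|<cid>|<pid>|pc' and handing it to newfile.php. A minimal sketch of just that wrapping step, with placeholder identifiers rather than real chapter data:

from base64 import b64encode

def wrap_img_url(img, tid, cid, pid):
    # Same payload layout as above: image reference plus tid/cid/pid and a trailing 'pc' marker.
    payload = '{}|{}|{}|{}|pc'.format(img, tid, cid, pid)
    data = b64encode(payload.encode('utf-8')).decode('ascii')
    return u'http://img_733.234us.com/newfile.php?data={}'.format(data)

print(wrap_img_url('chapter/0001.jpg', '15300000000011', '123', '456'))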
Example #27
 def fetcharticle(self, url, decoder):
     opener = URLOpener(self.host, timeout=self.timeout)
     result = opener.open(self.url4forwarder(url))
     status_code, content = result.status_code, result.content
     if status_code != 200 or not content:
         self.log.warn('fetch article failed(%d):%s.' % (status_code,url))
         return None
     
     if self.page_encoding:
         return content.decode(self.page_encoding)
     else:
         return decoder.decode(content,url)
         
Example #28
    def getImgUrl(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)

        url = self.host + "/comic/" + url
        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return None

        content = self.AutoDecodeContent(result.content, decoder, self.page_encoding, opener.realurl, result.headers)
        soup = BeautifulSoup(content, 'html.parser')
        comicImgTag = soup.find('img', {'oncontextmenu': 'return false'})
        return comicImgTag.get('src') if comicImgTag else None
Example #29
 def getPageContent(u_url):
     opener = URLOpener(self.host, timeout=self.timeout)
     result = opener.open(u_url)
     if result.status_code != 200 or not result.content:
         self.log.warn('fetch webpage failed(%d):%s.' % (result.status_code, u_url))
         return ''
         
     if self.feed_encoding:
         try:
             content = result.content.decode(self.feed_encoding)
         except UnicodeDecodeError:
             content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)
     else:
         content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)
     return content
Example #30
 def fetcharticle(self, url, decoder):
     #fetch a single article synchronously
     if self.fulltext_by_instapaper and not self.fulltext_by_readability:
         url = "http://www.instapaper.com/m?u=%s" % self.url_unescape(url)
     
     opener = URLOpener(self.host, timeout=self.timeout)
     result = opener.open(url)
     status_code, content = result.status_code, result.content
     if status_code != 200 or not content:
         self.log.warn('fetch article failed(%d):%s.' % (status_code,url))
         return None
     
     if self.page_encoding:
         return content.decode(self.page_encoding)
     else:
         return decoder.decode(content,url)