Example #1
    def fetcharticle(self, url, decoder):
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        status_code, content = result.status_code, result.content
        if status_code != 200 or not content:
            self.log.warn('fetch article failed(%d):%s.' % (status_code, url))
            return None

        if self.page_encoding:
            content = content.decode(self.page_encoding)
        else:
            content = decoder.decode(content, url)

        m = re.search(r'<iframe.*?src="(.*?)".*?>', content)
        if m:
            newurl = m.group(1)
            result = opener.open(newurl)
            status_code, content = result.status_code, result.content
            if status_code != 200 or not content:
                self.log.warn('fetch article failed(%d):%s.' %
                              (status_code, newurl))
                return None

            if self.page_encoding:
                content = content.decode(self.page_encoding)
            else:
                content = decoder.decode(content, newurl)

        return content
Example #2
    def getImgUrl(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return None

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.page_encoding, opener.realurl,
                                         result.headers)
        soup = BeautifulSoup(content, 'html.parser')

        comicImgTag = soup.find('img', {'oncontextmenu': 'return false'})
        if (comicImgTag is None):
            self.log.warn('can not find image href.')
            return None
        imgUrl = self.host + "/comic/" + comicImgTag.get('src')

        headers = {'Referer': url}
        result = opener.open(imgUrl, headers=headers)
        if result.status_code != 200 or opener.realurl == url:
            self.log.warn('can not get real comic url for : %s' % url)
            return None

        return opener.realurl
Example #3
 def fetcharticle(self, url, decoder):
     opener = URLOpener(self.host, timeout=self.timeout)
     result = opener.open(url)
     status_code, content = result.status_code, result.content
     if status_code != 200 or not content:
         self.log.warn('fetch article failed(%d):%s.' % (status_code,url))
         return None
     
     if self.page_encoding:
         content = content.decode(self.page_encoding)
     else:
         content = decoder.decode(content,url)
     
     m = re.search(r'<iframe.*?src="(.*?)".*?>', content)
     if m:
         newurl = m.group(1)
         result = opener.open(newurl)
         status_code, content = result.status_code, result.content
         if status_code != 200 or not content:
             self.log.warn('fetch article failed(%d):%s.' % (status_code,newurl))
             return None
         
         if self.page_encoding:
             content = content.decode(self.page_encoding)
         else:
             content = decoder.decode(content,newurl)
     
     return content
Example #4
    def get_chapter_list_from_api(self, comic_id):
        opener = URLOpener(addreferer=False, timeout=60)
        json_result = opener.open(
            "http://v3api.dmzj.com/comic/{comic_id}.json".format(
                comic_id=comic_id))

        if json_result.status_code != 200 or not json_result.content:
            self.log.info("fetch v3 chapter list failed: %s, try v2" %
                          json_result.status_code)
            json_result = opener.open(
                "http://v2.api.dmzj.com/comic/{comic_id}.json?channel=Android&version=2.6.004"
                .format(comic_id=comic_id))
            if json_result.status_code != 200 or not json_result.content:
                self.log.warn("fetch v2 chapter list failed: %s" %
                              json_result.status_code)
                return []

        data = json.loads(json_result.content)
        chapter_datas = []
        for chapters_data in data["chapters"]:
            chapter_datas += chapters_data["data"]
        chapter_datas.sort(key=lambda d: d["chapter_id"])
        chapters = []
        for chapter in chapter_datas:
            chapter_url = "https://m.dmzj.com/view/{comic_id}/{chapter_id}.html".format(
                chapter_id=chapter["chapter_id"], comic_id=comic_id)
            chapters.append((chapter["chapter_title"], chapter_url))
        return chapters
Example #5
    def get_chapter_list_from_api(self, comic_id):
        opener = URLOpener(addreferer=False, timeout=60)
        json_result = opener.open(
            "http://v3api.dmzj.com/comic/{comic_id}.json".format(comic_id=comic_id)
        )

        if json_result.status_code != 200 or not json_result.content:
            self.log.info(
                "fetch v3 chapter list failed: %s, try v2" % json_result.status_code
            )
            json_result = opener.open(
                "http://v2.api.dmzj.com/comic/{comic_id}.json?channel=Android&version=2.6.004".format(
                    comic_id=comic_id
                )
            )
            if json_result.status_code != 200 or not json_result.content:
                self.log.warn(
                    "fetch v2 chapter list failed: %s" % json_result.status_code
                )
                return []

        data = json.loads(json_result.content)
        chapter_datas = []
        for chapters_data in data["chapters"]:
            chapter_datas += chapters_data["data"]
        chapter_datas.sort(key=lambda d: d["chapter_id"])
        chapters = []
        for chapter in chapter_datas:
            chapter_url = "https://m.dmzj.com/view/{comic_id}/{chapter_id}.html".format(
                chapter_id=chapter["chapter_id"], comic_id=comic_id
            )
            chapters.append((chapter["chapter_title"], chapter_url))
        return chapters
Example #6
    def ParseFeedUrls(self):
        urls = []  # list of results to return

        newComicUrls = self.GetNewComic()  # returns [(title, num, url),...]
        if not newComicUrls:
            return []

        decoder = AutoDecoder(isfeed=False)
        for title, num, url in newComicUrls:
            opener = URLOpener(self.host, timeout=60)
            result = opener.open(url)
            if result.status_code != 200 or not result.content:
                self.log.warn('fetch comic page failed: %s' % url)
                continue

            content = result.content
            content = self.AutoDecodeContent(content, decoder,
                                             self.page_encoding,
                                             opener.realurl, result.headers)

            bodySoup = BeautifulSoup(content, 'lxml')
            sel = bodySoup.find('select')  # page-number select box; extract every page from it
            ul = sel.find_all('option') if sel else None
            if not ul:
                continue

            for comicPage in ul:
                href = comicPage.get('value')
                if href:
                    pageHref = self.urljoin(url, href)
                    result = opener.open(pageHref)
                    if result.status_code != 200:
                        self.log.warn('fetch comic page failed: %s' % pageHref)
                        continue

                    content = result.content
                    content = self.AutoDecodeContent(content, decoder,
                                                     self.page_encoding,
                                                     opener.realurl,
                                                     result.headers)
                    soup = BeautifulSoup(content, 'lxml')

                    comicImgTag = soup.find('img',
                                            {'oncontextmenu': 'return false'})
                    comicSrc = comicImgTag.get('src') if comicImgTag else None
                    if comicSrc:
                        urls.append((title, comicPage.text, comicSrc, None))

            self.UpdateLastDelivered(title, num)

        return urls
Example #7
    def getChapterList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        chapterList = []

        if url.startswith("https://m.733.so"):
            url = url.replace('https://m.733.so', 'https://www.733.so')

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return chapterList

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)

        soup = BeautifulSoup(content, 'html.parser')
        soup = soup.find('div', {"class": "cy_plist"})
        if (soup is None):
            self.log.warn('cy_plist is not exist.')
            return chapterList

        lias = soup.findAll('a')
        if not lias:  # findAll returns an empty list, never None
            self.log.warn('chapterList href is not exist.')
            return chapterList

        for aindex in range(len(lias)):
            rindex = len(lias) - 1 - aindex
            href = "https://www.733.so" + lias[rindex].get("href")
            chapterList.append(href)

        return chapterList
Example #8
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return imgList

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)

        match = re.search(r'var qTcms_S_m_murl_e=".*";', content)
        if match is None:  # check the match before calling group(); a failed search returns None
            self.log.warn('var qTcms_S_m_murl_e is not exist.')
            return imgList
        res = match.group()

        list_encoded = res.split('\"')[1]
        lz_decoded = b64decode(list_encoded)
        images = lz_decoded.split("$qingtiandy$")

        if not images:
            self.log.warn('image list is not exist.')
            return imgList

        for img in images:
            imgb64 = b64encode(img.replace("http://www.baidu1.com/", ""))
            img_url = u'http://new.234us.com:8989/img_new.php?data={}'.format(
                imgb64)
            imgList.append(img_url)

        return imgList
Example #9
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(url)

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn(
                "fetch comic page failed: {} (status code {}, content {})".format(
                    url, result.status_code, result.content
                )
            )
            return []

        content = self.AutoDecodeContent(
            result.content, decoder, self.feed_encoding, opener.realurl, result.headers
        )
        soup = BeautifulSoup(content, "html.parser")
        scripts = soup.findAll("script", {"type": "text/javascript"})
        packed_js = None
        for script in scripts:
            if "newImgs" in script.text:
                packed_js = script.text
                break
        if not packed_js:
            self.log.warn("Can't find js")
            return []
        codes = decode_packed_codes(packed_js)
        return re.findall("'(.+?)'", codes)
Example #10
    def getChapterList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host)
        chapterList = []

        url = url.replace("http://www.dm5.com", "https://www.manhuaren.com")

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn(
                "fetch comic page failed: {} (status code {}, content {})".format(
                    url, result.status_code, result.content
                )
            )
            return chapterList

        content = self.AutoDecodeContent(
            result.content, decoder, self.feed_encoding, opener.realurl, result.headers
        )

        soup = BeautifulSoup(content, "html.parser")

        chapter_datas = []
        for link in soup.find_all("a", {"class": "chapteritem"}):
            chapter_datas.append(
                {
                    "chapter_id": int(re.search("m(\d+)", link.get("href")).group(1)),
                    "chapter_title": unicode(link.string),
                }
            )
        chapter_datas.sort(key=lambda d: d["chapter_id"])
        for chapter in chapter_datas:
            chapter_url = "http://www.manhuaren.com/m{}/".format(chapter["chapter_id"])
            chapterList.append((chapter["chapter_title"], chapter_url))
        return chapterList
Example #11
 def ParseFeedUrls(self):
     urls = []  # empty list to hold the article tuples
     # loop over the two topic pages in self.feeds
     for feed in self.feeds:
         # unpack the topic name and link from the tuple
         topic, url = feed[0], feed[1]
         # request the topic page
         opener = URLOpener(self.host, timeout=self.timeout)
         result = opener.open(url)
         # if the request succeeded and the page is not empty
         if result.status_code == 200 and result.content:
             # parse the page content into a BeautifulSoup object
             soup = BeautifulSoup(result.content, 'lxml')
             # find every article entry in the page's article list
             items = soup.find('div', class_='grid').find_all(name='div', class_='content')
             # process each article entry
             for item in items:
                 title = item.span.string  # article title
                 link = item.a.get('href')  # article link
                 link = BaseFeedBook.urljoin(url, link)  # make the link absolute
                 if self.OutTimeRange(item):
                     continue
                 urls.append((topic, title, link, None))  # append the article tuple
         # otherwise report the failure to the log
         else:
             self.log.warn('Fetch article failed(%s):%s' % \
                 (URLOpener.CodeMap(result.status_code), url))
     # return all extracted articles
     return urls
Example #12
    def getChapterList(self, comic_id):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        chapterList = []

        getChapterListUrl = 'http://m.ac.qq.com/GetData/getChapterList?id={}'.format(comic_id)
        result = opener.open(getChapterListUrl)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % getChapterListUrl)
            return chapterList

        content = result.content
        content = self.AutoDecodeContent(content, decoder, self.page_encoding, opener.realurl, result.headers)

        contentJson = json.loads(content)
        count = contentJson.get('length', 0)
        if (count != 0):
            for i in range(count + 1):
                for item in contentJson:
                    if isinstance(contentJson[item], dict) and contentJson[item].get('seq') == i:
                        chapterList.append({item: contentJson[item]})
                        break
        else:
            self.log.warn('comic count is zero.')

        return chapterList
Example #13
    def getImgList(self, chapterJson, comic_id):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        cid = list(chapterJson.keys())[0]
        getImgListUrl = 'http://ac.qq.com/ComicView/index/id/{0}/cid/{1}'.format(comic_id, cid)
        result = opener.open(getImgListUrl)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % getImgListUrl)
            return imgList

        content = result.content
        cid_page = self.AutoDecodeContent(content, decoder, self.page_encoding, opener.realurl, result.headers)
        filter_result = re.findall(r"data\s*:\s*'(.+?)'", cid_page)
        if len(filter_result) != 0:
            base64data = filter_result[0][1:]
            img_detail_json = json.loads(base64.decodestring(base64data))
            for img_url in img_detail_json.get('picture', []):
                if ( 'url' in img_url ):
                    imgList.append(img_url['url'])
                else:
                    self.log.warn('no url in img_url:%s' % img_url)

        return imgList
Example #14
    def getChapterList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        chapterList = []

        if url.startswith("http://"):
            url = url.replace('http://', 'https://')

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return chapterList

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)

        soup = BeautifulSoup(content, 'html.parser')
        allComicTable = soup.find_all('table', {
            'width': '800',
            'align': 'center'
        })
        for comicTable in allComicTable:
            comicVolumes = comicTable.find_all('a', {'target': '_blank'})
            for volume in comicVolumes:
                href = self.urljoin(self.host, volume.get('href'))
                chapterList.append(href)

        return chapterList
Example #15
 def ParseFeedUrls(self):
     """ return list like [(section,title,url,desc),..] """
     urls = []
     for feed in self.feeds:
         section, url = feed[0], feed[1]
         isfulltext = feed[2] if len(feed) > 2 else False
         timeout = self.timeout+10 if isfulltext else self.timeout
         opener = URLOpener(self.host, timeout=timeout)
         result = opener.open(url)
         if result.status_code == 200 and result.content:
             if self.feed_encoding:
                 feed = feedparser.parse(result.content.decode(self.feed_encoding))
             else:
                 feed = feedparser.parse(AutoDecoder().decode(result.content))
             
             urladded = set()  # guard against duplicate articles from some RSS feeds
             for e in feed['entries'][:self.max_articles_per_feed]:
                 # support HTTPS
                 urlfeed = e.link.replace('http://','https://') if url.startswith('https://') else e.link
                 if urlfeed not in urladded:
                     desc = None
                     if isfulltext:
                         if hasattr(e, 'content') and e.content[0].value:
                             desc = e.content[0].value
                         elif hasattr(e, 'summary'):
                             desc = e.summary
                         else:
                             self.log.warn('feed item invalid,link to webpage for article.(%s)'%e.title)
                     urls.append((section, e.title, urlfeed, desc))
                     urladded.add(urlfeed)
         else:
             self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url))
     return urls
Example #16
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        url = r'http://cctv.cntv.cn/lm/jiaodianfangtan/index.shtml'
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch webpage failed(%d):%s.' % (result.status_code, url))
            return []
            
        if self.feed_encoding:
            try:
                content = result.content.decode(self.feed_encoding)
            except UnicodeDecodeError:
                content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)
        else:
            content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)

        soup = BeautifulSoup(content, 'lxml')
        file_name_search=re.compile(r'\d{4}/\d{2}/\d{2}').search
        tnow = datetime.datetime.utcnow()+datetime.timedelta(hours=8)
        for li in soup.find_all('div', attrs={'class':'text'}):
            a=li.find('a')
            href = a['href']
            try:
                pubdate = datetime.datetime.strptime(file_name_search(href).group(0), '%Y/%m/%d')
            except Exception as e:
                continue
            delta = tnow - pubdate
            if self.oldest_article > 0 and delta.days > self.oldest_article:
                  continue
            urls.append((u'焦点访谈',a.string,href,None))
        return urls
        
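Several of these examples repeat the same decode-with-fallback idiom. A minimal helper sketch of that pattern, assuming the URLOpener result object and AutoDecoder class used throughout this page (the decode_response name is hypothetical and not part of the original sources):

def decode_response(result, opener, encoding=None):
    # Try the declared encoding first; fall back to AutoDecoder, which sniffs
    # the encoding from the response content and headers.
    if encoding:
        try:
            return result.content.decode(encoding)
        except UnicodeDecodeError:
            pass
    return AutoDecoder(False).decode(result.content, opener.realurl, result.headers)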
Example #17
    def get_chapter_list_from_mobile_url(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(addreferer=False, timeout=60)

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn("fetch comic page failed: %s" % result.status_code)
            return []

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)

        if "obj_id" not in content:
            self.log.warn(u"Can't find obj_id form {}".format(url))
            return []

        comic_id = re.search('obj_id = "(\d+)"', content).group(1)
        data_match = re.search("initIntroData\(([^;]+)\);", content)
        if not data_match:
            return self.get_chapter_list_from_api(comic_id)
        datas = json.loads(data_match.group(1))
        chapter_datas = []
        for data in datas:
            chapter_datas += data["data"]
        if not chapter_datas:
            return self.get_chapter_list_from_api(comic_id)
        chapter_datas.sort(key=lambda d: d["id"])
        chapters = []
        for chapter in chapter_datas:
            chapter_url = "https://m.dmzj.com/view/{comic_id}/{chapter_id}.html".format(
                chapter_id=chapter["id"], comic_id=comic_id)
            chapters.append((chapter["chapter_name"], chapter_url))
        return chapters
Example #18
    def POST(self):
        user = self.getcurrentuser(forAjax=True)
        web.header('Content-Type', 'application/json')
        webInput = web.input()
        category = webInput.get('category', '')
        title = webInput.get('title')
        feedUrl = webInput.get("url")
        isfulltext = bool(webInput.get('isfulltext', '').lower() == 'true')
        creator = webInput.get('creator', '')

        if not title or not feedUrl:
            return json.dumps({'status': _("Title or Url is empty!")})

        opener = URLOpener()
        srvUrl = urlparse.urljoin('http://kindleear.appspot.com/',
                                  SharedLibrarykindleearAppspotCom.__url__)
        data = {
            'category': category,
            'title': title,
            'url': feedUrl,
            'creator': creator,
            'isfulltext': 'true' if isfulltext else 'false',
            'key': 'kindleear.lucky!'
        }
        result = opener.open(srvUrl, data)
        if result.status_code == 200 and result.content:
            return result.content
        else:
            return json.dumps({
                'status':
                _('Cannot submit data to kindleear.appspot.com, status: %s' %
                  URLOpener.CodeMap(result.status_code))
            })
Example #19
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        urladded = set()
        url = self.url4forwarder(self.feeds[0][1])
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        if result.status_code == 200 and result.content:
            feed = json.loads(result.content.decode(self.feed_encoding))
            
            for partition,section in self.partitions:
                for item in feed[partition]:
                    urlfeed = item['share_url']
                    if urlfeed in urladded:
                        self.log.info('duplicated, skipped %s' % urlfeed)
                        continue
                        
                    urls.append((section, item['title'], self.url4forwarder(urlfeed), None))
                    urladded.add(urlfeed)
        else:
            self.log.warn('fetch rss failed(%d):%s'%(result.status_code, url))
        return urls

    #def fetcharticle(self, url, opener, decoder):
    #    result = opener.open(self.url4forwarder(url))
    #    status_code, content = result.status_code, result.content
    #    if status_code != 200 or not content:
    #        self.log.warn('fetch article failed(%d):%s.' % (status_code,url))
    #        return None
    #    
    #    if self.page_encoding:
    #        return content.decode(self.page_encoding)
    #    else:
    #        return decoder.decode(content,url,result.headers)
            
Example #20
 def ParseFeedUrls(self):
     """ return list like [(section,title,url),..] """
     urls = []
     for feed in self.feeds:
         section, url = feed[0], feed[1]
         isfulltext = feed[2] if len(feed) > 2 else False
         timeout = CONNECTION_TIMEOUT+15 if isfulltext else CONNECTION_TIMEOUT
         opener = URLOpener(self.host, timeout=timeout)
         result = opener.open(url)
         if result.status_code == 200 and result.content:
             if self.feed_encoding:
                 feed = feedparser.parse(result.content.decode(self.feed_encoding))
             else:
                 feed = feedparser.parse(AutoDecoder().decode(result.content))
             
             urladded = []  # guard against duplicate articles from some RSS feeds
             for e in feed['entries'][:self.max_articles_per_feed]:
                 url = e.link
                 if url not in urladded:
                     if isfulltext:
                         desc = e.content[0].value if hasattr(e, 'content') and e.content[0].value else e.summary
                         urls.append((section, e.title, url, desc if desc else u'Has no summary, is it fulltext feed?'))
                     else:
                         urls.append((section, e.title, url, None))
                     urladded.append(url)
         else:
             self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url))
     return urls
Example #21
    def getChapterList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        chapterList = []

        if url.startswith( "https://m.733.so" ):
            url = url.replace('https://m.733.so', 'https://www.733.so')

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return chapterList

        content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)

        soup = BeautifulSoup(content, 'html.parser')
        soup = soup.find('div', {"class": "cy_plist"})
        if (soup is None):
            self.log.warn('cy_plist is not exist.')
            return chapterList

        lias = soup.findAll('a')
        if not lias:  # findAll returns an empty list, never None
            self.log.warn('chapterList href is not exist.')
            return chapterList

        for aindex in range(len(lias)):
            rindex = len(lias)-1-aindex
            href = "https://www.733.so" + lias[rindex].get("href")
            chapterList.append((lias[rindex].get_text(), href))

        return chapterList
Example #22
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(url)

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn(
                "fetch comic page failed: {} (status code {}, content {})".
                format(url, result.status_code, result.content))
            return []

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)
        soup = BeautifulSoup(content, "html.parser")
        scripts = soup.findAll("script", {"type": "text/javascript"})
        packed_js = None
        for script in scripts:
            if "newImgs" in script.text:
                packed_js = script.text
                break
        if not packed_js:
            self.log.warn("Can't find js")
            return []
        codes = decode_packed_codes(packed_js)
        return re.findall("'(.+?)'", codes)
Example #23
 def SaveToInstapaper(self, user, action, orgUrl):
     web.header('Content-type', "text/html; charset=utf-8")
     
     T_INFO = u"""<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
         <title>%s</title></head><body><p style="text-align:center;font-size:1.5em;">%s</p></body></html>"""
     if not user.instapaper_username or not user.instapaper_password:
         info = T_INFO % ('No authorization info', 'Instapaper username and password have to be provided first!<br/>Please fill them in your KindleEar application.')
         return info.encode('utf-8')
     
     title = web.input().get('t', '')
     name = web.input().get("n", '')
     if user.instapaper_username != name:
         info = T_INFO % ('Action rejected', 'Username does not match!<br/>KindleEar refuses to execute your command.')
         return info.encode('utf-8')
         
     opener = URLOpener()
     password = ke_decrypt(user.instapaper_password, user.secret_key or '')
     apiParameters = {'username': user.instapaper_username, 'password':password, 'title':title.encode('utf-8'), 
                     'selection':'KindleEar', 'url':orgUrl}
     ret = opener.open(INSTAPAPER_API_ADD_URL, data=apiParameters)
     if ret.status_code in (200, 201):
         info = _("'%s'<br/><br/>Saved to your Instapaper account.") % title
         info += '<br/><p style="text-align:right;color:red;">by KindleEar &nbsp;</p>'
         info = T_INFO % ('Saved to Instapaper', info)
     elif ret.status_code == 403:
         info = _("Failed save to Instapaper<br/>'%s'<br/><br/>Reason : Invalid username or password.") % title
         info += '<br/><p style="text-align:right;color:red;">by KindleEar &nbsp;</p>'
         info = T_INFO % ('Failed to save', info)
     else:
         info = _("Failed save to Instapaper<br/>'%s'<br/><br/>Reason : Unknown(%d).") % (title, ret.status_code)
         info += '<br/><p style="text-align:right;color:red;">by KindleEar &nbsp;</p>'
         info = T_INFO % ('Failed to save', info)
     
     return info.encode('utf-8')
     
Example #24
 def ParseFeedUrls(self):
     """ return list like [(section,title,url,desc),..] """
     urls = []
     url = r'http://cctv.cntv.cn/lm/xinwenyijiayi/video/index.shtml'
     opener = URLOpener(self.host, timeout=self.timeout)
     result = opener.open(url)
     if result.status_code != 200 or not result.content:
         self.log.warn('fetch webpage failed(%d):%s.' % (result.status_code, url))
         return []
         
     if self.feed_encoding:
         try:
             content = result.content.decode(self.feed_encoding)
         except UnicodeDecodeError:
             content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)
     else:
         content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)
     
     list_pattern=re.compile(r'{\'title\':\'.*?\'<!--VIDEOSTR-->\'}', re.S)
     file_name_search=re.compile(r'\d{4}/\d{2}/\d{2}').search
     l=re.findall(list_pattern,content)
     tnow = datetime.datetime.utcnow()+datetime.timedelta(hours=8)
     for i in l[:5]:
         item=eval(i)
         try:
             pubdate = datetime.datetime.strptime(file_name_search(item["link_add"]).group(0), '%Y/%m/%d')
         except Exception as e:
             continue
         delta = tnow - pubdate
         if self.oldest_article > 0 and delta.days > self.oldest_article:
               continue
         urls.append((u'新闻1+1',item['title'],item['link_add'],None))
     return urls
     
Example #25
    def getChapterList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        chapterList = []

        url = url.replace("https://m.tohomh123.com",
                          "https://www.tohomh123.com")

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return chapterList

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)

        soup = BeautifulSoup(content, 'html.parser')
        soup = soup.find("ul", {"id": 'detail-list-select-2'})
        if not soup:
            self.log.warn('chapterList is not exist.')
            return chapterList

        lias = soup.findAll('a')
        if not lias:
            self.log.warn('chapterList href is not exist.')
            return chapterList

        for a in lias:
            href = "https://www.tohomh123.com" + a.get("href")
            chapterList.append((unicode(a.contents[0]), href))

        return chapterList
Example #26
    def postprocess(self, content):
        pn = re.compile(ur'<a href="(\S*?)">本话题在雪球有.*?条讨论,点击查看。</a>',
                        re.I)
        comment = ''  # default so a failed comment fetch doesn't raise a NameError below
        mt = pn.search(content)
        url = mt.group(1) if mt else None
        if url:
            opener = URLOpener(url, timeout=self.timeout)
            result = opener.open(url)
            if result.status_code == 200 and result.content:
                if self.feed_encoding:
                    try:
                        comment = result.content.decode(self.feed_encoding)
                    except UnicodeDecodeError:
                        return content

        pn = re.compile(r'SNB.data.goodComments\ =\ ({.*?});', re.S | re.I)
        mt = pn.search(comment)
        comment_json = mt.group(1) if mt else None
        if not comment_json:  # no comment data found; return the content unchanged
            return content

        j = json.loads(comment_json)
        soup = BeautifulSoup(content, "lxml")
        for c in j['comments']:
            u = c['user']['screen_name']
            t = BeautifulSoup('<p>@%s:%s</p>' % (u, c['text']))
            for img in t.find_all('img', alt=True):
                img.replace_with(t.new_string(img['alt']))
            soup.html.body.append(t.p)

        content = unicode(soup)
        return content
Example #27
    def getImgUrlList(self, url):
        imgUrlList = []
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return None

        content = self.AutoDecodeContent(result.content, decoder, self.page_encoding, opener.realurl, result.headers)
        soup = BeautifulSoup(content, 'html.parser')

        sel = soup.find('select')  # page-number select box; extract every page from it
        if (sel is None):
            self.log.warn('soup select is not exist.')
            return None

        ulist = sel.find_all('option') if sel else None
        if not ulist:
            self.log.warn('select option is not exist.')
            return None

        for ul in ulist:
            # skip options without a value; don't mutate the list while iterating over it
            value = ul.get('value')
            if value:
                href = self.host + '/comic/' + value
                imgUrlList.append(href)

        return imgUrlList
Example #28
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        url = r'http://opinion.people.com.cn/GB/40604/index.html'
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch webpage failed(%d):%s.' % (result.status_code, url))
            return []
            
        if self.feed_encoding:
            try:
                content = result.content.decode(self.feed_encoding)
            except UnicodeDecodeError:
                content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)
        else:
            content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)
            
        soup = BeautifulSoup(content, 'lxml')

        box=soup.find('div', attrs={'class':'p2j_list'})
        for li in box.find_all('li'):
            a=li.find('a')
            # print a['href'],a.string
            title = a.string
            if u'人民日报' in title:
                urls.append((u'人民日报',a.string,r'%s%s'%(r'http://opinion.people.com.cn',a['href']),None))
            else:
                urls.append((u'人民网',a.string,r'%s%s'%(r'http://opinion.people.com.cn',a['href']),None))
        return urls
Example #29
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        urladded = set()
        url = self.url4forwarder(
            'http://news.at.zhihu.com/api/1.1/news/latest')
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        if result.status_code == 200 and result.content:
            feed = json.loads(result.content.decode(self.feed_encoding))

            for partition in self.partitions:
                for item in feed[partition]:
                    for e in item['items']:
                        urlfeed = e['share_url']
                        if urlfeed in urladded:
                            self.log.warn('skipped %s' % urlfeed)
                            continue

                        urls.append((self.partitions[partition], e['title'],
                                     urlfeed, None))
                        urladded.add(urlfeed)
        else:
            self.log.warn('fetch rss failed(%d):%s' %
                          (result.status_code, url))
        #self.log.warn('%s' % json.dumps(urls))
        return urls
Example #30
 def SaveToInstapaper(self, user, action, orgUrl):
     web.header('Content-type', "text/html; charset=utf-8")
     
     T_INFO = u"""<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
         <title>%s</title></head><body><p style="text-align:center;font-size:1.5em;">%s</p></body></html>"""
     if not user.instapaper_username or not user.instapaper_password:
         info = T_INFO % ('No authorization info', 'Instapaper username and password have to be provided first!<br/>Please fill them in your KindleEar application.')
         return info.encode('utf-8')
     
     title = web.input().get('t', '')
     name = web.input().get("n", '')
     if user.instapaper_username != name:
         info = T_INFO % ('Action rejected', 'Username does not match!<br/>KindleEar refuses to execute your command.')
         return info.encode('utf-8')
         
     opener = URLOpener()
     password = ke_decrypt(user.instapaper_password, user.secret_key or '')
     apiParameters = {'username': user.instapaper_username, 'password':password, 'title':title.encode('utf-8'), 
                     'selection':'KindleEar', 'url':orgUrl}
     ret = opener.open(INSTAPAPER_API_ADD_URL, data=apiParameters)
     if ret.status_code in (200, 201):
         info = _("'%s'<br/><br/>Saved to your Instapaper account.") % title
         info += '<br/><p style="text-align:right;color:red;">by KindleEar &nbsp;</p>'
         info = T_INFO % ('Saved to Instapaper', info)
     elif ret.status_code == 403:
         info = _("Failed save to Instapaper<br/>'%s'<br/><br/>Reason : Invalid username or password.") % title
         info += '<br/><p style="text-align:right;color:red;">by KindleEar &nbsp;</p>'
         info = T_INFO % ('Failed to save', info)
     else:
         info = _("Failed save to Instapaper<br/>'%s'<br/><br/>Reason : Unknown(%d).") % (title, ret.status_code)
         info += '<br/><p style="text-align:right;color:red;">by KindleEar &nbsp;</p>'
         info = T_INFO % ('Failed to save', info)
     
     return info.encode('utf-8')
     
Example #31
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        urladded = set()
        url = self.url4forwarder(self.feeds[0][1])
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        if result.status_code == 200 and result.content:
            feed = json.loads(result.content.decode(self.feed_encoding))
            
            for partition,section in self.partitions:
                for item in feed[partition]:
                    urlfeed = item['share_url']
                    if urlfeed in urladded:
                        self.log.info('duplicated, skipped %s' % urlfeed)
                        continue
                        
                    urls.append((section, item['title'], self.url4forwarder(urlfeed), None))
                    urladded.add(urlfeed)
        else:
            self.log.warn('fetch rss failed(%s):%s' % (URLOpener.CodeMap(result.status_code), url))
        return urls

    #def fetcharticle(self, url, opener, decoder):
    #    result = opener.open(self.url4forwarder(url))
    #    status_code, content = result.status_code, result.content
    #    if status_code != 200 or not content:
    #        self.log.warn('fetch article failed(%s):%s.' % (URLOpener.CodeMap(status_code),url))
    #        return None
    #    
    #    if self.page_encoding:
    #        return content.decode(self.page_encoding)
    #    else:
    #        return decoder.decode(content,url,result.headers)
            
Example #32
 def POST(self, verType):
     INSTAPAPER_API_AUTH_URL = "https://www.instapaper.com/api/authenticate"
     web.header('Content-Type', 'application/json')
     
     respDict = {'status':'ok', 'correct':0}
     if verType.lower() != 'instapaper':
         respDict['status'] = _('Request type[%s] unsupported') % verType
         return json.dumps(respDict)
     
     user = self.getcurrentuser()
     
     username = web.input().get('username', '')
     password = web.input().get('password', '')
     opener = URLOpener()
     apiParameters = {'username': username, 'password':password}
     ret = opener.open(INSTAPAPER_API_AUTH_URL, data=apiParameters)
     if ret.status_code in (200, 201):
         respDict['correct'] = 1
     elif ret.status_code == 403:
         respDict['correct'] = 0
     else:
         respDict['status'] = _("The Instapaper service encountered an error. Please try again later.")
     
     return json.dumps(respDict)
     
Example #33
    def getChapterList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        chapterList = []

        if url.startswith( "https://www.manhuagui.com" ):
            url = url.replace('https://www.manhuagui.com', 'https://m.manhuagui.com')

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return chapterList

        content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)

        soup = BeautifulSoup(content, 'html.parser')
        invisible_input = soup.find("input", {"id":'__VIEWSTATE'})
        if invisible_input:
            lz_encoded=invisible_input.get("value")
            lz_decoded = decompressFromBase64(lz_encoded)
            soup = BeautifulSoup(lz_decoded, 'html.parser')
        else:
            soup = soup.find("div", {"class": 'chapter-list', "id":'chapterList'})

        lias = soup.findAll('a')
        for aindex in range(len(lias)):
            rindex = len(lias)-1-aindex
            href = "https://m.manhuagui.com" + lias[rindex].get("href")
            chapterList.append(href)

        return chapterList
Example #34
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return imgList

        content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
        soup = BeautifulSoup(content, 'html.parser')
        scripts = soup.findAll("script", {"type": "text/javascript"})
        for script in scripts:
            if script.text != "":
                raw_content = script.text
                break

        res = re.search(r'window\["\\x65\\x76\\x61\\x6c"\](.*\))', raw_content).group(1)
        lz_encoded = re.search(r"'([A-Za-z0-9+/=]+)'\['\\x73\\x70\\x6c\\x69\\x63'\]\('\\x7c'\)", res).group(1)
        lz_decoded = decompressFromBase64(lz_encoded)
        res = re.sub(r"'([A-Za-z0-9+/=]+)'\['\\x73\\x70\\x6c\\x69\\x63'\]\('\\x7c'\)", "'%s'.split('|')"%(lz_decoded), res)
        codes = self.get_node_online(res)
        pages_opts = json.loads(re.search(r'^SMH.reader\((.*)\)\.preInit\(\);$', codes).group(1))

        cid = self.getChapterId(url)
        md5 = pages_opts["sl"]["md5"]
        images = pages_opts["images"]
        for img in images:
            img_url = u'https://i.hamreus.com{}?cid={}&md5={}'.format(img, cid, md5)
            imgList.append(img_url)

        return imgList
Example #35
    def getChapterList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host)
        chapterList = []

        url = url.replace("http://www.dm5.com", "https://www.manhuaren.com")

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn(
                "fetch comic page failed: {} (status code {}, content {})".
                format(url, result.status_code, result.content))
            return chapterList

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)

        soup = BeautifulSoup(content, "html.parser")

        chapter_datas = []
        for link in soup.find_all("a", {"class": "chapteritem"}):
            chapter_datas.append({
                "chapter_id":
                int(re.search("m(\d+)", link.get("href")).group(1)),
                "chapter_title":
                unicode(link.string),
            })
        chapter_datas.sort(key=lambda d: d["chapter_id"])
        for chapter in chapter_datas:
            chapter_url = "http://www.manhuaren.com/m{}/".format(
                chapter["chapter_id"])
            chapterList.append((chapter["chapter_title"], chapter_url))
        return chapterList
Example #36
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        tnow = datetime.utcnow()
        urladded = set()
        for feed in self.feeds:
            section, url = feed[0], feed[1]
            partition = feed[2]
            timeout = 30
            opener = URLOpener(self.host, timeout=timeout)
            result = opener.open(url)
            if result.status_code == 200 and result.content:
                feed = json.loads(result.content.decode(self.feed_encoding))

                for item in feed[partition]:
                    for e in item['items']:
                        # support HTTPS
                        urlfeed = e['share_url'].replace('http://','https://') if url.startswith('https://') else e['share_url']
                        if urlfeed in urladded:
                            self.log.warn('skipped %s' % urlfeed)
                            continue
                            
                        desc = None
                        urls.append((section, e['title'], urlfeed, desc))
                        urladded.add(urlfeed)
            else:
                self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url))
        #self.log.warn('%s' % json.dumps(urls))
        return urls
Example #37
    def getImgUrlList(self, url):
        imgUrlList = []
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return None

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.page_encoding, opener.realurl,
                                         result.headers)
        soup = BeautifulSoup(content, 'html.parser')

        sel = soup.find('select')  # page-number select box; extract every page from it
        if (sel is None):
            self.log.warn('soup select is not exist.')
            return None

        ulist = sel.find_all('option') if sel else None
        if not ulist:
            self.log.warn('select option is not exist.')
            return None

        for ul in ulist:
            # skip options without a value; don't mutate the list while iterating over it
            value = ul.get('value')
            if value:
                href = self.host + '/comic/' + value
                imgUrlList.append(href)

        return imgUrlList
Example #38
    def get_chapter_list_from_mobile_url(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(addreferer=False, timeout=60)

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn("fetch comic page failed: %s" % result.status_code)
            return []

        content = self.AutoDecodeContent(
            result.content, decoder, self.feed_encoding, opener.realurl, result.headers
        )

        if "obj_id" not in content:
            self.log.warn(u"Can't find obj_id form {}".format(url))
            return []

        comic_id = re.search('obj_id = "(\d+)"', content).group(1)
        data_match = re.search("initIntroData\(([^;]+)\);", content)
        if not data_match:
            return self.get_chapter_list_from_api(comic_id)
        datas = json.loads(data_match.group(1))
        chapter_datas = []
        for data in datas:
            chapter_datas += data["data"]
        if not chapter_datas:
            return self.get_chapter_list_from_api(comic_id)
        chapter_datas.sort(key=lambda d: d["id"])
        chapters = []
        for chapter in chapter_datas:
            chapter_url = "https://m.dmzj.com/view/{comic_id}/{chapter_id}.html".format(
                chapter_id=chapter["id"], comic_id=comic_id
            )
            chapters.append((chapter["chapter_name"], chapter_url))
        return chapters
Example #39
def BaiduPanHandler(url):
    import json
    o = urlparse.urlparse(url)
    if not o[1] or not o[1].endswith(('pan.baidu.com','yun.baidu.com')):
        return None
    
    # For simplicity, use a community-built site to resolve the real download link.
    # To remove this dependency later, code from
    # https://github.com/banbanchs/pan-baidu-download
    # and https://github.com/xuanqinanhai/bleed-baidu-white
    # could be integrated here.
    url = 'http://daimajia.duapp.com/baidu/?url=%s' % url
    opener = URLOpener()
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        return None
    linkinfo = json.loads(result.content.decode('utf-8'))
    filename = linkinfo.get('name')
    if '\u' in filename:
        try:
            filename = filename.decode('unicode-escape')
        except:
            pass
    link = linkinfo.get('download')
    
    return (filename,link) if link else None
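A brief usage sketch for the handler above (the share link is a placeholder, not a real URL):

panInfo = BaiduPanHandler('http://pan.baidu.com/s/xxxxxx')  # placeholder share link
if panInfo:
    filename, downloadLink = panInfo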
Example #40
 def POST(self, verType):
     INSTAPAPER_API_AUTH_URL = "https://www.instapaper.com/api/authenticate"
     web.header('Content-Type', 'application/json')
     
     respDict = {'status':'ok', 'correct':0}
     if verType.lower() != 'instapaper':
         respDict['status'] = _('Request type[%s] unsupported') % verType
         return json.dumps(respDict)
     
     user = self.getcurrentuser()
     
     username = web.input().get('username', '')
     password = web.input().get('password', '')
     opener = URLOpener()
     apiParameters = {'username': username, 'password':password}
     ret = opener.open(INSTAPAPER_API_AUTH_URL, data=apiParameters)
     if ret.status_code in (200, 201):
         respDict['correct'] = 1
     elif ret.status_code == 403:
         respDict['correct'] = 0
     else:
         respDict['status'] = _("The Instapaper service encountered an error. Please try again later.")
     
     return json.dumps(respDict)
     
Example #41
    def getChapterList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        chapterList = []

        if url.startswith( "http://" ):
            url = url.replace('http://', 'https://')

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return chapterList

        content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)

        soup = BeautifulSoup(content, 'html.parser')
        allComicTable = soup.find_all('table', {'width': '800', 'align': 'center'})

        if not allComicTable:  # find_all returns an empty list, never None
            self.log.warn('allComicTable is not exist.')
            return chapterList

        for comicTable in allComicTable:
            comicVolumes = comicTable.find_all('a', {'target': '_blank'})
            if not comicVolumes:
                self.log.warn('comicVolumes is not exist.')
                return chapterList

            for volume in comicVolumes:
                href = self.urljoin(self.host, volume.get("href"))
                chapterList.append((unicode(volume.string), href))

        return chapterList
Example #42
    def fetcharticle(self, url, decoder):
        """链接网页获取一篇文章"""
        if self.fulltext_by_instapaper and not self.fulltext_by_readability:
            url = "http://www.instapaper.com/m?u=%s" % self.url_unescape(url)

        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        status_code, content = result.status_code, result.content
        if status_code != 200 or not content:
            self.log.warn('fetch article failed(%d):%s.' % (status_code, url))
            return None

        if 0:  # some sites block GAE; email the fetched page to yourself for debugging
            from google.appengine.api import mail
            mail.send_mail(SRC_EMAIL,
                           SRC_EMAIL,
                           "KindleEar Debug",
                           "KindlerEar",
                           attachments=[
                               ("Page.html", content),
                           ])

        if self.page_encoding:
            try:
                return content.decode(self.page_encoding)
            except UnicodeDecodeError:
                return decoder.decode(content, opener.realurl)
        else:
            return decoder.decode(content, opener.realurl)
Example #43
    def postprocess(self, content):
        pn = re.compile(ur'<a href="(\S*?)">本话题在雪球有.*?条讨论,点击查看。</a>', re.I)
        comment = ''
        mt = pn.search(content)
        url = mt.group(1) if mt else None
        if url:
            opener = URLOpener(url, timeout=self.timeout)
            result = opener.open(url)
            if result.status_code == 200 and result.content:
                if self.feed_encoding:
                    try:
                        comment = result.content.decode(self.feed_encoding)
                    except UnicodeDecodeError:
                        return content

        pn = re.compile(r'SNB.data.goodComments\ =\ ({.*?});', re.S | re.I)
        mt = pn.search(comment)
        if mt:
            comment_json = mt.group(1)
            j = json.loads(comment_json)
            soup = BeautifulSoup(content, "lxml")
            for c in j['comments']:
                u = c['user']['screen_name']
                t = BeautifulSoup('<p>@%s:%s</p>' % (u, c['text']))
                for img in t.find_all('img', alt=True):
                    img.replace_with(t.new_string(img['alt']))
                soup.html.body.append(t.p)

            content = unicode(soup)
        return content
Example #44
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return imgList

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)

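        # The page stores its image list in the JS variable qTcms_S_m_murl_e as a
        # base64-encoded string whose entries are separated by "$qingtiandy$".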
        match = re.search(r'qTcms_S_m_murl_e="(.*)";', content)
        if not match:
            self.log.warn(content)
            self.log.warn('var qTcms_S_m_murl_e does not exist.')
            return imgList

        list_encoded = match.group(1)
        lz_decoded = b64decode(list_encoded)
        images = lz_decoded.split("$qingtiandy$")

        if not images:
            self.log.warn('image list does not exist.')
            return imgList

        for img in images:
            imgList.append(img)

        return imgList
Exemplo n.º 45
0
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        tnow = datetime.datetime.utcnow()
        urladded = set()

        for feed in self.feeds:
            section, url = feed[0], feed[1].replace('gzh', 'gzhjs')
            isfulltext = feed[2] if len(feed) > 2 else False
            timeout = self.timeout+10 if isfulltext else self.timeout
            opener = URLOpener(self.host, timeout=timeout)
            result = opener.open(url)
            if result.status_code == 200 and result.content:
                if self.feed_encoding:
                    try:
                        content = result.content.decode(self.feed_encoding)
                    except UnicodeDecodeError:
                        content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers)
                else:
                    content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers)
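                # The gzhjs endpoint returns a JSONP-like payload; slice out the bare JSON object before parsing.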
                content = content[content.index('{'):content.index('}')+1]
                content = json.loads(content)

                for e in content['items'][:self.max_articles_per_feed]:
                    e = feedparser.parse(e)['entries'][0]
                    updated = None
                    if hasattr(e, 'lastmodified') and e.lastmodified:
                        updated = float(e.lastmodified)

                    if self.oldest_article > 0 and updated:
                        updated = datetime.datetime.utcfromtimestamp(updated)
                        delta = tnow - updated
                        if self.oldest_article > 365:
                            threshold = self.oldest_article # interpreted as seconds
                        else:
                            threshold = 86400*self.oldest_article # interpreted as days

                        if delta.days*86400+delta.seconds > threshold:
                            self.log.info("Skip old article(%s): %s" % (updated.strftime('%Y-%m-%d %H:%M:%S'),e.href))
                            continue

                    # support HTTPS
                    if hasattr(e, 'href'):
                        if url.startswith('https://'):
                            urlfeed = e.href.replace('http://','https://')
                        else:
                            urlfeed = e.href

                        if urlfeed in urladded:
                            continue
                    else:
                        urlfeed = ''

                    desc = None
                    urls.append((section, e.title, urlfeed, desc))
                    urladded.add(urlfeed)
            else:
                self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url))

        return urls
Exemplo n.º 46
0
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return imgList

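        # Derive the comic id (cid) and chapter/page id (pid) from a URL of the form
        # ".../mh/<cid>/<pid>.html", and build a pseudo-timestamp (tid); all three are
        # later appended to each image token for the proxy request.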
        urlpaths = urlparse.urlsplit(url.lower()).path.split("/")
        if ((u"mh" in urlpaths)
                and (urlpaths.index(u"mh") + 2 < len(urlpaths))):
            tid = str(time.time()).replace(".", "1")
            if len(tid) == 12:
                tid = tid + "1"
            cid = urlpaths[urlpaths.index(u"mh") + 1]
            pid = urlpaths[urlpaths.index(u"mh") + 2].replace(".html", "")
        else:
            self.log.warn('Can not get cid and pid from URL: {}.'.format(url))
            return imgList

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)

        match = re.search(r'var qTcms_S_m_murl_e="(.*)";', content)
        if not match:
            self.log.warn('var qTcms_S_m_murl_e does not exist.')
            return imgList

        list_encoded = match.group(1)
        lz_decoded = b64decode(list_encoded)
        images = lz_decoded.split("$qingtiandy$")

        if not images:
            self.log.warn('image list does not exist.')
            return imgList

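        # Tokens hosted behind www.baidu1.com or ac.tc.qq.com are rewritten: append
        # "|tid|cid|pid|pc", base64-encode the result and fetch it through the
        # img_733.234us.com proxy; anything else is used as-is.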
        for img in images:
            if "http://www.baidu1.com/" in img:
                b64str = img.replace("http://www.baidu1.com/",
                                     "") + '|{}|{}|{}|pc'.format(
                                         tid, cid, pid)
                imgb64 = b64encode(b64str)
                img_url = u'http://img_733.234us.com/newfile.php?data={}'.format(
                    imgb64)
            elif "http://ac.tc.qq.com/" in img:
                b64str = img + '|{}|{}|{}|pc'.format(tid, cid, pid)
                imgb64 = b64encode(b64str)
                img_url = u'http://img_733.234us.com/newfile.php?data={}'.format(
                    imgb64)
            else:
                img_url = img

            self.log.info('The image href is: %s' % img_url)
            imgList.append(img_url)

        return imgList
Exemplo n.º 47
0
 def SendNewSubscription(self, title, url):
     opener = URLOpener()
     path = SharedLibraryMgrkindleearAppspotCom.__url__.split('/')
     path[-1] = 'subscribedfromshared'
     srvUrl = urlparse.urljoin('http://kindleear.appspot.com/',
                               '/'.join(path))
     data = {'title': title, 'url': url}
     result = opener.open(srvUrl, data)  # fire and forget; no need to check whether it succeeded
Exemplo n.º 48
0
 def ParseFeedUrls(self):
     urls = [] # list to return
     
     newComicUrls = self.GetNewComic() # returns [(title, num, url), ...]
     if not newComicUrls:
         return []
     
     decoder = AutoDecoder(isfeed=False)
     for title, num, url in newComicUrls:
         opener = URLOpener(self.host, timeout=60)
         result = opener.open(url)
         if result.status_code != 200 or not result.content:
             self.log.warn('fetch comic page failed: %s' % url)
             continue
             
         content = result.content
         content = self.AutoDecodeContent(content, decoder, self.page_encoding, opener.realurl, result.headers)
         
         bodySoup = BeautifulSoup(content, 'lxml')
         sel = bodySoup.find('select') # the page-number row; every page has to be extracted
         ul = sel.find_all('option') if sel else None
         if not ul:
             continue
             
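         # Each <option> value points to one page of this chapter: fetch every page and
         # grab the <img> whose context menu is disabled (the actual comic image).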
         for comicPage in ul:
             href = comicPage.get('value')
             if href:
                 pageHref = self.urljoin(url, href)
                 result = opener.open(pageHref)
                 if result.status_code != 200:
                     self.log.warn('fetch comic page failed: %s' % pageHref)
                     continue
                     
                 content = result.content
                 content = self.AutoDecodeContent(content, decoder, self.page_encoding, opener.realurl, result.headers)
                 soup = BeautifulSoup(content, 'lxml')
                 
                 comicImgTag = soup.find('img', {'oncontextmenu': 'return false'})
                 comicSrc = comicImgTag.get('src') if comicImgTag else None
                 if comicSrc:
                     urls.append((title, comicPage.text, comicSrc, None))
         
         self.UpdateLastDelivered(title, num)
         
     return urls
Exemplo n.º 49
0
 def ParseFeedUrls(self):
     """ return list like [(section,title,url,desc),..] """
     urls = []
     tnow = datetime.datetime.utcnow()
     urladded = set()
     
     for feed in self.feeds:
         section, url = feed[0], feed[1]
         isfulltext = feed[2] if len(feed) > 2 else False
         timeout = self.timeout+10 if isfulltext else self.timeout
         opener = URLOpener(self.host, timeout=timeout)
         result = opener.open(url)
         if result.status_code == 200 and result.content:
             if self.feed_encoding:
                 try:
                     content = result.content.decode(self.feed_encoding)
                 except UnicodeDecodeError:
                     content = AutoDecoder(True).decode(result.content,url)
             else:
                 content = AutoDecoder(True).decode(result.content,url)
             feed = feedparser.parse(content)
             
             for e in feed['entries'][:self.max_articles_per_feed]:
                 updated = None
                 if hasattr(e, 'updated_parsed') and e.updated_parsed:
                     updated = e.updated_parsed
                 elif hasattr(e, 'published_parsed') and e.published_parsed:
                     updated = e.published_parsed
                 elif hasattr(e, 'created_parsed'):
                     updated = e.created_parsed
                     
                 if self.oldest_article > 0 and updated:
                     delta = tnow - datetime.datetime(*(updated[0:6]))
                     if delta.days*86400+delta.seconds > 86400*self.oldest_article:
                         self.log.info("Skip old article: %s" % e.link)
                         continue
                 
                 # support HTTPS
                 urlfeed = e.link.replace('http://','https://') if url.startswith('https://') else e.link
                 if urlfeed in urladded:
                     continue
                     
                 desc = None
                 if isfulltext:
                     if hasattr(e, 'description'):
                         desc = e.description
                     elif hasattr(e, 'content') and e.content[0]['value']:
                         desc = e.content[0]['value']
                     else:
                         self.log.warn('fulltext feed item has no desc, link to webpage for article. (%s)' % e.title)
                 urls.append((section, e.title, urlfeed, desc))
                 urladded.add(urlfeed)
         else:
             self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url))
     
     return urls
Exemplo n.º 50
0
    def GetNewComic(self):
        urls = []

        if not self.feeds:
            return []

        userName = self.UserName()
        decoder = AutoDecoder(isfeed=False)
        for item in self.feeds:
            title, url = item[0], item[1]

            lastCount = LastDelivered.all().filter(
                'username = ', userName).filter('bookname = ', title).get()
            if not lastCount:
                self.log.info(
                    'There is no log in db LastDelivered for name: %s, set to 0'
                    % title)
                oldNum = 0
            else:
                oldNum = lastCount.num

            opener = URLOpener(self.host, timeout=60)
            result = opener.open(url)
            if result.status_code != 200:
                self.log.warn(
                    'fetch index page for %s failed[%s] : %s' %
                    (title, URLOpener.CodeMap(result.status_code), url))
                continue
            content = result.content
            content = self.AutoDecodeContent(content, decoder,
                                             self.feed_encoding,
                                             opener.realurl, result.headers)

            soup = BeautifulSoup(content, 'lxml')

            allComicTable = soup.find_all('table', {'width': '688'})
            addedForThisComic = False
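            # Walk the chapter tables: the volume number is the second token of the
            # link text, and only the first volume newer than the last delivered one
            # is queued (one volume per delivery).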
            for comicTable in allComicTable:
                comicVolumes = comicTable.find_all('a', {'target': '_blank'})
                for volume in comicVolumes:
                    texts = volume.text.split(' ')
                    if len(texts) > 2 and texts[1].isdigit() and volume.get(
                            'href'):
                        num = int(texts[1])
                        if num > oldNum:
                            oldNum = num
                            href = self.urljoin(self.host, volume.get('href'))
                            urls.append((title, num, href))
                            addedForThisComic = True
                            break  # deliver only one volume at a time (a single volume can already hold many images)

                if addedForThisComic:
                    break

        return urls
Exemplo n.º 51
0
class Pocket(object):
    def __init__(self, consumer_key, redirect_uri=None):
        self.consumer_key = str(consumer_key)
        self.redirect_uri = redirect_uri
        self.access_token = None
        self.opener = URLOpener(headers=POCKET_HEADERS)

    def _post(self, method_url, **kw):
        ret = self.opener.open(method_url, data=json.dumps(kw))
        if ret.status_code != 200 or not ret.content:
            raise APIError(
                ret.status_code, ret.headers.get("X-Error-Code", ""), ret.headers.get("X-Error", ""), "Get access token"
            )
        return json.loads(ret.content)

    def _authenticated_post(self, method_url, **kw):
        kw["consumer_key"] = self.consumer_key
        kw["access_token"] = self.access_token
        return self._post(method_url, **kw)

    def get_request_token(self):
        # This step just makes a single HTTP request to obtain a request_token (code); Pocket will not call back redirect_uri.
        ret = self._post(REQUEST_TOKEN_URL, consumer_key=self.consumer_key, redirect_uri=self.redirect_uri)
        return ret.get("code", "")

    def get_authorize_url(self, code):
        if not self.redirect_uri:
            raise APIError(400, "140", "Missing redirect url.", "Get access token")
        url = AUTH_TOKEN_URL % {"request_token": code, "redirect_uri": self.redirect_uri}
        return url

    def get_access_token(self, code):
        # access token : {"access_token":"dcba4321-dcba-4321-dcba-4321dc","username":"******"}.
        ret = self._post(ACCESS_TOKEN_URL, consumer_key=self.consumer_key, code=code)
        self.access_token = ret.get("access_token", "")
        return ret

    def set_access_token(self, access_token):
        self.access_token = str(access_token)

    def add(self, **kw):
        # Required parameters:
        # url      : the URL to save (preferably urlencoded first)
        # title    : optional; needed only when saving an image or a PDF
        # tags     : optional, a comma-separated list of strings
        # tweet_id : optional, the ID of the tweet being saved
        # Returns a dict; possible keys are documented at https://getpocket.com/developer/docs/v3/add
        return self._authenticated_post("https://getpocket.com/v3/add", **kw)

    def get(self, **kw):
        return self._authenticated_post("https://getpocket.com/v3/get", **kw)

    def modify(self, **kw):
        pass
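A minimal usage sketch for the wrapper above; the consumer key and callback URL are placeholders, and REQUEST_TOKEN_URL, AUTH_TOKEN_URL, ACCESS_TOKEN_URL and POCKET_HEADERS are assumed to be defined elsewhere in the module, as the class implies:

# Hypothetical end-to-end flow; 'my-consumer-key' and the callback URL are placeholders.
pocket = Pocket('my-consumer-key', redirect_uri='http://example.com/pocket/callback')
code = pocket.get_request_token()               # step 1: obtain a request token (code)
authorize_url = pocket.get_authorize_url(code)  # step 2: send the user here to authorize
# ... after the user has authorized the app ...
token = pocket.get_access_token(code)           # step 3: exchange the code for an access token
pocket.add(url='http://example.com/article.html', tags='kindleear')  # step 4: save a URL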
Exemplo n.º 52
0
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return imgList

        urlpaths = urlparse.urlsplit(url.lower()).path.split("/")
        if ( (u"mh" in urlpaths) and (urlpaths.index(u"mh")+2 < len(urlpaths)) ):
            tid = str(time.time()).replace(".", "1")
            if len(tid) == 12:
                tid = tid + "1"
            cid = urlpaths[urlpaths.index(u"mh")+1]
            pid = urlpaths[urlpaths.index(u"mh")+2].replace(".html", "")
        else:
            self.log.warn('Can not get cid and pid from URL: {}.'.format(url))
            return imgList

        content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)

        match = re.search(r'var qTcms_S_m_murl_e="(.*)";', content)
        if not match:
            self.log.warn('var qTcms_S_m_murl_e does not exist.')
            return imgList

        list_encoded = match.group(1)
        lz_decoded = b64decode(list_encoded)
        images = lz_decoded.split("$qingtiandy$")

        if not images:
            self.log.warn('image list does not exist.')
            return imgList

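        # Same proxy rewrite as above: wrap baidu1.com / ac.tc.qq.com tokens with
        # "|tid|cid|pid|pc", base64-encode them, and fetch via img_733.234us.com.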
        for img in images:
            if "http://www.baidu1.com/" in img:
                b64str = img.replace("http://www.baidu1.com/", "") + '|{}|{}|{}|pc'.format(tid, cid, pid)
                imgb64 = b64encode(b64str)
                img_url = u'http://img_733.234us.com/newfile.php?data={}'.format(imgb64)
            elif "http://ac.tc.qq.com/" in img:
                b64str = img + '|{}|{}|{}|pc'.format(tid, cid, pid)
                imgb64 = b64encode(b64str)
                img_url = u'http://img_733.234us.com/newfile.php?data={}'.format(imgb64)
            else:
                img_url = img

            self.log.info('The image href is: %s' % img_url)
            imgList.append(img_url)

        return imgList
Exemplo n.º 53
0
    def get_image_list_from_api(self, url):
        comic_id, chapter_id = re.search(r"(\d+)/(\d+)\.html", url).groups()
        opener = URLOpener(addreferer=False, timeout=60)

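        # Try the v3 chapter API first and fall back to the v2 API if it fails;
        # the JSON's "page_url" field holds the list of image URLs.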
        result = opener.open(
            "http://v3api.dmzj.com/chapter/{comic_id}/{chapter_id}.json".format(
                comic_id=comic_id, chapter_id=chapter_id
            )
        )
        if result.status_code != 200:
            self.log.info("fetch v3 api json failed: %s, try v2" % result.status_code)
            result = opener.open(
                "http://v2.api.dmzj.com/chapter/{comic_id}/{chapter_id}.json?channel=Android&version=2.6.004".format(
                    comic_id=comic_id, chapter_id=chapter_id
                )
            )
            if result.status_code != 200:
                self.log.warn("fetch v2 api json failed: %s" % result.status_code)
                return []

        data = json.loads(result.content)
        return data["page_url"]
Exemplo n.º 54
0
 def fetcharticle(self, url, decoder):
     opener = URLOpener(self.host, timeout=self.timeout)
     result = opener.open(self.url4forwarder(url))
     status_code, content = result.status_code, result.content
     if status_code != 200 or not content:
         self.log.warn('fetch article failed(%d):%s.' % (status_code,url))
         return None
     
     if self.page_encoding:
         return content.decode(self.page_encoding)
     else:
         return decoder.decode(content,url)