Example #1
 def ParseFeedUrls(self):
     #return lists like [(section,title,url,desc),..]
     main = 'http://www.reuters.com/places/north-korea'
     urls = []
     isEST = False  #track whether the site's timestamps use EST or EDT
     opener = URLOpener(self.host, timeout=90)
     result = opener.open(main)
     if result.status_code != 200:
         self.log.warn('fetch webpage failed:%s'%main)
         return []
         
     content = result.content.decode(self.feed_encoding)
     soup = BeautifulSoup(content, "lxml")
     
     #start parsing
     section=soup.find('div', attrs={'class':'topStory'})
     toparticle = section.find('a', href=True) if section else None
     if toparticle is None:
         self.log.warn('Top news not found')
     else:
         toptitle = string_of_tag(toparticle).strip()
         if not toptitle:
             self.log.warn('No top story title')
         url = toparticle['href']
         if url.startswith(r'/'):
             url = 'http://www.reuters.com' + url
         urls.append(('Reuters North Korea', toptitle, url, None))
         
     sect=soup.find('div', id='moreSectionNews')
     for feature in sect.find_all('div', attrs={'class':'feature'}):
         article = feature.find('a', href=True)
         title = string_of_tag(article).strip()
         url = article['href']
         timestamp = feature.find('span', attrs={'class':'timestamp'})
         if not timestamp:
             continue
         timestamp = string_of_tag(timestamp).strip()
         #articles from today
         if 'EDT' in timestamp or 'EST' in timestamp:
             delta=0
             if 'EST' in timestamp:
                 isEST=True
         else:
             pubtime = datetime.datetime.strptime(timestamp, '%b %d %Y').date()
             #assume EDT (UTC-4) by default
             tnow = datetime.datetime.utcnow()-datetime.timedelta(hours=4)
             currentmonth= tnow.month
             if currentmonth in [1, 2, 12] or isEST:
                 tnow = datetime.datetime.utcnow()-datetime.timedelta(hours=5)
             tnow = tnow.date()
             delta=(tnow-pubtime).days
         if self.oldest_article > 0 and delta > self.oldest_article:
             continue
         if url.startswith(r'/'):
             url = 'http://www.reuters.com' + url
             #self.log.info('\tFound article:%s' % title)
         urls.append(('Reuters North Korea',title,url,None))
                             
     if len(urls) == 0:
         self.log.warn('len of urls is zero.')
     return urls
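
The timestamp logic in Example #1 boils down to one question: how many days old is the article, measured in US Eastern time? Below is a minimal standalone sketch of that check, using only the standard library and the same fixed -4/-5 hour offsets as the recipe (an approximation, not real DST handling):

import datetime

def article_age_days(timestamp, use_est=False):
    #items stamped with 'EDT'/'EST' instead of a date were published today
    if 'EDT' in timestamp or 'EST' in timestamp:
        return 0
    #older items carry a date such as 'Apr 05 2018'
    pubdate = datetime.datetime.strptime(timestamp, '%b %d %Y').date()
    offset = 5 if use_est else 4  #UTC-5 in winter (EST), UTC-4 in summer (EDT)
    today_eastern = (datetime.datetime.utcnow() - datetime.timedelta(hours=offset)).date()
    return (today_eastern - pubdate).days
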
Example #2
    def ParseFeedUrls(self):
        #return lists like [(section,title,url,desc),..]
        main = 'http://www.thepaper.cn/list_masonry.jsp?nodeid=26878'
        urls = []
        opener = URLOpener(self.host, timeout=90)
        result = opener.open(main)
        if result.status_code != 200:
            self.log.warn('fetch webpage failed:%s' % main)
            return []

        content = result.content.decode(self.feed_encoding)
        soup = BeautifulSoup(content, "lxml")

        #start parsing
        for article in soup.find_all('div', class_='news_li', limit=6):
            inter = article.find('div', class_='pdtt_trbs')
            timestamp = inter.find('span')
            timestamp = string_of_tag(timestamp).strip()
            if u'天' in timestamp or u'-' in timestamp:  #skip stamps containing '天' (days) or a full date; keep only the freshest items
                continue
            h2 = article.find('h2')
            a = h2.find('a', href=True)
            title = string_of_tag(a).strip()
            if not title:
                self.log.warn('Article title not found.')
                continue
            url = a['href']
            if url.startswith(r'news'):
                url = 'http://www.thepaper.cn/' + url
            urls.append((u'上海书评', title, url, None))
        if len(urls) == 0:
            self.log.warn('No article found for Shanghai Book Review.')
        return urls
Example #3
    def ParseFeedUrls(self):
        #return lists like [(section,title,url,desc),..]
        main = 'http://www.jintiankansha.me/column/nwClF5ZmDJ'
        urls = []
        opener = URLOpener(self.host, timeout=90)
        result = opener.open(main)
        if result.status_code != 200:
            self.log.warn('fetch webpage failed:%s' % main)
            return []

        content = result.content.decode(self.feed_encoding)
        soup = BeautifulSoup(content, "lxml")

        #start parsing
        section = soup.find('div', class_='entries')
        for article in section.find_all('div', class_='cell item', limit=10):
            timestamp = article.find('span', class_='small fade')
            timestamp = string_of_tag(timestamp).strip()
            #if u'小时' not in timestamp and u'昨天' not in timestamp:
            if u'小时' not in timestamp:  #keep only items stamped with '小时' (posted hours ago)
                continue
            span = article.find('span', class_='item_title')
            a = span.find('a', href=True)
            title = string_of_tag(a).strip()
            if not title:
                self.log.warn('Article title not found.')
                continue
            url = a['href']
            urls.append((u'聊聊架构', title, url, None))
        if len(urls) == 0:
            self.log.warn('len of urls is zero.')
        return urls
Example #4
 def ParseFeedUrls(self):
     main = 'http://bbstsg.vip.qikan.com/text/Mag.aspx?issn=ACB37AEA-8FB7-4855-B7CA-D228E972162F'
     urls = []
     opener = URLOpener(self.host, timeout=90)
     result = opener.open(main)
     if result.status_code != 200:
         self.log.warn('fetch webpage failed:%s'%main)
         return []
     if self.feed_encoding:
         try:
             content = result.content.decode(self.feed_encoding)
         except UnicodeDecodeError:
             content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)
     else:
         content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)
     soup = BeautifulSoup(content, "lxml")
     sectitle = ''  #fallback so sectitle is always bound even if a section has no title span
     for section in soup.find_all('dl'):
         dt = section.find('dt')
         span = dt.find('span')
         if span:
             sectitle = string_of_tag(span).strip()
         for dd in section.find_all('dd'):
             a=dd.find('a', href=True)
             title = string_of_tag(a).strip()
             url = a['href']
             if url.startswith('Article'):
                 url = 'http://bbstsg.vip.qikan.com/text/'+url
             urls.append((sectitle,title,url,None))
     if len(urls) == 0:
         self.log.warn('len of urls is zero.')
     return urls
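
Example #4 (and Example #15 further down) repeat the same decode-with-fallback idiom: try the configured feed_encoding, then fall back to KindleEar's AutoDecoder. Here is a hedged helper that factors it out; the AutoDecoder call is mirrored verbatim from the snippets above, so treat its signature as an assumption:

def decode_response(result, opener, encoding=None):
    #try the declared encoding first, then fall back to charset autodetection
    if encoding:
        try:
            return result.content.decode(encoding)
        except UnicodeDecodeError:
            pass
    return AutoDecoder(False).decode(result.content, opener.realurl, result.headers)
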
Example #5
    def ParseFeedUrls(self):
        #return lists like [(section,title,url,desc),..]
        main = 'http://mp.sohu.com/profile?xpt=bWhtaW5nMUBzb2h1LmNvbQ==&_f=index_pagemp_1'
        urls = []
        opener = URLOpener(self.host, timeout=90)
        result = opener.open(main)
        if result.status_code != 200:
            self.log.warn('fetch webpage failed:%s' % main)
            return []

        content = result.content.decode(self.feed_encoding)
        soup = BeautifulSoup(content, "lxml")

        #start parsing
        for article in soup.find_all('div', class_='content_wrap', limit=6):
            timestamp = article.find('div', class_='wrap_mark')
            span = timestamp.find('span')
            timestamp = string_of_tag(span).strip()
            if u'今天' not in timestamp and u'昨天' not in timestamp:  #keep only items stamped '今天' (today) or '昨天' (yesterday)
                continue
            div = article.find('div', class_='wrap_title')
            a = div.find('a', href=True)  #the link lives in the title div, not the timestamp span
            title = string_of_tag(a).strip()
            if not title:
                self.log.warn('Article title not found.')
                continue
            url = a['href']
            if url.startswith('/'):
                url = 'http:' + url
            urls.append((u'古代小说网sohu', title, url, None))
        if len(urls) == 0:
            self.log.warn('len of urls is zero.')
        return urls
Example #6
    def ParseFeedUrls(self):
        #return lists like [(section,title,url,desc),..]
        main = 'https://www.nknews.org/'
        urls = []
        opener = URLOpener(self.host, timeout=90)
        result = opener.open(main)
        if result.status_code != 200:
            self.log.warn('fetch webpage failed:%s' % main)
            return []

        content = result.content.decode(self.feed_encoding)
        soup = BeautifulSoup(content, "lxml")

        #start parsing
        def is_cls_wanted(css_class):
            listwanted = [
                'col-md-7', 'post-prinicap-row', 'col-md-12',
                'col-md-6 smallboxclass'
            ]
            return css_class in listwanted
#        def not_has_class(tag):
#            return not tag.has_attr('class')

        for section in soup.find_all(class_=is_cls_wanted, limit=8):
            article = section.find('a', string=True)
            title = string_of_tag(article).strip()
            url = article['href']
            if '/pro/' in url:
                continue
            span = article.find('span')
            strong = span.find('strong')
            if not strong:
                timestamp = span
            else:
                timestamp = strong
            timestamp = string_of_tag(timestamp).strip()
            m = re.search(r'\d{4}$', timestamp)
            if m:
                pubtime = datetime.datetime.strptime(timestamp,
                                                     '%d %B %Y').date()
            else:
                m2 = re.search(r'^\d', timestamp)
                if m2:
                    pubtime = datetime.datetime.strptime(timestamp, '%d %B').date()
                else:
                    pubtime = datetime.datetime.strptime(timestamp, '%B %d').date()
                #strptime defaults the year to 1900 when the string has no year,
                #which would make every such article look far too old to keep
                pubtime = pubtime.replace(year=datetime.datetime.utcnow().year)

            tnow = datetime.datetime.utcnow()
            tnow = tnow.date()
            delta = (tnow - pubtime).days
            if self.oldest_article > 0 and delta > self.oldest_article:
                continue
                #self.log.info('\tFound article:%s' % title)
            urls.append(('NK News', title, url, None))
        if len(urls) == 0:
            self.log.warn('NK News has no article.')
        return urls
Example #7
 def ParseFeedUrls(self):
     #return list like [(section,title,url,desc),..]
     main = 'https://www.economist.com/printedition'
     # Did you block me?
     main = self.url4forwarder(main)
     urls = []
     urladded = set()
     opener = URLOpener(self.host, timeout=90)
     result = opener.open(main)
     if result.status_code != 200:
         self.log.warn('fetch webpage failed:%s'%main)
         return []
         
     content = result.content.decode(self.feed_encoding)
     soup = BeautifulSoup(content, "lxml")
     
     #start parsing
     for section in soup.find_all('li', attrs={'class':'list__item'}):
         div = section.find('div')
         if div is None:
             self.log.warn('This part skipped.')
             continue
         sectitle = string_of_tag(div).strip()
         if not sectitle:
             self.log.warn('No section title')
             continue
         if sectitle == 'Economic and financial indicators':
             continue
         #self.log.info('Found section: %s' % section_title)
         articles = []
         for node in section.find_all('a', href=True):
             spans = node.findAll('span')
             if len(spans) == 2:
                 fly= node.find('span', attrs={'class':'print-edition__link-flytitle'})
                 pre= string_of_tag(fly).strip()
                 ti= node.find('span', attrs={'class':'print-edition__link-title'})
                 post= string_of_tag(ti).strip()
                 title = pre +': '+ post
             else:
                 title = string_of_tag(node).strip()
             url = node['href']
             if url.startswith(r'/'):
                 url = 'http://www.economist.com' + url
                 # Did you block me?
                 url = self.url4forwarder(url)
                 #self.log.info('\tFound article:%s' % title)
                 if url not in urladded:
                     urls.append((sectitle,title,url,None))
                     urladded.add(url)
                             
     if len(urls) == 0:
         self.log.warn('len of urls is zero.')
     return urls
Example #8
    def ParseFeedUrls(self):
        #return lists like [(section,title,url,desc),..]
        mainhead = 'https://www.yna.co.kr/international/china/'
        num = 1
        urls = []
        callitaday = False
        koreanow = datetime.datetime.utcnow() + datetime.timedelta(hours=9)
        #        koreadate = koreanow.date()
        year = koreanow.year
        mydelta = datetime.timedelta(hours=24, minutes=10)

        while not callitaday:
            main = mainhead + str(num)
            opener = URLOpener(self.host, timeout=90)
            result = opener.open(main)
            if result.status_code != 200:
                self.log.warn('fetch mainnews failed:%s' % main)
                break  #stop paging instead of parsing a failed response

            content = result.content.decode(self.page_encoding)
            soup = BeautifulSoup(content, "lxml")
            #start parsing

            section = soup.find('div', class_='list-type038')
            for article in section.find_all('div', class_='item-box01'):
                if article is None:
                    self.log.warn('This article not found')
                    continue
                ptime = article.find('span', class_='txt-time')
                if ptime:
                    ptime = string_of_tag(ptime).strip()
                    #                    pdate=ptime[0:5] #keep just an 'MM-DD' style date
                    ptime = str(year) + '-' + ptime  #prepend the year; otherwise strptime defaults to 1900
                    ptime = datetime.datetime.strptime(ptime, '%Y-%m-%d %H:%M')
                    delta = koreanow - ptime
                    #                    if self.oldest_article > 0 and delta >= self.oldest_article:
                    if delta > mydelta:
                        callitaday = True
                        break  #the list is in chronological order, so everything after this is older
                newscon = article.find('div', class_='news-con')
                a = newscon.find('a', href=True)
                atitle = string_of_tag(a).strip()
                atitle = atitle + ' ' + str(ptime)[5:-3]
                url = a['href']
                if url.startswith('/'):
                    url = 'https:' + url
                urls.append((u'중국 뉴스', atitle, url, None))
            num = num + 1
        if len(urls) == 0:
            self.log.warn('len of urls is zero.')
        return urls
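
Example #8 pages through numbered list URLs until it meets an item older than roughly 24 hours. The same stopping logic, isolated into a small skeleton with hypothetical fetch_page()/parse_items() callables (both names are placeholders, not KindleEar APIs):

import datetime

def collect_recent(fetch_page, parse_items, now,
                   cutoff=datetime.timedelta(hours=24, minutes=10)):
    #fetch_page(n) -> html or None; parse_items(html) -> [(title, url, pubtime), ...], newest first
    collected = []
    page = 1
    while True:
        html = fetch_page(page)
        if not html:
            break  #stop on a failed fetch instead of looping forever
        items = parse_items(html)
        if not items:
            break  #ran out of pages
        for title, url, pubtime in items:
            if now - pubtime > cutoff:
                return collected  #pages are newest-first, so everything after this is older
            collected.append((title, url))
        page += 1
    return collected
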
Example #9
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        mainurl = 'http://www.economist.com/printedition'
        urls = []
        urladded = set()
        opener = URLOpener(self.host, timeout=30)
        result = opener.open(mainurl)
        if result.status_code != 200:
            self.log.warn('fetch rss failed:%s' % mainurl)
            return []

        content = result.content.decode(self.feed_encoding)
        soup = BeautifulSoup(content, "lxml")

        #GAE is served the mobile version of the page, which differs a little from the desktop version
        for section in soup.find_all(
                'section', attrs={'id': lambda x: x and 'section' in x}):
            h4 = section.find('h4')
            if h4 is None:
                self.log.warn('h4 is empty')
                continue
            sectitle = string_of_tag(h4).strip()
            if not sectitle:
                self.log.warn('h4 string is empty')
                continue
            #self.log.info('Found section: %s' % section_title)
            articles = []
            subsection = ''
            for node in section.find_all('article'):
                subsec = node.find('h5')
                if subsec is not None:
                    subsection = string_of_tag(subsec)
                prefix = (subsection + ': ') if subsection else ''
                a = node.find('a', attrs={"href": True}, recursive=False)
                if a is not None:
                    url = a['href']
                    if url.startswith(r'/'):
                        url = 'http://www.economist.com' + url
                    url += '/print'
                    title = string_of_tag(a)
                    if title:
                        title = prefix + title
                        #self.log.info('\tFound article:%s' % title)
                        if url not in urladded:
                            urls.append((sectitle, title, url, None))
                            urladded.add(url)
        if len(urls) == 0:
            self.log.warn('len of urls is zero.')
        return urls
Example #10
 def ParseFeedUrls(self):
     """ return list like [(section,title,url,desc),..] """
     mainurl = 'http://www.economist.com/printedition'
     urls = []
     urladded = set()
     opener = URLOpener(self.host, timeout=30)
     result = opener.open(mainurl)
     if result.status_code != 200:
         self.log.warn('fetch rss failed:%s'%mainurl)
         return []
         
     content = result.content.decode(self.feed_encoding)
     soup = BeautifulSoup(content, "lxml")
     
     #GAE is served the mobile version of the page, which differs a little from the desktop version
     for section in soup.find_all('section', attrs={'id':lambda x: x and 'section' in x}):
         h4 = section.find('h4')
         if h4 is None:
             self.log.warn('h4 is empty')
             continue
         sectitle = string_of_tag(h4).strip()
         if not sectitle:
             self.log.warn('h4 string is empty')
             continue
         #self.log.info('Found section: %s' % section_title)
         articles = []
         subsection = ''
         for node in section.find_all('article'):
             subsec = node.find('h5')
             if subsec is not None:
                 subsection = string_of_tag(subsec)
             prefix = (subsection + ': ') if subsection else ''
             a = node.find('a', attrs={"href":True}, recursive=False)
             if a is not None:
                 url = a['href']
                 if url.startswith(r'/'):
                     url = 'http://www.economist.com' + url
                 url += '/print'
                 title = string_of_tag(a)
                 if title:
                     title = prefix + title
                     #self.log.info('\tFound article:%s' % title)
                     if url not in urladded:
                         urls.append((sectitle,title,url,None))
                         urladded.add(url)
     if len(urls) == 0:
         self.log.warn('len of urls is zero.')
     return urls
     
Example #11
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        mainurl = "http://www.economist.com/printedition"
        urls = []
        urladded = set()
        opener = URLOpener(self.host, timeout=30)
        result = opener.open(mainurl)
        if result.status_code != 200:
            self.log.warn("fetch rss failed:%s" % mainurl)
            return []

        content = result.content.decode(self.feed_encoding)
        soup = BeautifulSoup(content, "lxml")

        # GAE is served the mobile version of the page, which differs a little from the desktop version
        for section in soup.find_all("section", attrs={"id": lambda x: x and "section" in x}):
            h4 = section.find("h4")
            if h4 is None:
                self.log.warn("h4 is empty")
                continue
            sectitle = string_of_tag(h4).strip()
            if not sectitle:
                self.log.warn("h4 string is empty")
                continue
            # self.log.info('Found section: %s' % section_title)
            articles = []
            subsection = ""
            for node in section.find_all("article"):
                subsec = node.find("h5")
                if subsec is not None:
                    subsection = string_of_tag(subsec)
                prefix = (subsection + ": ") if subsection else ""
                a = node.find("a", attrs={"href": True}, recursive=False)
                if a is not None:
                    url = a["href"]
                    if url.startswith(r"/"):
                        url = "http://www.economist.com" + url
                    url += "/print"
                    title = string_of_tag(a)
                    if title:
                        title = prefix + title
                        # self.log.info('\tFound article:%s' % title)
                        if url not in urladded:
                            urls.append((sectitle, title, url, None))
                            urladded.add(url)
        if len(urls) == 0:
            self.log.warn("len of urls is zero.")
        return urls
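
Examples #9-#11 are the same recipe reproduced from different projects. The section lookup they share passes a callable as the value of the id attribute; a standalone illustration of that BeautifulSoup form:

from bs4 import BeautifulSoup

html = ('<section id="leaders-section">a</section>'
        '<section id="briefing-section">b</section>'
        '<div id="footer">c</div>')
soup = BeautifulSoup(html, 'lxml')

#an attrs value may be a callable: keep tags whose id exists and contains 'section'
for sec in soup.find_all('section', attrs={'id': lambda x: x and 'section' in x}):
    print(sec['id'])  #leaders-section, briefing-section
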
Example #12
    def ParseFeedUrls(self):
        #return lists like [(section,title,url,desc),..]
        main = 'http://news.joins.com/Issue/10061'
        urls = []
        opener = URLOpener(self.host, timeout=90)
        result = opener.open(main)
        if result.status_code != 200:
            self.log.warn('fetch webpage failed:%s' % main)
            return []

        content = result.content.decode(self.feed_encoding)
        soup = BeautifulSoup(content, "lxml")

        #start parsing
        for article in soup.find_all('strong', class_='headline mg',
                                     limit=4):  #keep only the four most recent articles (roughly the past month)
            a = article.find('a', href=True)
            title = string_of_tag(a).strip()
            if not title:
                self.log.warn('Article title not found.')
                continue
            url = a['href']
            if url.startswith('/'):
                url = 'http://news.joins.com' + url
            urls.append((u'사설 속으로', title, url, None))
        if len(urls) == 0:
            self.log.warn('len of urls is zero.')
        return urls
Example #13
 def FindHo(self):
     hopage = 'http://weekly.chosun.com/client/contents/lst.asp'
     opener = URLOpener(self.host, timeout=90)
     result = opener.open(hopage)
     content = result.content.decode('euc-kr')
     if result.status_code != 200:
         self.log.warn('fetching hopage failed:%s' % hopage)
     soup = BeautifulSoup(content, "lxml")
     location = soup.find('div', id='Location')
     edition = location.find('div', class_='edition')
     ho = string_of_tag(edition).strip()
     if ho.startswith('['):
         ho = ho[1:5]
     else:
         self.log.warn('Fetching ho failed.')
     return ho
Example #14
    def ParseFeedUrls(self):
        datetime_t = str(datetime.date.today()).split(
            '-')  #split today's date into a list like ['2017', '10', '09']
        #return lists like [(section,title,url,desc),..]
        # main = 'http://csr.mos.gov.cn/content/1/'
        mainurl = 'http://csr.mos.gov.cn/content/' + datetime_t[
            0] + '-' + datetime_t[1] + '/' + datetime_t[2] + '/'  #URL prefix carrying today's date
        #mainurl = 'http://csr.mos.gov.cn/content/' + datetime_t[0] + '-' + datetime_t[1] + '/' + datetime_t[2] + '/' + 'node_2.htm' #full URL of the front page
        ans = []
        #urladded = set()
        # opener = URLOpener(self.host, timeout=90)
        # result = opener.open(mainurl + 'node_2.htm')
        soup1 = self.page_to_soup(mainurl + 'node_2.htm')
        #if result.status_code != 200:
        #    self.log.warn('fetch mainnews failed:%s'%mainurl)

        # content = result.content.decode(self.page_encoding)
        # soup = BeautifulSoup(content, "lxml")

        #start parsing
        mulu = soup1.find('td', {'class': 'mulu04'})
        for banmian in mulu.find_all('a'):
            articles = []
            if 'pdf' in banmian['href']:
                continue
            wenzhangliebiao = self.page_to_soup(mainurl + banmian['href'])
            vol_title = banmian.contents[0].strip()
            ul = wenzhangliebiao.find('ul', {'class': 'list01'})  #the list block that holds the article links

            for link in ul.find_all('a'):
                til = string_of_tag(link)
                url = mainurl + link['href']
                desc = ''
                #r = .find({'class':'title01'})
                #if r is not None:
                #    desc = self.tag_to_string(r)
                # wz = {'fTitle':til, 'url' : url}
                #self.log.warn('href为:%s'%url)
                #articles.append(wz)

                # ans0 = (vol_title, wz)
                ans.append((vol_title, til, url, None))
                #urladded.add(url)

        if len(ans) == 0:
            self.log.warn('len of urls is zero.')
        return ans
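
The date splitting at the top of Example #14 can be written more directly with strftime; an equivalent one-liner:

import datetime

#e.g. on 2017-10-09 this yields 'http://csr.mos.gov.cn/content/2017-10/09/'
mainurl = datetime.date.today().strftime('http://csr.mos.gov.cn/content/%Y-%m/%d/')
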
Example #15
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        for feed in self.feeds:
            feedtitle,url = feed[0],feed[1]
            opener = URLOpener(self.host, timeout=self.timeout)
            result = opener.open(url)
            if result.status_code != 200 or not result.content:
                self.log.warn('fetch webpage failed(%d):%s.' % (result.status_code, url))
                continue

            if self.feed_encoding:
                try:
                    content = result.content.decode(self.feed_encoding)
                except UnicodeDecodeError:
                    content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)
            else:
                content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)

            soup = BeautifulSoup(content, 'lxml')
            for article in soup.find_all('div', attrs={'class':'feed_item_question'}):
                title = article.find('a', attrs={'class':'question_link'})
                if not title:
                    continue

                #get the publication time
                pubdate = article.find('span',attrs={'class':'timestamp'})
                if not pubdate:
                    continue
                try:
                    pubdate = datetime.datetime.strptime(pubdate.string, '%Y-%m-%d')
                except Exception as e:
                    self.log.warn('parse pubdate failed for [%s] : %s'%(url,str(e)))
                    continue

                #decide whether the article should be pushed; the timezone is fixed to Beijing time (UTC+8)
                tnow = datetime.datetime.utcnow()+datetime.timedelta(hours=8)
                delta = tnow - pubdate
                if self.oldest_article > 0 and delta.days > self.oldest_article:
                    continue

                href = title['href'] if title['href'].startswith('http') else self.urljoin(url,title['href'])

                urls.append((feedtitle,string_of_tag(title),href,None))

        return urls
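
The freshness test in Example #15 (fixed UTC+8 Beijing time, keep articles no older than self.oldest_article days) is the pattern most of these recipes repeat; a compact standalone version of just that check:

import datetime

def is_fresh(pubdate, oldest_days, tz_hours=8):
    #compare against 'now' shifted by a fixed UTC offset, as the recipe above does
    tnow = datetime.datetime.utcnow() + datetime.timedelta(hours=tz_hours)
    return oldest_days <= 0 or (tnow - pubdate).days <= oldest_days
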
Example #16
    def ParseFeedUrls(self):
        #return lists like [(section,title,url,desc),..]
        main = 'https://www.yna.co.kr/nk/index'
        urls = []
        urladded = set()
        opener = URLOpener(self.host, timeout=90)
        result = opener.open(main)
        if result.status_code != 200:
            self.log.warn('fetch mainnews failed:%s' % main)
            return []

        content = result.content.decode(self.page_encoding)
        soup = BeautifulSoup(content, "lxml")

        #start parsing
        section = soup.find('section',
                            attrs={'class': 'column-type01 column-newslist'})
        for article in section.find_all('article'):
            if article is None:
                self.log.warn('This article not found')
                continue
            h2 = article.find('h2')
            a = h2.find('a', href=True)
            atitle = string_of_tag(a).strip()
            url = a['href']
            if url.startswith('/'):
                url = 'https:' + url
            elif url.startswith('HTTP'):
                url = url.replace('HTTP', 'http')
            if url not in urladded:
                urls.append((u'韩联社朝鲜要闻', atitle, url, None))
                urladded.add(url)
            related = article.find('div', attrs={'class': 'v-related'})
            if related:
                span = related.find('span')
                if span:
                    relateda = span.find('a', href=True)
                    rtitle = string_of_tag(relateda).strip()
                    rtitle = 'Related: ' + rtitle  #mark related articles in the title
                    rurl = relateda['href']
                    if rurl.startswith('/'):
                        rurl = 'https:' + rurl
                    elif rurl.startswith('HTTP'):
                        rurl = rurl.replace('HTTP', 'http')
                    if rurl not in urladded:
                        urls.append((u'韩联社朝鲜要闻', rtitle, rurl, None))
                        urladded.add(rurl)

        part2 = 'https://www.yna.co.kr/nk/news/all'
        opener2 = URLOpener(self.host, timeout=90)
        result2 = opener2.open(part2)
        if result2.status_code != 200:
            self.log.warn('fetch latest news failed:%s' % part2)
            return urls  #keep whatever the first section already yielded
        content2 = result2.content.decode(self.page_encoding)
        soup2 = BeautifulSoup(content2, "lxml")
        sect = soup2.find('ul', attrs={'class': 'list-type01 yna-more'})
        for arti in sect.find_all('article'):
            h = arti.find('h2')
            a2 = h.find('a', href=True)
            title = string_of_tag(a2).strip()
            if u'[북한날씨]' in title:  #skip the daily '[북한날씨]' (North Korea weather) item
                continue
            aurl = a2['href']
            if aurl.startswith('/'):
                aurl = 'https:' + aurl
            elif aurl.startswith('HTTP'):
                aurl = aurl.replace('HTTP', 'http')
            if aurl not in urladded:
                urls.append((u'朝鲜最新消息', title, aurl, None))
                urladded.add(aurl)
        if len(urls) == 0:
            self.log.warn('len of urls is zero.')
        return urls
Example #17
    def ParseFeedUrls(self):
        #return list like [(section,title,url,desc),..]
        def FindHo():
            hopage = 'http://weekly.chosun.com/client/contents/lst.asp'
            opener = URLOpener(self.host, timeout=90)
            result = opener.open(hopage)
            content = result.content.decode('euc-kr')
            if result.status_code != 200:
                self.log.warn('fetching hopage failed:%s' % hopage)
            soup = BeautifulSoup(content, "lxml")
            location = soup.find('div', id='Location')
            edition = location.find('div', class_='edition')
            ho = string_of_tag(edition).strip()
            if ho.startswith('['):
                ho = ho[1:5]
            else:
                self.log.warn('Fetching ho failed.')
            return ho

        mainhead = 'http://weekly.chosun.com/client/news/alllst.asp?nHo='
        urls = []
        urladded = set()
        opener = URLOpener(self.host, timeout=90)
        ho = FindHo()
        main = mainhead + ho
        result = opener.open(main)
        if result.status_code != 200:
            self.log.warn('Fetching TOC failed:%s' % main)
            return []
        content = result.content.decode(self.feed_encoding)
        soup = BeautifulSoup(content, "lxml")

        #start parsing
        def tr_has_a_tag(tag):
            return tag.name == 'tr' and tag.find('a')

        listarea = soup.find('div', class_='List_area')
        for section in listarea.find_all('table'):
            h4 = section.find_previous_sibling('h4')
            sectitle = string_of_tag(h4).strip()
            if not sectitle:
                self.log.warn('No section title')
                continue
#            if sectitle == 'Economic and financial indicators':
#                continue
#self.log.info('Found section: %s' % section_title)
            articles = []
            for tr in section.find_all(tr_has_a_tag):
                article = tr.find('a', href=True)
                title = string_of_tag(article).strip()
                url = article['href']
                if url.startswith('viw'):
                    url = 'http://weekly.chosun.com/client/news/' + url
                    url = url.replace('viw', 'print', 1)
                    #self.log.info('\tFound article:%s' % title)
                    if url not in urladded:
                        urls.append((sectitle, title, url, None))
                        urladded.add(url)

        if len(urls) == 0:
            self.log.warn('No articles found for WeeklyChosun.')
        return urls
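
Example #17 filters table rows with a predicate function (tr_has_a_tag) and walks back to the section heading with find_previous_sibling. A minimal illustration of those two BeautifulSoup calls on a toy fragment:

from bs4 import BeautifulSoup

html = ('<h4>Politics</h4>'
        '<table><tr><td>no link</td></tr>'
        '<tr><td><a href="viw12345">A story</a></td></tr></table>')
soup = BeautifulSoup(html, 'lxml')

def tr_has_a_tag(tag):
    #a find_all() filter can be any function that takes a tag and returns True/False
    return tag.name == 'tr' and tag.find('a') is not None

table = soup.find('table')
print(table.find_previous_sibling('h4').get_text())           #Politics
print([tr.a['href'] for tr in table.find_all(tr_has_a_tag)])  #['viw12345']
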
Example #18
    def ParseFeedUrls(self):
        #return list like [(section,title,url,desc),..]
        login_url = 'https://my.economist.com/'
        main = 'https://www.economist.com/weeklyedition'
        #        login_form = {"css-1gytnsx":self.account, "password":self.password}
        #        login_response = opener.open(login_url, data=login_form)
        #        main = 'https://www.economist.com/'
        urls = []
        urladded = set()
        opener = URLOpener(self.host, timeout=90)
        result = opener.open(main)
        if result.status_code != 200:
            self.log.warn('fetch webpage failed:%s' % main)
            return []
#        content = result.content.decode(self.feed_encoding)
#        soup = BeautifulSoup(content, "lxml")
#        a = soup.find('a', attrs={'class':'latest-printed__cta'})
#        current = a['href']
#        if current.startswith(r'/'):
#            current = 'https://www.economist.com' + url
#        opener = URLOpener(self.host, timeout=90)
#        result = opener.open(current)
#        if result.status_code != 200:
#            self.log.warn('fetch latest edition failed:%s'%main)
#            return []
        content = result.content.decode(self.feed_encoding)
        soup = BeautifulSoup(content, "lxml")

        #start parsing
        #        for section in soup.find_all('li', attrs={'class':'list__item'}):
        #            div = section.find('div')
        #            if div is None:
        #                self.log.warn('This part skipped.')
        #                continue
        thisweek = soup.find('div', class_='layout-weekly-edition-wtw')
        if thisweek:
            h2 = thisweek.find('h2')
            sectitle = string_of_tag(h2).strip()
            if not sectitle:
                self.log.warn('No section title for the world this week')
            for week in thisweek.find_all('a', href=True):
                title = string_of_tag(week).strip()
                url = week['href']
                if url.startswith(r'/'):
                    url = 'https://www.economist.com' + url
                urls.append((sectitle, title, url, None))
        else:
            self.log.warn('The world this week not found.')

        for section in soup.find_all(
                class_=lambda value: value and value.startswith(
                    'layout-weekly-edition-section')):
            h2 = section.find('h2')
            sectitle = string_of_tag(h2).strip()
            if not sectitle:
                self.log.warn('No section title')
                continue
            if 'financial indicators' in sectitle:
                continue
            #self.log.info('Found section: %s' % section_title)
#            articles = []
            for node in section.find_all(
                    'a', href=True,
                    class_=lambda value: value and value.startswith('headline-link')):
                spans = node.find_all('span')
                if len(spans) == 2:
                    title = u'{}: {}'.format(*map(string_of_tag, spans))
#            for node in section.find_all('a', href=True):
#                spans = node.findAll('span')
#                if len(spans) == 2:
#                    fly= node.find('span', attrs={'class':'print-edition__link-flytitle'})
#                    pre= string_of_tag(fly).strip()
#                    ti= node.find('span', attrs={'class':'print-edition__link-title'})
#                    post= string_of_tag(ti).strip()
#                    title = pre +': '+ post
                else:
                    title = string_of_tag(node).strip()
                url = node['href']
                if url.startswith(r'/'):
                    url = 'https://www.economist.com' + url
                    #self.log.info('\tFound article:%s' % title)
                if url not in urladded:
                    urls.append((sectitle, title, url, None))
                    urladded.add(url)

        if len(urls) == 0:
            self.log.warn('len of urls is zero.')
        return urls
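
Example #18 (like the named is_cls_wanted filter in Example #6) selects nodes by passing a callable as class_, here matching a class-name prefix; a short standalone illustration of that matching rule:

from bs4 import BeautifulSoup

html = ('<div class="layout-weekly-edition-section-a">x</div>'
        '<div class="teaser">y</div>')
soup = BeautifulSoup(html, 'lxml')

#class_ may be a function: it is called for each CSS class and keeps the tag on True
hits = soup.find_all(class_=lambda c: c and c.startswith('layout-weekly-edition-section'))
print([tag['class'] for tag in hits])  #[['layout-weekly-edition-section-a']]
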