Пример #1
0
    def normal_item_solver(self, item, task, response):

        doc = self.get_doc(response)

        title = ''
        t = ''
        t_stamp = 0
        category = ''
        author = ''
        content = ''

        if instant_pattern.match(task.url):
            title = util.get_filtered_title(doc, {'title'}, ur' - 信報網站 hkej.com')
            t = util.get_time_string_from_selectors(doc, {'span.date'})
            time_part = min_sec_pattern.findall(t)[0]
            t_stamp = util.get_timestamp_from_string(time_part) + time.localtime().tm_sec
            category = doc('span.cate').text()
            content = util.get_paragraphs_from_selector(doc, '#article-content p')
        elif daily_pattern.match(task.url) or headline_article_pattern.match(task.url):
            title = util.get_filtered_title(doc, {'title'}, ur' - .+')
            t = util.get_time_string_from_selectors(doc, {'#date'})
            t_stamp = util.get_timestamp_from_string(t) + time.localtime().tm_hour*3600 + time.localtime().tm_min*60 + time.localtime().tm_sec
            category = doc('#hkej_navSubMenu_2014 .on').text()
            content = util.get_paragraphs_from_selector(doc, '#article-content p')
            if content == '':
                content = util.get_paragraphs_from_selector(doc, '#article-detail-wrapper')
                content = re.sub(ur'(節錄)(.|\n|\t|\r)*', u'', content, re.M | re.I | re.U)

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'HKEJ'
        item.task_no = self.BATCH_NUMBER
        for img in doc('#article-detail-wrapper p img, #article-detail-wrapper .hkej_detail_thumb_2014 img').items():
            if img.parent('a').attr('href') != '':
                des = ''
                if img.parent('a') and img.parent('a').attr('title'):
                    des = img.parent('a').attr('title')
                media = self.NewsItem.MediaItem(media_url=img.parent('a').attr('href'), type='image', description=des,
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        for a in doc('iframe').items():
            if a.attr('src') and re.match(r'.*youtube\.com.*', a.attr('src')):
                media_u = a.attr('src')
                if re.match(r'//.+', media_u):
                    media_u = 'http:' + media_u
                media = self.NewsItem.MediaItem(media_url=media_u, type='youtube', description='youtube',
                                                created_at=item.fetched_at)
                item.media_list.append(media)
Пример #2
0
    def normal_item_solver(self, item, task, response):
        """Fill ``item`` with the fields scraped from an AM730 article page.

        The timestamp is derived from the ``div.dateforarticle`` text:
        strings matching ``relative_time_pattern`` go through
        ``self._get_timestamp_from_relative_time_str``; strings matching
        ``absolute_time_pattern`` go through
        ``util.get_timestamp_from_string`` plus the current local seconds;
        anything else leaves the timestamp at 0.

        :param item: news item to populate (mutated in place).
        :param task: crawl task supplying ``url`` and ``fetched_at``.
        :param response: HTTP response passed to ``self.get_doc``.
        """
        page = self.get_doc(response)

        headline = util.get_filtered_title(page, {'h2'})
        time_text = util.get_time_string_from_selectors(page, {'div.dateforarticle'})
        if relative_time_pattern.match(time_text):
            stamp = self._get_timestamp_from_relative_time_str(time_text)
        elif absolute_time_pattern.match(time_text):
            stamp = util.get_timestamp_from_string(time_text) + int(time.localtime().tm_sec)
        else:
            stamp = 0
        body_text = util.get_paragraphs_from_selector(page, '#mymain')

        item.raw = page.text()
        item.title = headline
        item.t = time_text
        item.t_stamp = stamp
        item.fetched_at = task.fetched_at
        item.category = '新聞'
        item.author = ''
        item.content = body_text
        item.url = task.url
        item.source = 'AM730'
        item.task_no = self.BATCH_NUMBER
Пример #3
0
    def normal_item_solver(self, item, task, response):
        """Fill ``item`` from a Legislative Council webcast page.

        Title, time and category come from the parsed page.  The content is
        assembled from a second HTTP request to the ``GetTimeMarker`` service
        for the meeting id found in ``task.url``: one
        ``"<AgendaTime> - <AgendaItem>"`` line per entry of
        ``TimeMarkerItems``.

        :param item: news item to populate (mutated in place).
        :param task: crawl task supplying ``url`` and ``fetched_at``.
        :param response: HTTP response passed to ``self.get_doc``.
        """
        page = self.get_doc(response)

        headline = util.get_filtered_title(page, {'.PlaylistRow kanhanpass'})
        time_text = util.get_time_string_from_selectors(
            page, {'.PlaylistRow td'}, date_patterns={date_pattern})
        stamp = util.get_timestamp_from_string(time_text, '%Y/%m/%d %I:%M %p')
        stamp = stamp + int(time.localtime().tm_sec)
        section = page('#topMenu a.on').text()

        # NOTE(review): raises IndexError when the URL carries no id match
        meeting_id = re.findall(url_id_pattern, task.url)[0]
        marker_url = ('http://webcast.legco.gov.hk/Public_uat_embedded/Service1.asmx/GetTimeMarker?meetingID='
                      + meeting_id + '&lang=tc')
        markers = json.loads(requests.get(marker_url).text)
        agenda_text = ''.join(
            entry['AgendaTime'] + u' - ' + entry['AgendaItem'] + u'\n'
            for entry in markers['TimeMarkerItems'])

        item.raw = page.text()
        item.title = headline
        item.t = time_text
        item.t_stamp = stamp
        item.fetched_at = task.fetched_at
        item.category = section
        item.author = ''
        item.content = agenda_text
        item.url = task.url
        item.source = 'LegislativeCouncil'
        item.task_no = self.BATCH_NUMBER
Пример #4
0
    def normal_item_solver(self, item, task, response):
        """Populate ``item`` from a China Daily article page.

        Text fields come from the parsed page; lines of the form
        ``READMORE: ...`` are stripped from the content.  Each image inside
        ``div.contentbox`` that has a ``src`` is appended to
        ``item.media_list`` with its URL rebased onto the site root.

        :param item: news item to fill in; mutated in place.
        :param task: crawl task supplying ``url`` and ``fetched_at``.
        :param response: HTTP response handed to ``self.get_doc``.
        """
        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'h1.conttit'})
        t = util.get_time_string_from_selectors(doc, {'div.pubtime'})
        t_stamp = util.get_timestamp_from_string(t)
        category = 'hk'
        author = ''
        content = util.get_paragraphs_from_selector(doc, 'div.contentbox p')
        content = re.sub(r'READMORE\: .+\n', '', content)

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'ChinaDaily'
        item.task_no = self.BATCH_NUMBER
        for img in doc('div.contentbox img').items():
            src = img.attr('src')
            if not src:
                # attr() gives None for a missing attribute; passing that to
                # re.sub would raise TypeError, so skip the image instead.
                continue
            # Drop everything before "attachement" (spelling as used by the
            # pattern) and prefix the site root.
            media_u = 'http://www.chinadailyasia.com/' + re.sub(r'.+(?=attachement)', '', src)
            des = ''
            alt = img.attr('alt')
            if alt:
                des = alt
            else:
                # Fall back to the text of sibling paragraphs, if any
                captions = img.siblings('p')
                if captions:
                    des = captions.text()
            media = self.NewsItem.MediaItem(media_url=media_u, type='image', description=des,
                                            created_at=item.fetched_at)
            item.media_list.append(media)
Пример #5
0
    def normal_item_solver(self, item, task, response):
        """Fill ``item`` from a Commercial Radio (881903.com) news page.

        The content is taken from the first of three selectors that yields a
        result other than the empty string; if none does, the result of the
        last selector is kept.

        :param item: news item to populate (mutated in place).
        :param task: crawl task supplying ``url`` and ``fetched_at``.
        :param response: HTTP response passed to ``self.get_doc``.
        """
        page = self.get_doc(response)

        headline = util.get_filtered_title(page, {'title'}, u'881903.com 商業電台 - ')
        time_text = util.get_time_string_from_selectors(
            page, {'#divnewsTextDate', '#part6808_ctl00_lblDetailDate'})
        stamp = util.get_timestamp_from_string(time_text, '%d.%m.%Y %H:%M')
        stamp = stamp + int(time.localtime().tm_sec)
        section = page('#part8425_ctl00_divtitle').text()

        # Try the known content containers in order of preference
        body_text = ''
        for selector in ('#divnewsTextContent p', '#tdContent p', '.newsTextContent2'):
            body_text = util.get_paragraphs_from_selector(page, selector)
            if body_text != '':
                break

        item.raw = page.text()
        item.title = headline
        item.t = time_text
        item.t_stamp = stamp
        item.fetched_at = task.fetched_at
        item.category = section
        item.author = ''
        item.content = body_text
        item.url = task.url
        item.source = 'CommercialRadio'
        item.task_no = self.BATCH_NUMBER
Пример #6
0
    def normal_item_solver(self, item, task, response):

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'h1 a'})
        t = util.get_time_string_from_selectors(doc, {'span.postdate'})
        t_stamp = util.get_timestamp_from_string(t) + int(
            time.localtime().tm_sec)
        category = doc('span.postcat a').text()
        author = doc('span.postauthor a').text()
        content = util.get_paragraphs_from_selector(doc, 'div p')
        content = re.sub(ur'投稿:[.\n\r\t]*.*', u'', content, re.M | re.U | re.I)
        content = re.sub(ur'則留言[.\n\r\t]*', u'', content, re.M | re.U | re.I)
        content = re.sub(ur'大道之行也,天下為公,選賢與能,講信修睦。---《禮運.大同》[.\n\r\t]*', u'',
                         content, re.M | re.U | re.I)

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'VJMedia'
        item.task_no = self.BATCH_NUMBER
        for img in doc(
                '#container img.size-full, #container img.size-large').items():
            if img.attr('src') != '':
                des = ''
                if img.attr('alt'):
                    des = img.attr('alt')
                elif img.siblings('p'):
                    des = img.siblings('p').text()
                media = self.NewsItem.MediaItem(media_url=img.attr('src'),
                                                type='image',
                                                description=des,
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        for iframe in doc('iframe').items():
            if iframe.attr('src') and re.match(r'.*youtube\.com.+',
                                               iframe.attr('src')):
                media = self.NewsItem.MediaItem(media_url=iframe.attr('src'),
                                                type='youtube',
                                                description='youtube',
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        if util.within_active_interval(6, 1200):
            _comments = util.get_filtered_facebook_comments_data(
                '214585295294555',
                doc('div.fb-comments').attr('data-href'), task.url)
            if _comments:
                for _comment in _comments:
                    item.media_list.append(
                        self.NewsItem.MediaItem(
                            media_url=_comment['json_string'],
                            type='comments',
                            description='comments',
                            created_at=item.fetched_at))
Пример #7
0
    def normal_item_solver(self, item, task, response):
        """Populate ``item`` from a Bauhinia article page.

        The response encoding is forced to UTF-8 before parsing.  The
        timestamp is ``util.get_timestamp_from_string`` of the extracted time
        text plus the current local time of day in seconds.

        :param item: news item to fill in; mutated in place.
        :param task: crawl task supplying ``url`` and ``fetched_at``.
        :param response: HTTP response; its ``encoding`` is overwritten.
        """
        response.encoding = 'utf-8'
        doc = self.get_doc(response)

        title = doc('#page-h1').text()
        t = util.get_time_string_from_selectors(doc, {'h5 small'},
                                                {date_pattern})
        # Read the clock once: separate localtime() calls could straddle a
        # minute/hour rollover and mix fields from different instants.
        now = time.localtime()
        seconds_into_day = now.tm_hour * 3600 + now.tm_min * 60 + now.tm_sec
        t_stamp = util.get_timestamp_from_string(t) + seconds_into_day
        category = doc('h5 small a').text()
        author = ''
        content = util.get_paragraphs_from_selector(doc, '.content-show p')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'Bauhinia'
        item.task_no = self.BATCH_NUMBER
Пример #8
0
 def page_filter(self, doc, url):
     """Decide whether the page at ``url`` should be kept.

     The first pattern in ``self.reg_patterns`` that matches ``url`` settles
     the answer: the page is kept when the timestamp parsed from its
     ``span.time`` text is not older than ``util.get_day_stamp(self.OFFSET)``.
     Returns False when no pattern matches.
     """
     for pattern in self.reg_patterns:
         if not pattern.match(url):
             continue
         time_text = util.get_time_string_from_selectors(doc, {'span.time'})
         stamp = util.get_timestamp_from_string(time_text)
         return stamp >= util.get_day_stamp(self.OFFSET)
     return False
Пример #9
0
 def task_filter(self, doc, url, doc_url):
     """Decide whether ``url`` should be queued as a task.

     For each pattern in ``self.reg_patterns`` matching ``url``: accept
     immediately when the same pattern does not match ``doc_url``; otherwise
     accept when the timestamp parsed from the ``#article_date`` text of
     ``doc`` is not older than ``util.get_day_stamp(self.OFFSET)``.  A
     pattern that rejects does not stop the search; later patterns are still
     tried.  Returns False when nothing accepts.
     """
     for pattern in self.reg_patterns:
         if not pattern.match(url):
             continue
         if not pattern.match(doc_url):
             return True
         time_text = util.get_time_string_from_selectors(doc, {'#article_date'})
         stamp = util.get_timestamp_from_string(time_text)
         if stamp >= util.get_day_stamp(self.OFFSET):
             return True
     return False
Пример #10
0
 def page_filter(self, doc, url):
     """Decide whether the page at ``url`` should be kept.

     Returns True as soon as a pattern in ``self.reg_patterns`` matches
     ``url``, the page has a ``div.post_time`` element, and the timestamp
     parsed from it is not older than ``util.get_day_stamp()``.  Returns
     False otherwise.
     """
     # NOTE(review): get_day_stamp() is called with no offset here -- confirm
     # that ignoring any configured offset is intended.
     for pattern in self.reg_patterns:
         if pattern.match(url) and doc('div.post_time'):
             time_text = util.get_time_string_from_selectors(doc, {'div.post_time'})
             stamp = util.get_timestamp_from_string(time_text)
             if stamp >= util.get_day_stamp():
                 return True
     return False
Пример #11
0
 def page_filter(self, doc, url):
     """Decide whether the page at ``url`` should be kept.

     The first pattern in ``self.reg_patterns`` that matches ``url`` settles
     the answer: the page is kept when its date text, parsed with the format
     ``'%d.%m.%Y %H:%M'``, is not older than
     ``util.get_day_stamp(self.OFFSET)``.  Returns False when no pattern
     matches.
     """
     date_selectors = {'#divnewsTextDate', '#part6808_ctl00_lblDetailDate'}
     for pattern in self.reg_patterns:
         if not pattern.match(url):
             continue
         time_text = util.get_time_string_from_selectors(doc, date_selectors)
         stamp = util.get_timestamp_from_string(time_text, '%d.%m.%Y %H:%M')
         return stamp >= util.get_day_stamp(self.OFFSET)
     return False
Пример #12
0
    def normal_item_solver(self, item, task, response):

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'title'}, ur' - .*')
        t = ''
        t_stamp = 0
        if doc('time') or doc('span.date'):
            t = util.get_time_string_from_selectors(doc, {'time', 'span.date'})
            t_stamp = util.get_timestamp_from_string(t) + int(time.localtime().tm_sec)
        category = re.sub(ur'.*\s+', u'', doc('.dropdown-menu li.active a').text())
        author = ''
        content = util.get_paragraphs_from_selector(doc, '#news-content')
        if content == '':
            content = util.get_paragraphs_from_selector(doc, '.content span')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'HeadlineNews'
        item.task_no = self.BATCH_NUMBER
        for img in doc('.content .item img').items():
            if img.attr('src') != '':
                media_u = img.attr('src')
                if re.match(r'//.+', media_u):
                    media_u = 'http:' + media_u
                des = ''
                if img.attr('alt'):
                    des = img.attr('alt')
                media = self.NewsItem.MediaItem(media_url=media_u, type='image', description=des,
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        for a in doc('iframe').items():
            if a.attr('src') and re.match(r'.*youtube\.com.+', a.attr('src')):
                media_u = a.attr('src')
                if re.match(r'//.+', media_u):
                    media_u = 'http:' + media_u
                media = self.NewsItem.MediaItem(media_url=media_u, type='youtube', description='youtube',
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        if util.within_active_interval(12, 600):
            _comments = util.get_filtered_facebook_comments_data('978368502211772',
                                                                 doc('div.fb-comments').attr('data-href'), task.url)
            if _comments:
                for _comment in _comments:
                    item.media_list.append(
                        self.NewsItem.MediaItem(media_url=_comment['json_string'], type='comments',
                                                description='comments', created_at=item.fetched_at)
                    )
Пример #13
0
 def page_filter(self, doc, url):
     """Decide whether the page at ``url`` should be kept.

     The first pattern in ``self.reg_patterns`` that matches ``url`` settles
     the answer: the page is kept when the time text taken from
     ``.PlaylistRow td`` (parsed as ``'%Y/%m/%d %I:%M %p'``) is not older
     than ``util.get_day_stamp(self.OFFSET)``.  Returns False when no
     pattern matches.
     """
     for pattern in self.reg_patterns:
         if not pattern.match(url):
             continue
         time_text = util.get_time_string_from_selectors(
             doc, {'.PlaylistRow td'}, date_patterns={date_pattern})
         stamp = util.get_timestamp_from_string(time_text, '%Y/%m/%d %I:%M %p')
         return stamp >= util.get_day_stamp(self.OFFSET)
     return False
Пример #14
0
 def page_filter(self, doc, url):
     """Decide whether the page at ``url`` should be kept.

     The first pattern in ``self.reg_patterns`` that matches ``url`` settles
     the answer.  A page without any ``td`` element is rejected; otherwise
     it is kept when its time text, parsed as ``'%Y-%m-%d %H:%M:%S'``, is
     not older than ``util.get_month_day_timestamp(self.OFFSET)``.  Returns
     False when no pattern matches.
     """
     for pattern in self.reg_patterns:
         if not pattern.match(url):
             continue
         if not doc('td'):
             return False
         time_text = util.get_time_string_from_selectors(doc, {'td'}, {date_pattern})
         stamp = util.get_timestamp_from_string(time_text, '%Y-%m-%d %H:%M:%S')
         return stamp >= util.get_month_day_timestamp(self.OFFSET)
     return False
Пример #15
0
    def normal_item_solver(self, item, task, response):
        """Begin populating ``item`` from the parsed article page.

        Sets ``item.raw``, ``item.title`` and ``item.t``.  ``item.t_stamp``
        defaults to 0 and is replaced only when the time text contains the
        marker that is rewritten to ``AM`` below.

        NOTE(review): only the AM marker is handled in the visible code and
        no other item fields are set; the listing may be cut off here --
        confirm against the full source.
        """

        doc = self.get_doc(response)

        item.raw = doc.text()
        item.title = util.get_filtered_title(doc, {'h1'})
        item.t = util.get_time_string_from_selectors(doc, {'h6'})
        item.t_stamp = 0
        # Swap the marker for "AM" so the time text fits the %p directive
        if re.findall(ur'上午', item.t):
            item.t_stamp = util.get_timestamp_from_string(
                re.sub(ur'上午', u'AM', item.t), u'%Y年%m月%d日 %p%I:%M')
Пример #16
0
 def page_filter(self, doc, url):
     """Decide whether the page at ``url`` should be kept.

     Returns True as soon as a pattern in ``self.reg_patterns`` matches
     ``url``, the page has a ``div.video_date span`` element, and the
     timestamp parsed from it is not older than
     ``util.get_day_stamp(self.OFFSET)``.  Returns False otherwise.
     """
     date_selector = 'div.video_date span'
     for pattern in self.reg_patterns:
         if pattern.match(url) and doc(date_selector):
             time_text = util.get_time_string_from_selectors(
                 doc, {date_selector}, date_patterns={date_pattern})
             stamp = util.get_timestamp_from_string(time_text)
             if stamp >= util.get_day_stamp(self.OFFSET):
                 return True
     return False
Пример #17
0
 def page_filter(self, doc, url):
     """Decide whether the page at ``url`` should be kept.

     Returns True as soon as a pattern in ``self.reg_patterns`` matches
     ``url``, ``date_pattern`` finds something in the ``span.pull-right``
     text, and the timestamp parsed from that date is not older than
     ``util.get_day_stamp(self.OFFSET)``.  Returns False otherwise.
     """
     for pattern in self.reg_patterns:
         if not pattern.match(url):
             continue
         if not date_pattern.findall(doc('span.pull-right').text()):
             continue
         time_text = util.get_time_string_from_selectors(
             doc, {'span.pull-right'}, date_patterns={date_pattern})
         stamp = util.get_timestamp_from_string(
             time_text, time_format=u'%Y 年 %m 月 %d 日')
         if stamp >= util.get_day_stamp(self.OFFSET):
             return True
     return False
Пример #18
0
    def normal_item_solver(self, item, task, response):
        """Extract title, time and category for an article page.

        The category is taken from the URL: the path segment following
        ``hongkong/`` of the form ``<digits>-<name>``, of which the part
        after the first hyphen is kept.  It stays ``''`` when the URL has no
        such segment.

        NOTE(review): the visible code computes locals but never stores them
        on ``item``; the listing may be cut off here -- confirm against the
        full source.
        """

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'title'}, ur' \| .*')
        t = util.get_time_string_from_selectors(doc, {'p.dateFormat'})
        # Adds the current local seconds to the parsed timestamp
        t_stamp = util.get_timestamp_from_string(t) + int(
            time.localtime().tm_sec)
        category = ''
        # The same pattern is evaluated twice: once as a guard, once to fetch
        if re.findall(ur'(?<=hongkong/)\d+-.+?(?=/\d+.*)', task.url):
            cat_part = re.findall(ur'(?<=hongkong/)\d+-.+?(?=/\d+.*)',
                                  task.url)[0]
            # Keep everything after the first hyphen of "<digits>-<name>"
            category = re.findall(ur'(?<=-).*', cat_part)[0]
Пример #19
0
 def page_filter(self, doc, url):
     """Decide whether the page at ``url`` should be kept.

     The first pattern in ``self.reg_patterns`` that matches ``url`` settles
     the answer.  The ``div.dateforarticle`` text is converted with
     ``self._get_timestamp_from_relative_time_str`` when it matches
     ``relative_time_pattern``, with ``util.get_timestamp_from_string`` when
     it matches ``absolute_time_pattern``, and counts as 0 otherwise.  The
     page is kept when that timestamp is not older than
     ``util.get_day_stamp(self.OFFSET)``.  Returns False when no pattern
     matches.
     """
     for pattern in self.reg_patterns:
         if not pattern.match(url):
             continue
         time_text = util.get_time_string_from_selectors(
             doc, {'div.dateforarticle'})
         if relative_time_pattern.match(time_text):
             stamp = self._get_timestamp_from_relative_time_str(time_text)
         elif absolute_time_pattern.match(time_text):
             stamp = util.get_timestamp_from_string(time_text)
         else:
             stamp = 0
         return stamp >= util.get_day_stamp(self.OFFSET)
     return False
Пример #20
0
    def normal_item_solver(self, item, task, response):
        """Extract title, time and category for an article page.

        When ``task.url`` contains a ``coluid=<digits>`` parameter, the
        category is the text of the last ``<a ...>`` fragment mentioning that
        parameter inside the page's script text; otherwise it stays ``''``.

        NOTE(review): the visible code computes locals but never stores them
        on ``item``; the listing may be cut off here -- confirm against the
        full source.
        """

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'title'}, u'中國評論新聞:')
        t = util.get_time_string_from_selectors(doc, {'td'}, {date_pattern})
        t_stamp = util.get_timestamp_from_string(t, '%Y-%m-%d %H:%M:%S')
        category = ''
        # Concatenated text of every <script> element on the page
        scripts = doc('script').text()
        if re.findall(ur'coluid=\d+', task.url):
            col_id_str = re.findall(ur'coluid=\d+', task.url)[0]
            # NOTE(review): [-1] raises IndexError when no anchor in the
            # script text mentions this column id -- confirm that cannot occur.
            cat_block = re.findall(r'<a[^<>]*?' + col_id_str + '.+?</a>', scripts)[-1]
            # Parse the anchor fragment so only its text content is kept
            cat_doc = pq(cat_block)
            category = cat_doc.text()
Пример #21
0
 def page_filter(self, doc, url):
     """Decide whether the page at ``url`` should be kept.

     Returns True when a pattern in ``self.reg_patterns`` matches ``url``,
     the page has an ``h6`` element, and the timestamp parsed from its text
     is not older than ``util.get_day_stamp(offset=self.OFFSET)``.  The time
     text is parsed only when it contains one of the two markers rewritten
     to ``AM`` / ``PM`` below; otherwise the timestamp counts as 0.

     NOTE(review): ``wanted`` is assigned but never returned in the visible
     code, so every other path falls off the end and yields None; the
     listing may be cut off here -- confirm against the full source.
     """
     wanted = False
     for reg_pattern in self.reg_patterns:
         if reg_pattern.match(url):
             if doc('h6'):
                 t = util.get_time_string_from_selectors(doc, {'h6'})
                 t_stamp = 0
                 # Swap the marker for AM/PM so the text fits the %p directive
                 if re.findall(ur'上午', t):
                     t_stamp = util.get_timestamp_from_string(
                         re.sub(ur'上午', u'AM', t), u'%Y年%m月%d日 %p%I:%M')
                 elif re.findall(ur'下午', t):
                     t_stamp = util.get_timestamp_from_string(
                         re.sub(ur'下午', u'PM', t), u'%Y年%m月%d日 %p%I:%M')
                 if t_stamp >= util.get_day_stamp(offset=self.OFFSET):
                     return True
Пример #22
0
    def normal_item_solver(self, item, task, response):
        """Populate ``item`` from a DMHK article page.

        ``item.t`` holds the ``div.post_time`` text, while ``item.t_stamp``
        is set to ``util.get_now()`` rather than being parsed from that text.

        :param item: news item to fill in; mutated in place.
        :param task: crawl task supplying ``url`` and ``fetched_at``.
        :param response: HTTP response handed to ``self.get_doc``.
        """
        doc = self.get_doc(response)

        item.raw = doc.text()
        item.title = util.get_filtered_title(doc, {'h1'})
        item.t = util.get_time_string_from_selectors(doc, {'div.post_time'})
        item.t_stamp = util.get_now()
        # Was ``task.feteched_at`` -- a misspelling of the attribute name
        item.fetched_at = task.fetched_at
        item.category = doc('div.post_cats a:last-child').text()
        item.author = doc('.single_author a').text()
        item.content = util.get_paragraphs_from_selector(doc, 'div.single_text p')
        item.url = task.url
        item.source = 'DMHK'
        item.task_no = self.BATCH_NUMBER
Пример #23
0
    def normal_item_solver(self, item, task, response):

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'title'}, ur'\|.*')
        t = util.get_time_string_from_selectors(doc, {'span.posted-time'})
        t_stamp = util.get_timestamp_from_string(
            t) + time.localtime().tm_hour * 3600 + time.localtime(
            ).tm_min * 60 + time.localtime().tm_sec
        category = doc('span.channel-section').text()
        author = ''
        content = util.get_paragraphs_from_selector(doc,
                                                    'div.article-content p')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.redirected_url
        item.source = 'Initium'
        item.task_no = self.BATCH_NUMBER
        for img in doc('.main-content .image img').items():
            if img.attr('src') != '':
                media_u = img.attr('src')
                if re.match(r'//.+', media_u):
                    media_u = 'http:' + media_u
                des = ''
                if img.attr('alt'):
                    des = img.attr('alt')
                media = self.NewsItem.MediaItem(media_url=media_u,
                                                type='image',
                                                description=des,
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        for a in doc('iframe').items():
            if a.attr('src') and re.match(r'.*youtube\.com.+', a.attr('src')):
                media_u = a.attr('src')
                if re.match(r'//.+', media_u):
                    media_u = 'http:' + media_u
                media = self.NewsItem.MediaItem(media_url=media_u,
                                                type='youtube',
                                                description='youtube',
                                                created_at=item.fetched_at)
                item.media_list.append(media)
Пример #24
0
 def task_filter(self, doc, url, doc_url):
     """Decide whether ``url`` should be queued as a task.

     The first pattern in ``self.reg_patterns`` that matches ``url`` settles
     the answer.  The date is taken from ``url`` itself when
     ``year_date_pattern`` finds one, otherwise from the ``time`` /
     ``span.date`` element of ``doc``; when neither source exists the URL is
     accepted.  With a date, the URL is accepted when the parsed timestamp
     is not older than ``util.get_day_stamp(self.OFFSET)``.  Returns False
     when no pattern matches.
     """
     for pattern in self.reg_patterns:
         if not pattern.match(url):
             continue
         dates_in_url = year_date_pattern.findall(url)
         if dates_in_url:
             time_text = dates_in_url[0]
         elif doc('time') or doc('span.date'):
             time_text = util.get_time_string_from_selectors(doc, {'time', 'span.date'})
         else:
             # No date available anywhere: let the task through
             return True
         stamp = util.get_timestamp_from_string(time_text)
         return stamp >= util.get_day_stamp(self.OFFSET)
     return False
Пример #25
0
    def normal_item_solver(self, item, task, response):

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'title'}, ur' - RTHK')
        t = util.get_time_string_from_selectors(doc, {'div.createddate'})
        t_stamp = util.get_timestamp_from_string(
            t, '%Y-%m-%d HKT %H:%M') + time.localtime().tm_sec
        category = ''
        if cat_pattern.findall(doc('script').text()):
            category = cat_pattern.findall(doc('script').text())[-1]
        author = ''
        content = util.get_paragraphs_from_selector(doc, 'div.itemFullText')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'RTHK'
        item.task_no = self.BATCH_NUMBER
        for img in doc('img.imgPhotoAfterLoad').items():
            if img.attr('src') != '':
                media_u = img.attr('src')
                des = ''
                if img.attr('alt'):
                    des = img.attr('alt')
                media = self.NewsItem.MediaItem(media_url=media_u,
                                                type='image',
                                                description=des,
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        for a in doc('iframe').items():
            if a.attr('src') and re.match(r'.*youtube\.com.+', a.attr('src')):
                media_u = a.attr('src')
                if re.match(r'//.+', media_u):
                    media_u = 'http:' + media_u
                media = self.NewsItem.MediaItem(media_url=media_u,
                                                type='youtube',
                                                description='youtube',
                                                created_at=item.fetched_at)
                item.media_list.append(media)
Пример #26
0
    def normal_item_solver(self, item, task, response):
        """Populate ``item`` from a WSJ article page.

        The category is the text of the ``.article-breadCrumb a`` links when
        present, otherwise ``''``.  Images under ``#A .media-object-image``
        and YouTube iframes are appended to ``item.media_list``.

        :param item: news item to fill in; mutated in place.
        :param task: crawl task supplying ``url`` and ``fetched_at``.
        :param response: HTTP response handed to ``self.get_doc``.
        """
        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'h1'})
        t = util.get_time_string_from_selectors(doc, {'time.timestamp'})
        t_stamp = util.get_timestamp_from_string(t)
        category = ''
        breadcrumb = doc('.article-breadCrumb a')
        if breadcrumb:
            category = breadcrumb.text()
        author = doc('.author span').text()
        content = util.get_paragraphs_from_selector(doc, '#A p')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'WSJ'
        item.task_no = self.BATCH_NUMBER
        for img in doc('#A .media-object-image img').items():
            media_u = img.attr('src')
            if not media_u:
                # attr() gives None for a missing attribute; skip images
                # that have no usable URL.
                continue
            des = img.attr('alt') or ''
            media = self.NewsItem.MediaItem(media_url=media_u,
                                            type='image',
                                            description=des,
                                            created_at=item.fetched_at)
            item.media_list.append(media)
        for a in doc('iframe').items():
            src = a.attr('src')
            if src and re.match(r'.*youtube\.com.+', src):
                media_u = src
                # Protocol-relative URL: give it an explicit scheme
                if re.match(r'//.+', media_u):
                    media_u = 'http:' + media_u
                media = self.NewsItem.MediaItem(media_url=media_u,
                                                type='youtube',
                                                description='youtube',
                                                created_at=item.fetched_at)
                item.media_list.append(media)
Пример #27
0
    def normal_item_solver(self, item, task, response):

        doc = self.get_doc(response)

        item.raw = doc.text()
        item.title = util.get_filtered_title(doc, {'title'},
                                             u'東周網【東周刊官方網站】| - .+')
        item.t = util.get_time_string_from_selectors(
            doc, {'span.pull-right'}, date_patterns={date_pattern})
        item.t_stamp = util.get_timestamp_from_string(
            item.t, time_format=u'%Y 年 %m 月 %d 日') + time.localtime(
            ).tm_hour * 3600 + time.localtime().tm_min * 60 + time.localtime(
            ).tm_sec
        item.fetched_at = task.fetched_at
        item.category = re.sub(ur'.*\s+', u'', doc('.default-group a').text())
        item.author = ''
        item.content = util.get_paragraphs_from_selector(
            doc, 'div.view-content p')
        item.url = task.url
        item.source = 'EastWeek'
        item.task_no = self.BATCH_NUMBER
Пример #28
0
    def normal_item_solver(self, item, task, response):
        """Fill ``item`` from a TVB news page.

        The title is the ``h4`` text after its ``span`` children have been
        removed; the timestamp is the parsed ``span.time`` text plus the
        current local seconds.

        :param item: news item to populate (mutated in place).
        :param task: crawl task supplying ``url`` and ``fetched_at``.
        :param response: HTTP response passed to ``self.get_doc``.
        """
        page = self.get_doc(response)

        # The time is read before the title on purpose: remove('span') below
        # presumably edits the parsed tree in place -- keep this ordering.
        time_text = util.get_time_string_from_selectors(page, {'span.time'})
        stamp = util.get_timestamp_from_string(time_text)
        stamp = stamp + int(time.localtime().tm_sec)
        heading = page('h4').remove('span')
        headline = heading.text()
        section = page('#topMenu a.on').text()
        body_text = util.get_paragraphs_from_selector(page, '#c1_afterplayer pre')

        item.raw = page.text()
        item.title = headline
        item.t = time_text
        item.t_stamp = stamp
        item.fetched_at = task.fetched_at
        item.category = section
        item.author = ''
        item.content = body_text
        item.url = task.url
        item.source = 'TVB'
        item.task_no = self.BATCH_NUMBER
Пример #29
0
    def normal_item_solver(self, item, task, response):
        """Fill ``item`` from an HKCNA article page.

        The response encoding is forced to UTF-8 before parsing.  The
        category is the ``h2 b`` text with its last two characters dropped.

        :param item: news item to populate (mutated in place).
        :param task: crawl task supplying ``url`` and ``fetched_at``.
        :param response: HTTP response; its ``encoding`` is overwritten.
        """
        response.encoding = 'utf-8'
        page = self.get_doc(response)

        headline = util.get_filtered_title(page, {'title'})
        time_text = util.get_time_string_from_selectors(page, {'.tm'}, date_patterns)
        stamp = util.get_timestamp_from_string(time_text)
        section_label = page('h2 b').text()
        section = section_label[:-2]
        byline = page('.fdr span').text()
        body_text = util.get_paragraphs_from_selector(page, '.wz_nr p')

        item.raw = page.text()
        item.title = headline
        item.t = time_text
        item.t_stamp = stamp
        item.fetched_at = task.fetched_at
        item.category = section
        item.author = byline
        item.content = body_text
        item.url = task.url
        item.source = 'HKCNA'
        item.task_no = self.BATCH_NUMBER
Пример #30
0
 def get_auto_configured_spider(cls, offset=0):
     legco_seed = {'http://webcast.legco.gov.hk/public/zh-hk/SearchResult'}
     r = requests.get(
         'http://webcast.legco.gov.hk/public/zh-hk/SearchResult')
     headers = r.headers
     headers[
         'Referer'] = 'http://webcast.legco.gov.hk/public/zh-hk/SearchResult'
     _page = 1
     while True:
         r = requests.get(
             'http://webcast.legco.gov.hk/public/zh-hk/SearchResult?page=' +
             str(_page),
             headers=headers)
         d = pq(r.text)
         t = util.get_time_string_from_selectors(
             d, {'tr.PlaylistRow td'}, date_patterns={date_pattern})
         t_stamp = util.get_timestamp_from_string(t, '%Y/%m/%d %I:%M %p')
         if t_stamp >= util.get_day_stamp(offset):
             _page += 1
             for entry in d('tr.PlaylistRow').items():
                 if re.findall(entry_id_pattern, entry.attr('id')):
                     c_id = re.findall(entry_id_pattern,
                                       entry.attr('id'))[0]
                     legco_seed.add(
                         'http://webcast.legco.gov.hk/public/zh-hk/SearchResult?MeetingID='
                         + c_id)
         else:
             break
     spider_legco = SpiderLegco(
         'SpiderLegco',
         legco_seed, {
             ur'http://webcast\.legco\.gov\.hk/public/zh-hk/SearchResult\?MeetingID=.+'
         },
         THREAD_NUM=5)
     spider_legco.OFFSET = offset
     spider_legco.BATCH_NUMBER = util.get_day_stamp() + 10300
     return spider_legco