示例#1
0
    def normal_item_solver(self, item, task, response):

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'h1 a'})
        t = util.get_time_string_from_selectors(doc, {'span.postdate'})
        t_stamp = util.get_timestamp_from_string(t) + int(
            time.localtime().tm_sec)
        category = doc('span.postcat a').text()
        author = doc('span.postauthor a').text()
        content = util.get_paragraphs_from_selector(doc, 'div p')
        content = re.sub(ur'投稿:[.\n\r\t]*.*', u'', content, re.M | re.U | re.I)
        content = re.sub(ur'則留言[.\n\r\t]*', u'', content, re.M | re.U | re.I)
        content = re.sub(ur'大道之行也,天下為公,選賢與能,講信修睦。---《禮運.大同》[.\n\r\t]*', u'',
                         content, re.M | re.U | re.I)

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'VJMedia'
        item.task_no = self.BATCH_NUMBER
        for img in doc(
                '#container img.size-full, #container img.size-large').items():
            if img.attr('src') != '':
                des = ''
                if img.attr('alt'):
                    des = img.attr('alt')
                elif img.siblings('p'):
                    des = img.siblings('p').text()
                media = self.NewsItem.MediaItem(media_url=img.attr('src'),
                                                type='image',
                                                description=des,
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        for iframe in doc('iframe').items():
            if iframe.attr('src') and re.match(r'.*youtube\.com.+',
                                               iframe.attr('src')):
                media = self.NewsItem.MediaItem(media_url=iframe.attr('src'),
                                                type='youtube',
                                                description='youtube',
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        if util.within_active_interval(6, 1200):
            _comments = util.get_filtered_facebook_comments_data(
                '214585295294555',
                doc('div.fb-comments').attr('data-href'), task.url)
            if _comments:
                for _comment in _comments:
                    item.media_list.append(
                        self.NewsItem.MediaItem(
                            media_url=_comment['json_string'],
                            type='comments',
                            description='comments',
                            created_at=item.fetched_at))
示例#2
0
    def normal_item_solver(self, item, task, response):

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'title'}, ur' - 香港新浪')
        t = doc('div.news-datetime').text()
        t_stamp = int(
            time.mktime(time.strptime(t.encode('utf-8'), '%Y年%m月%d日 %H:%M')))
        scripts = doc('script').text()
        category = ''
        if re.findall(cat_pattern, scripts):
            category = re.findall(cat_pattern, scripts)[0]
        author = ''
        content = util.get_paragraphs_from_selector(doc, 'div.news-body p')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'Sina'
        item.task_no = self.BATCH_NUMBER
        for img in doc('.news-body img').items():
            if img.attr('src') != '':
                media_u = img.attr('src')
                if not ad_url_pattern.findall(media_u):
                    des = ''
                    if img.parent().attr('data-caption'):
                        des = img.parent().attr('data-caption')
                    media = self.NewsItem.MediaItem(media_url=media_u,
                                                    type='image',
                                                    description=des,
                                                    created_at=item.fetched_at)
                    item.media_list.append(media)
        for a in doc('iframe').items():
            if a.attr('src') and re.match(r'.*youtube\.com.+', a.attr('src')):
                media_u = a.attr('src')
                if re.match(r'//.+', media_u):
                    media_u = 'http:' + media_u
                media = self.NewsItem.MediaItem(media_url=media_u,
                                                type='youtube',
                                                description='youtube',
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        if util.within_active_interval(6, 1200):
            _comments = util.get_filtered_facebook_comments_data(
                '114907575364430',
                doc('div.fb-comments').attr('data-href'), task.url)
            if _comments:
                for _comment in _comments:
                    item.media_list.append(
                        self.NewsItem.MediaItem(
                            media_url=_comment['json_string'],
                            type='comments',
                            description='comments',
                            created_at=item.fetched_at))
示例#3
0
    def normal_item_solver(self, item, task, response):

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'title'}, ur'\s*\|.*')
        t = doc('div.article-content-wrap p.date').text()
        t_stamp = int(
            time.mktime(time.strptime(t.encode('utf-8'), '%Y/%m/%d — %H:%M')))
        cat_href = cat_pattern.findall(task.url)[0]
        category = doc('ul[id=mainMenuUL] a[href="' + cat_href + '"]').text()
        author = ''
        content = util.get_paragraphs_from_selector(doc.remove('style'),
                                                    'div.article-content p')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'StandNews'
        item.task_no = self.BATCH_NUMBER
        for img in doc('.article-content-wrap img').items():
            if img.attr('src') != '' and not ad_image_pattern.match(
                    img.attr('src')):
                des = ''
                if img.attr('alt'):
                    des = img.attr('alt')
                media = self.NewsItem.MediaItem(media_url=img.attr('src'),
                                                type='image',
                                                description=des,
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        for iframe in doc('.article-content-wrap iframe').items():
            if iframe.attr('src') and youtube_pattern.match(
                    iframe.attr('src')):
                media = self.NewsItem.MediaItem(media_url=iframe.attr('src'),
                                                type='youtube',
                                                description='youtube',
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        if util.within_active_interval(6, 1200):
            _comments = util.get_filtered_facebook_comments_data(
                '1534089350179685',
                doc('div.fb-comments').attr('data-href'), task.url)
            if _comments:
                for _comment in _comments:
                    item.media_list.append(
                        self.NewsItem.MediaItem(
                            media_url=_comment['json_string'],
                            type='comments',
                            description='comments',
                            created_at=item.fetched_at))
示例#4
0
    def normal_item_solver(self, item, task, response):

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'title'}, ur'熱血時報 \| ')
        author = doc('span.author a').text()
        t = doc('time[class="published"]').text()
        t_stamp = int(time.mktime(time.strptime(str(doc('time[class="published"]').text()), "%m-%d-%Y")))
        category = doc('div.page-path a').text()
        doc.remove('script')
        doc.remove('style')
        content = util.get_paragraphs_from_selector(doc, 'div.article-body p')
        if content == '':
            content = util.get_paragraphs_from_selector(doc, 'div.article-body')
        if t_stamp >= int(time.mktime(time.strptime(str(time.localtime().tm_year) + str(time.localtime().tm_mon) + str(
                time.localtime().tm_mday), '%Y%m%d'))):
            t_stamp = int(time.time())

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'PassionTimes'
        item.task_no = self.BATCH_NUMBER
        for img in doc('.article-body img').items():
            if img.attr('src') != '':
                media_u = img.attr('src')
                if re.match(r'//.+', media_u):
                    media_u = 'http:' + media_u
                elif not re.match(r'http://.+', media_u):
                    media_u = 'http://www.passiontimes.hk' + media_u
                media = self.NewsItem.MediaItem(media_url=media_u, type='image', description='',
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        for a in doc('iframe').items():
            if a.attr('src') and re.match(r'.*youtube\.com.+', a.attr('src')):
                media_u = a.attr('src')
                if re.match(r'//.+', media_u):
                    media_u = 'http:' + media_u
                media = self.NewsItem.MediaItem(media_url=media_u, type='youtube', description='youtube',
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        if util.within_active_interval(6, 1200):
            _comments = util.get_filtered_facebook_comments_data('462543587117177',
                                                                 doc('div.fb-comments').attr('data-href'), task.url)
            if _comments:
                for _comment in _comments:
                    item.media_list.append(
                        self.NewsItem.MediaItem(media_url=_comment['json_string'], type='comments',
                                                description='comments', created_at=item.fetched_at)
                    )
示例#5
0
    def normal_item_solver(self, item, task, response):

        doc = self.get_doc(response)

        item.raw = doc.text()
        item.title = util.get_filtered_title(doc, {'.article_tit h1'}, ur'\s*|.*')
        if item.title == '':
            item.title = util.get_filtered_title(doc, {'title'}, ur'\s*|.*')
        item.t = doc('meta[name=artpdate]').attr('content')
        item.t_stamp = int(time.mktime(time.strptime(item.t, "%Y-%m-%d %H:%M:%S")))
        item.fetched_at = task.fetched_at
        item.category = doc('meta[name=catname]').attr('content')
        item.author = doc('meta[name=author]').attr('content')
        content = util.get_paragraphs_from_selector(doc, 'div.article_content__module p')
        if content == '':
            content = util.get_paragraphs_from_selector(doc, 'li.article_summary_pt')
        item.content = content
        item.url = task.url
        item.source = 'HK01'
        item.task_no = self.BATCH_NUMBER
        for img in doc('.article__body__content img').items():
            if img.attr('src') != '' and not img.parents('.related_article'):
                media_u = img.attr('src')
                if media_u != '//cdn.hk01.com/media/dummy/default_image.png':
                    des = ''
                    if img.attr('alt'):
                        des = img.attr('alt')
                    media = self.NewsItem.MediaItem(media_url=media_u, type='image', description=des,
                                                    created_at=item.fetched_at)
                    item.media_list.append(media)
        for img in doc('.article__body__content object[data-gallery-image="true"]').items():
            if img.attr('data'):
                media_u = img.attr('data')
                des = ''
                if img.attr('alt'):
                    des = img.attr('alt')
                media = self.NewsItem.MediaItem(media_url=media_u, type='image', description=des,
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        for a in doc('iframe').items():
            if a.attr('src') and re.match(r'.*youtube\.com.+', a.attr('src')):
                media_u = a.attr('src')
                if re.match(r'//.+', media_u):
                    media_u = 'http:' + media_u
                media = self.NewsItem.MediaItem(media_url=media_u, type='youtube', description='youtube',
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        if util.within_active_interval(6, 1200):
            _comments = util.get_filtered_facebook_comments_data('1651866545051541', doc('div.fb-comments').attr('data-href'), task.url)
            if _comments:
                for _comment in _comments:
                    item.media_list.append(
                        self.NewsItem.MediaItem(media_url=_comment['json_string'], type='comments', description='comments', created_at=item.fetched_at)
                )
        '''
示例#6
0
    def normal_item_solver(self, item, task, response):

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'title'}, ur' - .*')
        t = ''
        t_stamp = 0
        if doc('time') or doc('span.date'):
            t = util.get_time_string_from_selectors(doc, {'time', 'span.date'})
            t_stamp = util.get_timestamp_from_string(t) + int(time.localtime().tm_sec)
        category = re.sub(ur'.*\s+', u'', doc('.dropdown-menu li.active a').text())
        author = ''
        content = util.get_paragraphs_from_selector(doc, '#news-content')
        if content == '':
            content = util.get_paragraphs_from_selector(doc, '.content span')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'HeadlineNews'
        item.task_no = self.BATCH_NUMBER
        for img in doc('.content .item img').items():
            if img.attr('src') != '':
                media_u = img.attr('src')
                if re.match(r'//.+', media_u):
                    media_u = 'http:' + media_u
                des = ''
                if img.attr('alt'):
                    des = img.attr('alt')
                media = self.NewsItem.MediaItem(media_url=media_u, type='image', description=des,
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        for a in doc('iframe').items():
            if a.attr('src') and re.match(r'.*youtube\.com.+', a.attr('src')):
                media_u = a.attr('src')
                if re.match(r'//.+', media_u):
                    media_u = 'http:' + media_u
                media = self.NewsItem.MediaItem(media_url=media_u, type='youtube', description='youtube',
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        if util.within_active_interval(12, 600):
            _comments = util.get_filtered_facebook_comments_data('978368502211772',
                                                                 doc('div.fb-comments').attr('data-href'), task.url)
            if _comments:
                for _comment in _comments:
                    item.media_list.append(
                        self.NewsItem.MediaItem(media_url=_comment['json_string'], type='comments',
                                                description='comments', created_at=item.fetched_at)
                    )
示例#7
0
    def normal_item_solver(self, item, task, response):
        """Populate ``item`` from a fetched NowNews article page.

        The response is parsed with ``self.get_doc``; title, time, category
        and body text are copied onto ``item`` (author is left empty), and
        comment entries are appended to ``item.media_list``.

        :param item: news item that is filled in place.
        :param task: crawl task; ``task.url`` and ``task.fetched_at`` are read.
        :param response: fetched response handed to ``self.get_doc``.
        """
        doc = self.get_doc(response)

        headline = util.get_filtered_title(doc, {'h1'})
        time_text = doc('time.published').text()
        # The timestamp comes from the element's ``datetime`` attribute, not
        # from its visible text.
        stamp = util.get_timestamp_from_string(
            doc('time.published').attr('datetime'), '%Y-%m-%d %H:%M:%S+0800')

        # Category: active navigation entry first, URL pattern as fallback.
        section = doc('#navBar li.active').text()
        if section == '':
            url_sections = cat_pattern_in_url.findall(task.url)
            if url_sections:
                section = url_sections[0]

        # Body text: paragraphs first, whole container as fallback.
        body = util.get_paragraphs_from_selector(doc, 'div.newsLeading p')
        if body == '':
            body = util.get_paragraphs_from_selector(doc, 'div.newsLeading')

        item.raw = doc.text()
        item.title = headline
        item.t = time_text
        item.t_stamp = stamp
        item.fetched_at = task.fetched_at
        item.category = section
        item.author = ''
        item.content = body
        item.url = task.url
        item.source = 'NowNews'
        item.task_no = self.BATCH_NUMBER

        # Comments are fetched only when util.within_active_interval allows
        # it (the meaning of its arguments is defined in util).
        if not util.within_active_interval(6, 1200):
            return
        fb_comments = util.get_filtered_facebook_comments_data(
            '515076798590105',
            doc('div.fb-comments').attr('data-href'), task.url)
        for entry in fb_comments or ():
            comment_media = self.NewsItem.MediaItem(
                media_url=entry['json_string'],
                type='comments',
                description='comments',
                created_at=item.fetched_at)
            item.media_list.append(comment_media)
示例#8
0
    def normal_item_solver(self, item, task, response):
        """Populate ``item`` from a fetched CableNews video page.

        The response is parsed with ``self.get_doc``; title, time, category
        and body text are copied onto ``item`` (author is left empty), and
        comment entries are appended to ``item.media_list``.

        :param item: news item that is filled in place.
        :param task: crawl task; ``task.url`` and ``task.fetched_at`` are read.
        :param response: fetched response handed to ``self.get_doc``.
        """
        doc = self.get_doc(response)

        headline = util.get_filtered_title(doc, {'title'})
        time_text = util.get_time_string_from_selectors(
            doc, {'div.video_date span'}, date_patterns={date_pattern})
        # The current local second is added on top of the parsed timestamp.
        parsed_stamp = util.get_timestamp_from_string(time_text)
        stamp = parsed_stamp + int(time.localtime().tm_sec)
        section = doc('span.heretxt').text()
        body = util.get_paragraphs_from_selector(doc, 'div.video_content')

        item.raw = doc.text()
        item.title = headline
        item.t = time_text
        item.t_stamp = stamp
        item.fetched_at = task.fetched_at
        item.category = section
        item.author = ''
        item.content = body
        item.url = task.url
        item.source = 'CableNews'
        item.task_no = self.BATCH_NUMBER

        # Comments are fetched only when util.within_active_interval allows
        # it (the meaning of its arguments is defined in util).
        if not util.within_active_interval(6, 1200):
            return
        fb_comments = util.get_filtered_facebook_comments_data(
            '482092838576644',
            doc('div.fb-comments').attr('data-href'), task.url)
        for entry in fb_comments or ():
            comment_media = self.NewsItem.MediaItem(
                media_url=entry['json_string'],
                type='comments',
                description='comments',
                created_at=item.fetched_at)
            item.media_list.append(comment_media)
示例#9
0
    def normal_item_solver(self, item, task, response):
        """Populate ``item`` from a fetched AppleNews article page.

        The response encoding is forced to UTF-8 before it is parsed with
        ``self.get_doc``.  Title, category and body text are copied onto
        ``item`` (author is left empty), and comment entries are appended to
        ``item.media_list``.

        The time string comes from ``util.get_day_string`` with
        ``self.OFFSET``; the current local hour and minute are added to the
        timestamp parsed from it.

        :param item: news item that is filled in place.
        :param task: crawl task; ``task.url`` and ``task.fetched_at`` are read.
        :param response: fetched response; its ``encoding`` is overwritten.
        """
        response.encoding = 'utf-8'
        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'title'}, u' \|.*')
        t = util.get_day_string(offset=self.OFFSET)
        day_stamp = util.get_timestamp_from_string(t)
        # Take a single clock snapshot: reading tm_hour and tm_min from two
        # separate localtime() calls can pair the old hour with the new
        # minute when the calls straddle an hour boundary.
        now = time.localtime()
        t_stamp = day_stamp + now.tm_hour * 3600 + now.tm_min * 60
        category = doc('meta[name=subsection]').attr('content')
        author = ''
        content = util.get_paragraphs_from_selector(doc, '#masterContent p')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'AppleNews'
        item.task_no = self.BATCH_NUMBER

        # Comments are fetched only when util.within_active_interval allows
        # it (the meaning of its arguments is defined in util).  The comment
        # thread is looked up via the page's og:url meta value.
        if util.within_active_interval(6, 20 * 60):
            _comments = util.get_filtered_facebook_comments_data(
                '367495573302576',
                doc('meta[property="og:url"]').attr('content'), task.url)
            for _comment in _comments or ():
                item.media_list.append(
                    self.NewsItem.MediaItem(
                        media_url=_comment['json_string'],
                        type='comments',
                        description='comments',
                        created_at=item.fetched_at))
示例#10
0
    def normal_item_solver(self, item, task, response):

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'title'}, ur'\s*\|.*')
        t = util.get_time_string_from_selectors(
            doc, {'div.PolDTextBox_Date', 'div.MainNews_Date'})
        t = t[-4:] + t[3:5] + t[:2]
        t_stamp = util.get_timestamp_from_string(
            t) + time.localtime().tm_hour * 3600 + time.localtime(
            ).tm_min * 60 + time.localtime().tm_sec
        category = ''
        if cat_pattern.findall(task.url):
            category = cat_pattern.findall(task.url)[0]
            category = urllib2.unquote(category)
        author = ''
        content = util.get_paragraphs_from_selector(doc, 'div.PolDTextBox div')
        if content == '':
            content = util.get_paragraphs_from_selector(
                doc, 'div.MainNews_Text div')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'SkyPost'
        item.task_no = self.BATCH_NUMBER
        for img in doc('.PolDetailBox img').items():
            if img.attr('src') != '':
                media_u = img.attr('src')
                des = ''
                if img.parent().siblings('.PolDCaption'):
                    des = img.parent().siblings('.PolDCaption').text()
                elif img.parents('.NewsDPicBox02'):
                    des = img.parents('.NewsDPicBox02').text()
                media = self.NewsItem.MediaItem(media_url=media_u,
                                                type='image',
                                                description=des,
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        for a in doc('iframe').items():
            if a.attr('src') and re.match(r'.*youtube\.com.+', a.attr('src')):
                media_u = a.attr('src')
                if re.match(r'//.+', media_u):
                    media_u = 'http:' + media_u
                media = self.NewsItem.MediaItem(media_url=media_u,
                                                type='youtube',
                                                description='youtube',
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        if util.within_active_interval(6, 1200):
            _comments = util.get_filtered_facebook_comments_data(
                '335749279848103',
                doc('div.fb-comments').attr('data-href'), task.url)
            if _comments:
                for _comment in _comments:
                    item.media_list.append(
                        self.NewsItem.MediaItem(
                            media_url=_comment['json_string'],
                            type='comments',
                            description='comments',
                            created_at=item.fetched_at))
示例#11
0
                doc, '#articleContent p')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'BastillePost'
        item.task_no = self.BATCH_NUMBER
        if util.within_active_interval(6, 1200):
            _comments = util.get_filtered_facebook_comments_data(
                '617988554913649',
                doc('#fbComments').attr('href'), task.url)
            if _comments:
                for _comment in _comments:
                    item.media_list.append(
                        self.NewsItem.MediaItem(
                            media_url=_comment['json_string'],
                            type='comments',
                            description='comments',
                            created_at=item.fetched_at))

    @classmethod
    def get_auto_configured_spider(cls, offset=0):
        bastille_seed = {'http://www.bastillepost.com/hongkong/'}
        r = requests.get('http://www.bastillepost.com/hongkong/')
        d = pq(r.text)