    def normal_item_solver(self, item, task, response):
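        # CommercialRadio (881903.com): publish time parsed as '%d.%m.%Y %H:%M' from one of
        # two possible selectors; content falls back through three selectors.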

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'title'}, u'881903.com 商業電台 - ')
        t = util.get_time_string_from_selectors(
            doc, {'#divnewsTextDate', '#part6808_ctl00_lblDetailDate'})
        t_stamp = util.get_timestamp_from_string(t, '%d.%m.%Y %H:%M') + int(
            time.localtime().tm_sec)
        category = doc('#part8425_ctl00_divtitle').text()
        author = ''
        content = util.get_paragraphs_from_selector(doc,
                                                    '#divnewsTextContent p')
        if content == '':
            content = util.get_paragraphs_from_selector(doc, '#tdContent p')
        if content == '':
            content = util.get_paragraphs_from_selector(
                doc, '.newsTextContent2')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'CommercialRadio'
        item.task_no = self.BATCH_NUMBER

# Example 2
    def normal_item_solver(self, item, task, response):
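        # Savantas: the '.metaStuff' line carries both the author (first token) and the
        # Chinese date string; category is the last breadcrumb entry.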

        doc = self.get_doc(response)

        title = doc('h1').text()
        meta_txt = doc('.metaStuff').text()
        t = re.findall(ur'[^\s]+月.+', meta_txt)[0]
        t_stamp = util.get_timestamp_from_string(
            t) + time.localtime().tm_hour * 3600 + time.localtime(
            ).tm_min * 60 + time.localtime().tm_sec
        category = doc('#crumbs a').text().split(' ')[-1]
        author = meta_txt.split(' ')[0]
        content = util.get_paragraphs_from_selector(doc, '.entry p')
        if content == '':
            content = util.get_paragraphs_from_selector(doc, '.entry div')
        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'Savantas'
        item.task_no = self.BATCH_NUMBER

# Example 3
    def normal_item_solver(self, item, task, response):
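        # PassionTimes (熱血時報): strip script/style before extracting paragraphs, clamp
        # same-day or future dates to the current time, then collect images, YouTube
        # iframes and Facebook comments as media items.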

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'title'}, ur'熱血時報 \| ')
        author = doc('span.author a').text()
        t = doc('time[class="published"]').text()
        t_stamp = int(time.mktime(time.strptime(str(t), "%m-%d-%Y")))
        category = doc('div.page-path a').text()
        doc.remove('script')
        doc.remove('style')
        content = util.get_paragraphs_from_selector(doc, 'div.article-body p')
        if content == '':
            content = util.get_paragraphs_from_selector(doc, 'div.article-body')
        if t_stamp >= int(time.mktime(time.strptime(
                str(time.localtime().tm_year) + ('%02d' % time.localtime().tm_mon) +
                ('%02d' % time.localtime().tm_mday), '%Y%m%d'))):
            t_stamp = int(time.time())

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'PassionTimes'
        item.task_no = self.BATCH_NUMBER
        for img in doc('.article-body img').items():
            if img.attr('src') != '':
                media_u = img.attr('src')
                if re.match(r'//.+', media_u):
                    media_u = 'http:' + media_u
                elif not re.match(r'http://.+', media_u):
                    media_u = 'http://www.passiontimes.hk' + media_u
                media = self.NewsItem.MediaItem(media_url=media_u, type='image', description='',
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        for a in doc('iframe').items():
            if a.attr('src') and re.match(r'.*youtube\.com.+', a.attr('src')):
                media_u = a.attr('src')
                if re.match(r'//.+', media_u):
                    media_u = 'http:' + media_u
                media = self.NewsItem.MediaItem(media_url=media_u, type='youtube', description='youtube',
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        if util.within_active_interval(6, 1200):
            _comments = util.get_filtered_facebook_comments_data('462543587117177',
                                                                 doc('div.fb-comments').attr('data-href'), task.url)
            if _comments:
                for _comment in _comments:
                    item.media_list.append(
                        self.NewsItem.MediaItem(media_url=_comment['json_string'], type='comments',
                                                description='comments', created_at=item.fetched_at)
                    )

# Example 4
    def normal_item_solver(self, item, task, response):
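        # HeadlineNews: time comes from <time> or 'span.date' when present; images,
        # YouTube iframes and Facebook comments are collected as media items.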

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'title'}, ur' - .*')
        t = ''
        t_stamp = 0
        if doc('time') or doc('span.date'):
            t = util.get_time_string_from_selectors(doc, {'time', 'span.date'})
            t_stamp = util.get_timestamp_from_string(t) + int(time.localtime().tm_sec)
        category = re.sub(ur'.*\s+', u'', doc('.dropdown-menu li.active a').text())
        author = ''
        content = util.get_paragraphs_from_selector(doc, '#news-content')
        if content == '':
            content = util.get_paragraphs_from_selector(doc, '.content span')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'HeadlineNews'
        item.task_no = self.BATCH_NUMBER
        for img in doc('.content .item img').items():
            if img.attr('src') != '':
                media_u = img.attr('src')
                if re.match(r'//.+', media_u):
                    media_u = 'http:' + media_u
                des = ''
                if img.attr('alt'):
                    des = img.attr('alt')
                media = self.NewsItem.MediaItem(media_url=media_u, type='image', description=des,
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        for a in doc('iframe').items():
            if a.attr('src') and re.match(r'.*youtube\.com.+', a.attr('src')):
                media_u = a.attr('src')
                if re.match(r'//.+', media_u):
                    media_u = 'http:' + media_u
                media = self.NewsItem.MediaItem(media_url=media_u, type='youtube', description='youtube',
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        if util.within_active_interval(12, 600):
            _comments = util.get_filtered_facebook_comments_data('978368502211772',
                                                                 doc('div.fb-comments').attr('data-href'), task.url)
            if _comments:
                for _comment in _comments:
                    item.media_list.append(
                        self.NewsItem.MediaItem(media_url=_comment['json_string'], type='comments',
                                                description='comments', created_at=item.fetched_at)
                    )

# Example 5
    def normal_item_solver(self, item, task, response):
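        # HKEJ (信報): instant-news pages and daily/headline articles use different
        # selectors and time handling; daily articles drop the trailing 節錄 (excerpt) block.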

        doc = self.get_doc(response)

        title = ''
        t = ''
        t_stamp = 0
        category = ''
        author = ''
        content = ''

        if instant_pattern.match(task.url):
            title = util.get_filtered_title(doc, {'title'}, ur' - 信報網站 hkej.com')
            t = util.get_time_string_from_selectors(doc, {'span.date'})
            time_part = min_sec_pattern.findall(t)[0]
            t_stamp = util.get_timestamp_from_string(time_part) + time.localtime().tm_sec
            category = doc('span.cate').text()
            content = util.get_paragraphs_from_selector(doc, '#article-content p')
        elif daily_pattern.match(task.url) or headline_article_pattern.match(task.url):
            title = util.get_filtered_title(doc, {'title'}, ur' - .+')
            t = util.get_time_string_from_selectors(doc, {'#date'})
            t_stamp = util.get_timestamp_from_string(t) + time.localtime().tm_hour*3600 + time.localtime().tm_min*60 + time.localtime().tm_sec
            category = doc('#hkej_navSubMenu_2014 .on').text()
            content = util.get_paragraphs_from_selector(doc, '#article-content p')
            if content == '':
                content = util.get_paragraphs_from_selector(doc, '#article-detail-wrapper')
                content = re.sub(ur'(節錄)(.|\n|\t|\r)*', u'', content,
                                 flags=re.M | re.I | re.U)

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'HKEJ'
        item.task_no = self.BATCH_NUMBER
        for img in doc('#article-detail-wrapper p img, #article-detail-wrapper .hkej_detail_thumb_2014 img').items():
            if img.parent('a').attr('href') != '':
                des = ''
                if img.parent('a') and img.parent('a').attr('title'):
                    des = img.parent('a').attr('title')
                media = self.NewsItem.MediaItem(media_url=img.parent('a').attr('href'), type='image', description=des,
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        for a in doc('iframe').items():
            if a.attr('src') and re.match(r'.*youtube\.com.*', a.attr('src')):
                media_u = a.attr('src')
                if re.match(r'//.+', media_u):
                    media_u = 'http:' + media_u
                media = self.NewsItem.MediaItem(media_url=media_u, type='youtube', description='youtube',
                                                created_at=item.fetched_at)
                item.media_list.append(media)

# Example 6
    def normal_item_solver(self, item, task, response):
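        # HK01: publish time, category and author come from meta tags; images are gathered
        # from both <img> and gallery <object> nodes, plus YouTube iframes and Facebook comments.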

        doc = self.get_doc(response)

        item.raw = doc.text()
        item.title = util.get_filtered_title(doc, {'.article_tit h1'}, ur'\s*\|.*')
        if item.title == '':
            item.title = util.get_filtered_title(doc, {'title'}, ur'\s*\|.*')
        item.t = doc('meta[name=artpdate]').attr('content')
        item.t_stamp = int(time.mktime(time.strptime(item.t, "%Y-%m-%d %H:%M:%S")))
        item.fetched_at = task.fetched_at
        item.category = doc('meta[name=catname]').attr('content')
        item.author = doc('meta[name=author]').attr('content')
        content = util.get_paragraphs_from_selector(doc, 'div.article_content__module p')
        if content == '':
            content = util.get_paragraphs_from_selector(doc, 'li.article_summary_pt')
        item.content = content
        item.url = task.url
        item.source = 'HK01'
        item.task_no = self.BATCH_NUMBER
        for img in doc('.article__body__content img').items():
            if img.attr('src') != '' and not img.parents('.related_article'):
                media_u = img.attr('src')
                if media_u != '//cdn.hk01.com/media/dummy/default_image.png':
                    des = ''
                    if img.attr('alt'):
                        des = img.attr('alt')
                    media = self.NewsItem.MediaItem(media_url=media_u, type='image', description=des,
                                                    created_at=item.fetched_at)
                    item.media_list.append(media)
        for img in doc('.article__body__content object[data-gallery-image="true"]').items():
            if img.attr('data'):
                media_u = img.attr('data')
                des = ''
                if img.attr('alt'):
                    des = img.attr('alt')
                media = self.NewsItem.MediaItem(media_url=media_u, type='image', description=des,
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        for a in doc('iframe').items():
            if a.attr('src') and re.match(r'.*youtube\.com.+', a.attr('src')):
                media_u = a.attr('src')
                if re.match(r'//.+', media_u):
                    media_u = 'http:' + media_u
                media = self.NewsItem.MediaItem(media_url=media_u, type='youtube', description='youtube',
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        if util.within_active_interval(6, 1200):
            _comments = util.get_filtered_facebook_comments_data('1651866545051541', doc('div.fb-comments').attr('data-href'), task.url)
            if _comments:
                for _comment in _comments:
                    item.media_list.append(
                        self.NewsItem.MediaItem(media_url=_comment['json_string'], type='comments',
                                                description='comments', created_at=item.fetched_at)
                    )

# Example 7
    def normal_item_solver(self, item, task, response):
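        # SunDaily: no timestamp is read from the page, so the day string and stamp come
        # from self.OFFSET; content falls back through several legacy layouts.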

        doc = self.get_doc(response)

        title = util.get_filtered_title(
            doc, {'h1', 'font.heading', 'font[size="+2"]'})
        t = util.get_day_string(offset=self.OFFSET)
        t_stamp = util.get_day_stamp(self.OFFSET)
        category = ''
        if cat_pattern.findall(task.url):
            cat_word = cat_pattern.findall(task.url)[0]
            category = doc('.' + cat_word).text()
        if category == '':
            category = re.sub(ur' .*', u'', doc('td font').text())
        author = ''
        content = util.get_paragraphs_from_selector(doc, '.newsText p')
        if content == '':
            content = util.get_paragraphs_from_selector(doc, '#contentAD1 p')
        if content == '':
            _doc = doc('#contentAD1')
            _doc.remove('table')
            _doc.remove('span')
            content = util.get_paragraphs_from_selector(_doc, 'div')
        if content == '':
            content = util.get_paragraphs_from_selector(doc, 'dd')
        if content == '':
            content = doc('.caption').next_all('p').text()
        if content == '':
            _doc = doc.parent('.caption').parent()
            _doc.remove('table').remove('span')
            content = _doc.text()
        if content == '':
            _doc = doc('.caption').parent()
            content = _doc.remove('table').text()
        if content == '':
            content = doc('.summaryPara').text()

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'SunDaily'
        item.task_no = self.BATCH_NUMBER

# Example 8
    def normal_item_solver(self, item, task, response):
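        # Unwire: time from the article:published_time meta tag, category from the URL;
        # trailing 來源/Tags lines are stripped from the content.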

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'h1.entry-title'})
        t = doc('meta[property="article:published_time"]').attr('content')
        t_stamp = 0
        if t:
            t_stamp = util.get_timestamp_from_string(t)
        category = ''
        cat_find_res = cat_pattern.findall(task.url)
        if cat_find_res:
            category = cat_find_res[0]
        category = u'科技/' + category
        author = ''
        content = util.get_paragraphs_from_selector(doc, '#content div.entry p:not(.meta)')
        content = re.sub(ur'(來源:|來源:|Tags:).+', u'', content)

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'Unwire'
        item.task_no = self.BATCH_NUMBER

# Example 9
    def normal_item_solver(self, item, task, response):
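        # VJMedia: recurring boilerplate lines (submission notice, comment count, a signature
        # quote) are stripped from the content; full/large images, YouTube iframes and
        # Facebook comments become media items.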

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'h1 a'})
        t = util.get_time_string_from_selectors(doc, {'span.postdate'})
        t_stamp = util.get_timestamp_from_string(t) + int(
            time.localtime().tm_sec)
        category = doc('span.postcat a').text()
        author = doc('span.postauthor a').text()
        content = util.get_paragraphs_from_selector(doc, 'div p')
        content = re.sub(ur'投稿:[.\n\r\t]*.*', u'', content,
                         flags=re.M | re.U | re.I)
        content = re.sub(ur'則留言[.\n\r\t]*', u'', content,
                         flags=re.M | re.U | re.I)
        content = re.sub(ur'大道之行也,天下為公,選賢與能,講信修睦。---《禮運.大同》[.\n\r\t]*', u'',
                         content, flags=re.M | re.U | re.I)

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'VJMedia'
        item.task_no = self.BATCH_NUMBER
        for img in doc(
                '#container img.size-full, #container img.size-large').items():
            if img.attr('src') != '':
                des = ''
                if img.attr('alt'):
                    des = img.attr('alt')
                elif img.siblings('p'):
                    des = img.siblings('p').text()
                media = self.NewsItem.MediaItem(media_url=img.attr('src'),
                                                type='image',
                                                description=des,
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        for iframe in doc('iframe').items():
            if iframe.attr('src') and re.match(r'.*youtube\.com.+',
                                               iframe.attr('src')):
                media = self.NewsItem.MediaItem(media_url=iframe.attr('src'),
                                                type='youtube',
                                                description='youtube',
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        if util.within_active_interval(6, 1200):
            _comments = util.get_filtered_facebook_comments_data(
                '214585295294555',
                doc('div.fb-comments').attr('data-href'), task.url)
            if _comments:
                for _comment in _comments:
                    item.media_list.append(
                        self.NewsItem.MediaItem(
                            media_url=_comment['json_string'],
                            type='comments',
                            description='comments',
                            created_at=item.fetched_at))

# Example 10
    def normal_item_solver(self, item, task, response):
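        # HKGolden: the post time sits in the grey (rgb(128, 128, 128)) sibling <div> of the <h1>.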

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'h1'})
        t_divs = doc('h1').siblings('div').items()
        t = ''
        t_stamp = 0
        for _div in t_divs:
            if _div.css('color') == 'rgb(128, 128, 128)':
                t = _div.text()
                t_stamp = util.get_timestamp_from_string(t) + int(
                    time.localtime().tm_sec)
                break
        category = u'电子'
        author = ''
        content = util.get_paragraphs_from_selector(doc, '.href_txt_blog2')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'HKGolden'
        item.task_no = self.BATCH_NUMBER

# Example 11
    def normal_item_solver(self, item, task, response):
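        # Y28: the page is Big5-encoded; the publish date is looked up from
        # self.url_time_dict, which is populated elsewhere in this crawler.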

        response.encoding = 'big5'
        doc = self.get_doc(response)

        title = doc('p.bigheading').text().split(' ')[-1]
        with self.url_time_dict_lock:
            t = self.url_time_dict[task.url]
            t_stamp = util.get_timestamp_from_string(t) + time.localtime(
            ).tm_hour * 3600 + time.localtime().tm_min * 60
        category = ''
        author = ''
        content = util.get_paragraphs_from_selector(doc, 'p:not(.bigheading)')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'Y28'
        item.task_no = self.BATCH_NUMBER

# Example 12
    def normal_item_solver(self, item, task, response):
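        # Bauhinia: the date string is matched inside 'h5 small' and the current
        # wall-clock time is added to the day stamp.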

        response.encoding = 'utf-8'
        doc = self.get_doc(response)

        title = doc('#page-h1').text()
        t = util.get_time_string_from_selectors(doc, {'h5 small'},
                                                {date_pattern})
        t_stamp = util.get_timestamp_from_string(
            t) + time.localtime().tm_hour * 3600 + time.localtime(
            ).tm_min * 60 + time.localtime().tm_sec
        category = doc('h5 small a').text()
        author = ''
        content = util.get_paragraphs_from_selector(doc, '.content-show p')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'Bauhinia'
        item.task_no = self.BATCH_NUMBER

# Example 13
    def normal_item_solver(self, item, task, response):
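        # Sina HK (香港新浪): the Chinese datetime ('%Y年%m月%d日 %H:%M') is parsed directly and
        # the category is scraped from an inline script; non-ad images, YouTube iframes
        # and Facebook comments become media items.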

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'title'}, ur' - 香港新浪')
        t = doc('div.news-datetime').text()
        t_stamp = int(
            time.mktime(time.strptime(t.encode('utf-8'), '%Y年%m月%d日 %H:%M')))
        scripts = doc('script').text()
        category = ''
        if re.findall(cat_pattern, scripts):
            category = re.findall(cat_pattern, scripts)[0]
        author = ''
        content = util.get_paragraphs_from_selector(doc, 'div.news-body p')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'Sina'
        item.task_no = self.BATCH_NUMBER
        for img in doc('.news-body img').items():
            if img.attr('src') != '':
                media_u = img.attr('src')
                if not ad_url_pattern.findall(media_u):
                    des = ''
                    if img.parent().attr('data-caption'):
                        des = img.parent().attr('data-caption')
                    media = self.NewsItem.MediaItem(media_url=media_u,
                                                    type='image',
                                                    description=des,
                                                    created_at=item.fetched_at)
                    item.media_list.append(media)
        for a in doc('iframe').items():
            if a.attr('src') and re.match(r'.*youtube\.com.+', a.attr('src')):
                media_u = a.attr('src')
                if re.match(r'//.+', media_u):
                    media_u = 'http:' + media_u
                media = self.NewsItem.MediaItem(media_url=media_u,
                                                type='youtube',
                                                description='youtube',
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        if util.within_active_interval(6, 1200):
            _comments = util.get_filtered_facebook_comments_data(
                '114907575364430',
                doc('div.fb-comments').attr('data-href'), task.url)
            if _comments:
                for _comment in _comments:
                    item.media_list.append(
                        self.NewsItem.MediaItem(
                            media_url=_comment['json_string'],
                            type='comments',
                            description='comments',
                            created_at=item.fetched_at))

# Example 14
    def normal_item_solver(self, item, task, response):
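        # AM730: the date line may be relative or absolute; the two cases are matched by
        # separate patterns.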

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'h2'})
        t = util.get_time_string_from_selectors(doc, {'div.dateforarticle'})
        t_stamp = 0
        if relative_time_pattern.match(t):
            t_stamp = self._get_timestamp_from_relative_time_str(t)
        elif absolute_time_pattern.match(t):
            t_stamp = util.get_timestamp_from_string(t) + int(
                time.localtime().tm_sec)
        category = '新聞'
        author = ''
        content = util.get_paragraphs_from_selector(doc, '#mymain')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'AM730'
        item.task_no = self.BATCH_NUMBER

# Example 15
    def normal_item_solver(self, item, task, response):
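        # HKEDB: publish date and category are looked up from self.time_cat_dict, which is
        # populated elsewhere in this crawler.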

        doc = self.get_doc(response)

        title = doc('h1').text()
        t = ''
        category = ''
        with self.time_cat_dict_lock:
            if task.url in self.time_cat_dict:
                t = self.time_cat_dict[task.url][0]
                category = self.time_cat_dict[task.url][1]
        t_stamp = util.get_timestamp_from_string(
            t) + time.localtime().tm_hour * 3600 + time.localtime(
            ).tm_min * 60 + time.localtime().tm_sec
        author = ''
        content = util.get_paragraphs_from_selector(doc, '#content p')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'HKEDB'
        item.task_no = self.BATCH_NUMBER

# Example 16
    def normal_item_solver(self, item, task, response):
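        # NewCenturyForum: the publish date is looked up from self.url_time_dict and the
        # current wall-clock time is added to the day stamp.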

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'.article-title'})
        t = ''
        with self.url_time_dict_lock:
            t = self.url_time_dict[task.url]
        print t
        t_stamp = util.get_timestamp_from_string(t) + time.localtime().tm_hour*3600 + time.localtime().tm_min*60 + time.localtime().tm_sec
        category = '新聞發佈'
        author = ''
        content = util.get_paragraphs_from_selector(doc, '.article-content')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'NewCenturyForum'
        item.task_no = self.BATCH_NUMBER

# Example 17
    def normal_item_solver(self, item, task, response):
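        # JD Online: the date is extracted from '.date' via date_pattern and the category
        # from the '.now-here' breadcrumb.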

        response.encoding = 'utf-8'
        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'.article-header h1'})
        t = doc('.date').text()
        if date_pattern.findall(t):
            t = date_pattern.findall(t)[0]
        t_stamp = util.get_timestamp_from_string(t)
        category = doc('.now-here').text()
        author = ''
        content = util.get_paragraphs_from_selector(doc, '.article p')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'JD Online'
        item.task_no = self.BATCH_NUMBER

# Example 18
    def normal_item_solver(self, item, task, response):
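        # ChinaDaily (Asia): READMORE lines are stripped from the content and attachment
        # image paths are rewritten to absolute chinadailyasia.com URLs.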

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'h1.conttit'})
        t = util.get_time_string_from_selectors(doc, {'div.pubtime'})
        t_stamp = util.get_timestamp_from_string(t)
        category = 'hk'
        author = ''
        content = util.get_paragraphs_from_selector(doc, 'div.contentbox p')
        content = re.sub(r'READMORE\: .+\n', '', content)

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'ChinaDaily'
        item.task_no = self.BATCH_NUMBER
        for img in doc('div.contentbox img').items():
            if img.attr('src') != '':
                media_u = 'http://www.chinadailyasia.com/' + re.sub(r'.+(?=attachement)', '', img.attr('src'))
                des = ''
                if img.attr('alt'):
                    des = img.attr('alt')
                elif img.siblings('p'):
                    des = img.siblings('p').text()
                media = self.NewsItem.MediaItem(media_url=media_u, type='image', description=des,
                                                created_at=item.fetched_at)
                item.media_list.append(media)

# Example 19
    def normal_item_solver(self, item, task, response):
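        # MetroHK (都市日報): weekday brackets are removed from the date line and 上午/下午
        # mapped to AM/PM before parsing.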

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'title'}, ur' – 都市日報')
        author = ''
        category = doc('.mobile-page-name span').text()
        # tags = doc('meta[name=keywords]').attr('content')
        content = util.get_paragraphs_from_selector(
            doc, '.main-content .content p')
        t = doc('.main-content .date p').text()
        t = re.sub(ur'\(.+?\)', '', t)
        t = re.sub(ur'上午', 'AM', t)
        t = re.sub(ur'下午', 'PM', t)
        t_stamp = util.get_timestamp_from_string(t)

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'MetroHK'
        item.task_no = self.BATCH_NUMBER

# Example 20
    def normal_item_solver(self, item, task, response):
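        # Macao daily: year, month and day are pulled out of the table text; same-day or
        # future stamps are replaced with the current time.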

        doc = self.get_doc(response)

        title = doc('table[id=table15] strong').text()
        t = doc('table[id=table23]').text()
        year = re.findall(t_pattern, t)[0]
        mon = int(re.findall(ur'\d{1,2}(?= *月)', t)[0])
        day = int(re.findall(ur'\d{1,2}(?= *日)', t)[0])
        t_stamp = int(
            time.mktime(
                time.strptime(year + ('%02d' % mon) + ('%02d' % day),
                              '%Y%m%d')))
        current_date = str(
            time.localtime().tm_year) + ('%02d' % time.localtime().tm_mon) + (
                '%02d' % time.localtime().tm_mday)
        if t_stamp >= int(time.mktime(time.strptime(current_date, "%Y%m%d"))):
            t_stamp = int(time.time())
        category = doc('table[id=table22] strong').text()
        author = ''
        content = util.get_paragraphs_from_selector(doc, 'founder-content p')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'Macao'
        item.task_no = self.BATCH_NUMBER

# Example 21
    def normal_item_solver(self, item, task, response):
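        # MetroFinance: time in '%d/%m/%Y %H:%M', category taken from the page title, and
        # the numeric news id parsed out of the URL.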

        doc = self.get_doc(response)

        title = doc('h4').text()
        t = doc('#ContentPlaceHolder1_IndividualNewsList_lblTime_0').text()
        t_stamp = util.get_timestamp_from_string(t,
                                                 time_format='%d/%m/%Y %H:%M')
        category = ''
        cat = doc('title').text()
        if cat_pattern.findall(cat):
            category = cat_pattern.findall(cat)[0]
        author = ''
        content = util.get_paragraphs_from_selector(
            doc, '#ContentPlaceHolder1_IndividualNewsList_lblContent_0')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'MetroFinance'
        item.task_no = self.BATCH_NUMBER

        item.id = ''
        if news_id_pattern.findall(task.url):
            item.id = (news_id_pattern.findall(task.url)[0])[2:]

# Example 22
    def normal_item_solver(self, item, task, response):
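        # HKSilicon: the Unix timestamp is recovered from the og:image:url meta tag when
        # possible, otherwise derived from a relative time string.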

        response.encoding = 'utf-8'
        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'h1'})
        t_stamp = 0
        t_stamp_url = doc('meta[property="og:image:url"]').attr('content')
        if t_stamp_url:
            f_res = url_stamp_pattern.findall(t_stamp_url)
            if f_res:
                t_stamp = int(f_res[0])
        t = ''
        if t_stamp:
            t = time.ctime(t_stamp)
        else:
            t = doc('ul.blog-info i.fa-calendar').parent('li').text()
            t_stamp = self.get_stamp_from_relative_timestr(t)
        category = doc('ul.blog-info i.fa-tags').siblings('a').text()
        author = ''
        content = util.get_paragraphs_from_selector(doc, 'div.blog-content')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'HKSilicon'
        item.task_no = self.BATCH_NUMBER

# Example 23
    def normal_item_solver(self, item, task, response):
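        # UBeat: time from the article:published_time meta tag; the trailing 繼續閱讀
        # ("read more") block is stripped from the content.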

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'h1'})
        t = doc('meta[property="article:published_time"]').attr('content')
        t_stamp = 0
        if t:
            t_stamp = util.get_timestamp_from_string(t)
        category = ''
        author = ''
        content = util.get_paragraphs_from_selector(doc, '#main-content .entry-content p')
        content = re.sub(ur'繼續閱讀[\n\s\S.]*', '', content)

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'UBeat'
        item.task_no = self.BATCH_NUMBER
        for img in doc('figure.entry-thumbnail img').items():
            if img.attr('src') != '':
                media_u = img.attr('src')
                des = ''
                if img.attr('alt'):
                    des = img.attr('alt')
                media = self.NewsItem.MediaItem(media_url=media_u, type='image', description=des,
                                                created_at=item.fetched_at)
                item.media_list.append(media)

# Example 24
    def normal_item_solver(self, item, task, response):
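        # FMCOPRC: the date is matched inside #News_Body_Time and the current wall-clock
        # time is added to the day stamp.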

        doc = self.get_doc(response)

        title = doc('#News_Body_Title').text()
        t = ''
        if doc('#News_Body_Time'):
            t = date_pattern.findall(str(doc('#News_Body_Time')))[0]
        t_stamp = util.get_timestamp_from_string(
            t) + time.localtime().tm_hour * 3600 + time.localtime(
            ).tm_min * 60 + time.localtime().tm_sec
        category = doc('.Top_Index_A a:last-child').text()
        author = ''
        content = util.get_paragraphs_from_selector(doc, '#News_Body_Txt_A p')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'FMCOPRC'
        item.task_no = self.BATCH_NUMBER

# Example 25
    def normal_item_solver(self, item, task, response):
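        # StandNews: the category is resolved from the main-menu link matching the URL;
        # non-ad images, YouTube iframes and Facebook comments become media items.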

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'title'}, ur'\s*\|.*')
        t = doc('div.article-content-wrap p.date').text()
        t_stamp = int(
            time.mktime(time.strptime(t.encode('utf-8'), '%Y/%m/%d — %H:%M')))
        cat_href = cat_pattern.findall(task.url)[0]
        category = doc('ul[id=mainMenuUL] a[href="' + cat_href + '"]').text()
        author = ''
        content = util.get_paragraphs_from_selector(doc.remove('style'),
                                                    'div.article-content p')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'StandNews'
        item.task_no = self.BATCH_NUMBER
        for img in doc('.article-content-wrap img').items():
            if img.attr('src') != '' and not ad_image_pattern.match(
                    img.attr('src')):
                des = ''
                if img.attr('alt'):
                    des = img.attr('alt')
                media = self.NewsItem.MediaItem(media_url=img.attr('src'),
                                                type='image',
                                                description=des,
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        for iframe in doc('.article-content-wrap iframe').items():
            if iframe.attr('src') and youtube_pattern.match(
                    iframe.attr('src')):
                media = self.NewsItem.MediaItem(media_url=iframe.attr('src'),
                                                type='youtube',
                                                description='youtube',
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        if util.within_active_interval(6, 1200):
            _comments = util.get_filtered_facebook_comments_data(
                '1534089350179685',
                doc('div.fb-comments').attr('data-href'), task.url)
            if _comments:
                for _comment in _comments:
                    item.media_list.append(
                        self.NewsItem.MediaItem(
                            media_url=_comment['json_string'],
                            type='comments',
                            description='comments',
                            created_at=item.fetched_at))

# Example 26
    def normal_item_solver(self, item, task, response):
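        # BBC Chinese: the publish time comes from whichever of three layouts is present
        # (data-datetime attribute, timeline status, or a date <strong>).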

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'title'},
                                        u' - BBC 中文网| BBC Zhongwen')
        t = ''
        t_stamp = 0
        if doc('.story-body .mini-info-list .date').attr('data-datetime'):
            t = doc('.story-body .mini-info-list .date').attr('data-datetime')
            t_stamp = util.get_timestamp_from_string(
                t) + time.localtime().tm_sec
        elif doc('.timeline-status h3') and date_pattern.findall(
                doc('.timeline-status h3').text()):
            t = date_pattern.findall(doc('.timeline-status h3').text())[0]
            t_stamp = util.get_timestamp_from_string(
                t) + time.localtime().tm_sec
        elif doc('.story-body .date strong') and date_pattern.findall(
                doc('.story-body .date strong').text()):
            t = date_pattern.findall(doc('.story-body .date strong').text())[0]
            t_stamp = util.get_timestamp_from_string(
                t) + time.localtime().tm_sec
        category = doc('meta[property="article:section"]').attr('content')
        author = doc('span.byline__name').text()
        content = util.get_paragraphs_from_selector(
            doc, 'div[property="articleBody"] p')
        if content == '':
            content = util.get_paragraphs_from_selector(
                doc, '.article-wrapper p')
        if content == '':
            content = util.get_paragraphs_from_selector(doc, '.map-body p')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'BBC Chinese'
        item.task_no = self.BATCH_NUMBER

# Example 27
    def normal_item_solver(self, item, task, response):
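        # The Standard: category and time share the '.heading .pull-left' block, split on ' | '.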

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'h1'}, r' The Standard$')
        pl = doc('.heading .pull-left')
        pl.remove('span')
        t = pl.text().split(' | ')[1]
        if t:
            t_stamp = util.get_timestamp_from_string(t)
        else:
            t_stamp = 0
        category = pl.text().split(' | ')[0]
        author = ''
        content = util.get_paragraphs_from_selector(doc, 'div.content p')
        if content == '':
            content = util.get_paragraphs_from_selector(doc, 'div.content')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'Standard'
        item.task_no = self.BATCH_NUMBER
        for img in doc('div.content img').items():
            if img.attr('src') != '':
                media_u = img.attr('src')
                des = ''
                if img.attr('alt'):
                    des = img.attr('alt')
                elif img.siblings('p'):
                    des = img.siblings('p').text()
                media = self.NewsItem.MediaItem(media_url=media_u,
                                                type='image',
                                                description=des,
                                                created_at=item.fetched_at)
                item.media_list.append(media)

# Example 28
    def normal_item_solver(self, item, task, response):
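        # WenWei (香港文匯報): same-day or future day stamps are replaced with the current
        # time; category comes from the 'span.current' breadcrumb.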

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'title'},
                                        ur'\s*-\s*(香港文匯報|香港文匯網)')
        t = doc('span.date').text()
        t_stamp = util.get_timestamp_from_string(t)
        current_date = str(
            time.localtime().tm_year) + ('%02d' % time.localtime().tm_mon) + (
                '%02d' % time.localtime().tm_mday)
        if t_stamp >= int(time.mktime(time.strptime(current_date, "%Y%m%d"))):
            t_stamp = util.get_now()
        category = ''
        if re.findall(cat_pattern, doc('span.current').text().encode('utf-8')):
            category = re.findall(
                cat_pattern,
                doc('span.current').text().encode('utf-8'))[-1]
        author = ''
        content = util.get_paragraphs_from_selector(doc,
                                                    'div[id=main-content] p')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'WenWei'
        item.task_no = self.BATCH_NUMBER
        for img in doc('.imgtxt img').items():
            if img.attr('src') != '':
                media_u = img.attr('src')
                des = ''
                if img.attr('alt'):
                    des = img.attr('alt')
                media = self.NewsItem.MediaItem(media_url=media_u,
                                                type='image',
                                                description=des,
                                                created_at=item.fetched_at)
                item.media_list.append(media)
        for a in doc('iframe').items():
            if a.attr('src') and re.match(r'.*youtube\.com.+', a.attr('src')):
                media_u = a.attr('src')
                if re.match(r'//.+', media_u):
                    media_u = 'http:' + media_u
                media = self.NewsItem.MediaItem(media_url=media_u,
                                                type='youtube',
                                                description='youtube',
                                                created_at=item.fetched_at)
                item.media_list.append(media)

# Example 29
    def normal_item_solver(self, item, task, response):
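        # Xinhua HK: relative image src values are resolved against the directory of the
        # article URL.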

        response.encoding = 'utf-8'
        doc = self.get_doc(response)

        title = doc('h1').text()
        t = doc('span.time').text()
        if t == '':
            t = doc('#pubtime').text()
        t_stamp = util.get_timestamp_from_string(t)
        category = '港澳'
        author = ''
        content = util.get_paragraphs_from_selector(doc, '.article p')
        if content == '':
            content = util.get_paragraphs_from_selector(doc, '#content p')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'Xinhua'
        item.task_no = self.BATCH_NUMBER
        for img in doc('.article img').items():
            if img.attr('src') != '':
                media_u = re.sub(r'/[^/]+\.htm', '/',
                                 task.url) + img.attr('src')
                des = ''
                if img.attr('alt'):
                    des = img.attr('alt')
                elif img.siblings('p'):
                    des = img.siblings('p').text()
                media = self.NewsItem.MediaItem(media_url=media_u,
                                                type='image',
                                                description=des,
                                                created_at=item.fetched_at)
                item.media_list.append(media)

# Example 30
    def normal_item_solver(self, item, task, response):
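        # NowNews: the timestamp is parsed from the <time datetime> attribute; Facebook
        # comments are attached as media items when within the active interval.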

        doc = self.get_doc(response)

        title = util.get_filtered_title(doc, {'h1'})
        t = doc('time.published').text()
        t_stamp = util.get_timestamp_from_string(
            doc('time.published').attr('datetime'), '%Y-%m-%d %H:%M:%S+0800')
        category = doc('#navBar li.active').text()
        if category == '' and cat_pattern_in_url.findall(task.url):
            category = cat_pattern_in_url.findall(task.url)[0]
        author = ''
        content = util.get_paragraphs_from_selector(doc, 'div.newsLeading p')
        if content == '':
            content = util.get_paragraphs_from_selector(doc, 'div.newsLeading')

        item.raw = doc.text()
        item.title = title
        item.t = t
        item.t_stamp = t_stamp
        item.fetched_at = task.fetched_at
        item.category = category
        item.author = author
        item.content = content
        item.url = task.url
        item.source = 'NowNews'
        item.task_no = self.BATCH_NUMBER
        if util.within_active_interval(6, 1200):
            _comments = util.get_filtered_facebook_comments_data(
                '515076798590105',
                doc('div.fb-comments').attr('data-href'), task.url)
            if _comments:
                for _comment in _comments:
                    item.media_list.append(
                        self.NewsItem.MediaItem(
                            media_url=_comment['json_string'],
                            type='comments',
                            description='comments',
                            created_at=item.fetched_at))