예제 #1
0
    def parse_article(self, response):
        """Populate a NewsCrawlerItem from a US article page.

        Pulls language from the <html> tag, headline/publish time from
        OpenGraph meta tags, and the body from the article-text span.
        """
        loader = ItemLoader(item=NewsCrawlerItem(), response=response)
        loader.add_value('country', 'usa')
        loader.add_xpath('language', '//html/@lang')
        loader.nested_css('meta[property="og:title"]').add_xpath(
            'headline', './@content')
        loader.add_value('url', response.url)

        # Normalise the published timestamp to ISO-8601 ('' when unparseable).
        def _to_iso(joined):
            parsed = ciso8601.parse_datetime(joined)
            return '' if parsed is None else parsed.isoformat(sep='T')

        published_meta = loader.nested_css(
            'meta[property="og:article:published_time"]')
        published_meta.add_xpath(
            'published_time', './@content', Compose(Join(), _to_iso))

        # Keywords arrive as one comma-separated meta value.
        loader.add_xpath(
            'keywords',
            '//head/meta[@property="keywords"]/@content',
            lambda vals: vals[0].split(',') if vals else None,
        )
        loader.add_value('encoding', response.encoding)
        loader.nested_css('span[id="article-text"]').add_xpath(
            'body', './p//text()')
        return loader.load_item()
예제 #2
0
 def parse_article(self, response):
     """Parse an Autohome (汽车之家) article page into a NewsCrawlerItem.

     Fills channel/source metadata, a globally-unique numeric id derived
     from the URL, title, publish time and up to three preview pictures.
     """
     item = NewsCrawlerItem()
     item['channel'] = 4
     item['source'] = u'汽车之家'
     item['source_url'] = response.url
     # Extract the numeric article id from the URL. The dot before "html"
     # is escaped so it matches a literal '.' only — the previous pattern's
     # bare '.' matched any character.
     huxiu_article_id = self.unbind_variable(r'/(?P<article_id>\d+)\.html',
                                             'article_id', item['source_url'])
     # Global id = media-source prefix * 10**7 + the per-site article id.
     item['newsId'] = self.id_prex * 10000000 + int(huxiu_article_id)
     item['id'] = item['newsId']
     title_list = response.xpath('//div[@class="article-title"]/h1/text()').extract()
     # Fall back to the <title> tag when the headline node is missing.
     item['title'] = title_list[0] if title_list else response.xpath('/html/head/title/text()').extract()[0]
     time_str_list = response.xpath('//div[@class="article-title"]/p/text()').extract()
     time_str = time_str_list[0].strip() if time_str_list else ''
     # Autohome's time line looks like: "类型:原创  2015-07-05  汽车之家  编辑:"
     item['publishTime'] = self.get_time(time_str)
     big_pic_list = response.xpath(
         '//div[@class="article-content"]/p/img[contains(@data-src, ".autoimg.cn/")]/@data-src').extract()
     # Keep up to three pictures as individual preview fields.
     if big_pic_list:
         item['picOne'] = big_pic_list[0]
     if len(big_pic_list) > 1:
         item['picTwo'] = big_pic_list[1]
     if len(big_pic_list) > 2:
         item['picThr'] = big_pic_list[2]
     pic_list = big_pic_list
     item['picListString'] = ','.join(pic_list)
     item['picList'] = pic_list
     item['isLarge'] = True
     return item
예제 #3
0
 def parse_article(self, response):
     """Populate a NewsCrawlerItem from an Al Jazeera article page."""
     article = ItemLoader(item=NewsCrawlerItem(), response=response)
     article.add_value('country', 'middle east')
     article.add_value('language', 'english')
     article.add_value('stopwords', ['al', 'jazeera'])
     article.nested_css('div.article-body').add_xpath('body', './p//text()')
     article.nested_css('meta[property="og:title"]').add_xpath(
         'headline', './@content')

     # Convert the published time to ISO-8601. datetime.strptime never
     # returns None — it raises ValueError on non-matching input — so the
     # original "is None" guard could never fire; catch the exception
     # instead and fall back to '' like the sibling parsers do.
     def _to_iso(value):
         try:
             return datetime.strptime(
                 value, '%a, %d %B %Y %H:%M:%S %Z').isoformat(sep='T')
         except ValueError:
             return ''

     published_time_in = Compose(Join(), _to_iso)
     article.nested_css('meta[name="LastModifiedDate"]').add_xpath(
         'published_time',
         './@content',
         published_time_in,
     )
     article.nested_css('span.article-topics').add_xpath(
         'category', './/text()')
     article.nested_css('meta[property="ContentType"]').add_xpath(
         'category', './@content')
     article.add_value('url', response.url)
     article.add_value('encoding', response.encoding)
     return article.load_item()
예제 #4
0
 def parse_article(self, response):
     """Parse a Hupu (虎扑) article page into a NewsCrawlerItem.

     Fills channel/source metadata, a globally-unique numeric id derived
     from the URL, title, publish time, tags and preview pictures.
     """
     item = NewsCrawlerItem()
     item['channel'] = 2
     item['source'] = u'虎扑'
     item['source_url'] = response.url
     # Extract the numeric article id from the URL. The dot before "html"
     # is escaped so it matches a literal '.' only — the previous pattern's
     # bare '.' matched any character.
     huxiu_article_id = self.unbind_variable(r'/(?P<article_id>\d+)\.html',
                                             'article_id', item['source_url'])
     # Global id = media-source prefix * 10**7 + the per-site article id.
     item['newsId'] = self.id_prex * 10000000 + int(huxiu_article_id)
     item['id'] = item['newsId']
     title_list = response.xpath('//header[@class="artical-title"]//h1[@class="headline"]/text()').extract()
     # Fall back to the <title> tag when the headline node is missing.
     item['title'] = title_list[0] if title_list else response.xpath('/html/head/title/text()').extract()[0]
     time_str_list = response.xpath(
         '//header[@class="artical-title"]//span[@class="times"]/text()').extract()
     time_str = time_str_list[0].strip() if time_str_list else ''
     # Hupu time formats: "48分钟前", "5小时前", "07-03 19:24" — handled by helper.
     item['publishTime'] = self.get_hupu_time(time_str)
     item['tags'] = response.xpath(
         '//section[@class="detail-content"]//div[contains(@class, "hot-tags")]//div[@class="swiper-slide"]/a/text()').extract()
     big_pic_list = response.xpath(
         '//section[@class="detail-content"]/article[@class="article-content"]/img/@src').extract()
     # Keep up to three pictures as individual preview fields.
     if big_pic_list:
         item['picOne'] = big_pic_list[0]
     if len(big_pic_list) > 1:
         item['picTwo'] = big_pic_list[1]
     if len(big_pic_list) > 2:
         item['picThr'] = big_pic_list[2]
     pic_list = big_pic_list
     item['picListString'] = ','.join(pic_list)
     item['picList'] = pic_list
     item['isLarge'] = True
     return item
예제 #5
0
 def parse_article(self, response):
     """Build a NewsCrawlerItem from a CNN-style US article page."""
     loader = ItemLoader(item=NewsCrawlerItem(), response=response)
     loader.add_value('country', 'usa')
     loader.add_value('language', 'english')
     loader.nested_css('meta[property="og:title"]').add_xpath(
         'headline', './@content')
     loader.add_value('url', response.url)

     # Publish time rendered as ISO-8601; '' when it cannot be parsed.
     def _to_iso(joined):
         parsed = ciso8601.parse_datetime(joined)
         return '' if parsed is None else parsed.isoformat(sep='T')

     loader.nested_css('meta[name="pubdate"]').add_xpath(
         'published_time', './@content', Compose(Join(), _to_iso))
     loader.add_xpath('category', '//head/meta[@name="section"]/@content')
     # Keywords meta carries a trailing " - ..." suffix; keep the prefix only.
     loader.add_xpath('keywords',
                      '//head/meta[@itemprop="keywords"]/@content',
                      re=r'(.*) -')
     loader.add_value('encoding', response.encoding)
     body_scope = loader.nested_css('div.pg-rail-tall__body').nested_css(
         'div.l-container')
     body_scope.add_xpath('body', './/div[re:test(@class, "zn-.*")]/text()')
     return loader.load_item()
예제 #6
0
    def parse_item(self, response):
        """Capture the raw crawled page: its URL plus the full response body."""
        # TODO(pipeline): filter out items whose body is too short, and
        # media-only (video) sources, in an item pipeline stage.
        item = NewsCrawlerItem()
        item["content"] = response.body
        item["url"] = response.url
        yield item
예제 #7
0
    def parse(self, response):
        """Yield one loaded item per news row on the listing page."""
        rows = response.xpath(
            r"//div[@class='col-md-6 further-news-container latest-news-padding']")
        for row in rows:
            # `loader` instead of the ambiguous single-letter `l` (E741).
            loader = ItemLoader(item=NewsCrawlerItem(), selector=row)
            loader.add_xpath(
                "DateTime",
                r"div/div/span[@class='warmGrey source-and-publishdate']/text()[2]")
            loader.add_xpath("News", r"div/div/a[@class='news-link']/text()")
            yield loader.load_item()
예제 #8
0
 def parse_article(self, response):
     """Parse a Huxiu (虎嗅) article page into a NewsCrawlerItem."""
     item = NewsCrawlerItem()
     item['channel'] = 1
     item['source'] = u'虎嗅'
     # Canonicalise the article URL so it always ends with '/1.html'.
     url = response.url
     item['source_url'] = url if url.endswith('/1.html') else url + '/1.html'
     # The numeric article id is embedded in the URL path.
     huxiu_article_id = self.unbind_variable(
         '/article/(?P<article_id>\\d+)', 'article_id', item['source_url'])
     # Global id = media-source prefix * 10**6 + the Huxiu article id.
     item['newsId'] = self.id_prex * 1000000 + int(huxiu_article_id)
     item['id'] = item['newsId']
     title_list = response.xpath(
         '/html/body//div[@id="page_article"]//h1/text()').extract()
     # Fall back to the <title> tag when the headline node is missing.
     item['title'] = title_list[0] if title_list else response.xpath(
         '/html/head/title/text()').extract()[0]
     time_str_list = response.xpath(
         '/html/body//div[@id="page_article"]//time[@class="pull-left time"]/text()'
     ).extract()
     time_str = time_str_list[0].strip() if time_str_list else ''
     # Huxiu timestamps look like "2015-06-18 07:55"; use "now" when absent.
     if time_str:
         published = datetime.strptime(time_str, '%Y-%m-%d %H:%M')
     else:
         published = datetime.now()
     item['publishTime'] = int(time.mktime(published.timetuple()))
     item['tags'] = response.xpath(
         '/html/body//div[@id="page_article"]//ul[@class="pull-left list-inline tag-box"]/li/a/text()'
     ).extract()
     big_pic_list = response.xpath(
         '/html/body//div[@id="page_article"]//div[@class="big-pic"]/img/@src'
     ).extract()
     # Keep up to three headline pictures as individual preview fields.
     for key, pic in zip(('picOne', 'picTwo', 'picThr'), big_pic_list):
         item[key] = pic
     # The full picture list counts only content-hosted images.
     pic_list = response.xpath(
         '/html/body//div[@id="page_article"]//img[contains(@src,"huxiu.com/article/content/")]/@src'
     ).extract()
     item['picListString'] = ','.join(pic_list)
     item['picList'] = pic_list
     item['isLarge'] = True
     return item
예제 #9
0
 def parse_article(self, response):
     """Scrape a UK news article into a NewsCrawlerItem."""
     loader = ItemLoader(item=NewsCrawlerItem(), response=response)
     loader.add_value("country", 'uk')
     loader.add_value("language", 'english')
     loader.nested_css("div.main-content-column").add_xpath("body", './div/p//text()')
     loader.add_xpath("headline", '//head/meta[@property="og:title"]/@content')

     # ISO-8601 publish time, or '' when ciso8601 cannot parse the value.
     def _to_iso(joined):
         when = ciso8601.parse_datetime(joined)
         return when.isoformat(sep='T') if when is not None else ''

     loader.nested_css('meta[property="article:published_time"]').add_xpath(
         'published_time', './@content', Compose(Join(), _to_iso))
     loader.add_xpath("category", '//head/meta[@property="article:section"]/@content')
     loader.add_xpath("keywords", '//head/meta[@name="keywords"]/@content')
     loader.add_value("url", response.url)
     loader.add_value("encoding", response.encoding)
     return loader.load_item()
예제 #10
0
 def parse_article(self, response):
     """Extract a BBC article into a NewsCrawlerItem."""
     loader = ItemLoader(item=NewsCrawlerItem(), response=response)
     loader.add_value('country', 'uk')
     loader.add_value('language', 'english')
     # Strip the " - BBC" suffix from the og:title headline.
     loader.nested_css('meta[property="og:title"]').add_xpath(
         'headline', './@content', re=r'(.*) - BBC')
     loader.add_value('url', response.url)

     # Publish time comes from embedded JSON-LD; normalise to ISO-8601.
     def _to_iso(joined):
         when = ciso8601.parse_datetime(joined)
         return when.isoformat(sep='T') if when is not None else ''

     loader.add_xpath(
         'published_time',
         '//*[@id="responsive-news"]/head/script[1]/text()',
         Compose(Join(), _to_iso),
         re=r'"datePublished": "(.*)"',
     )
     loader.nested_css('meta[property="article:section"]').add_xpath(
         'category', './@content')
     loader.add_value('encoding', response.encoding)
     loader.nested_css('div.story-body__inner').add_xpath('body', './p//text()')
     # NOTE(review): 'map-body' reads as a tag selector; if a CSS class was
     # intended it would need a leading dot — confirm against the live page.
     loader.nested_css('map-body').add_xpath('body', './p//text()')
     return loader.load_item()
예제 #11
0
    def parse_article(self, response):
        """Load a US article page into a NewsCrawlerItem."""
        loader = ItemLoader(item=NewsCrawlerItem(), response=response)
        loader.add_value('country', 'usa')
        loader.add_xpath('language', '//html/@lang')
        loader.nested_css('meta[property="og:title"]').add_xpath(
            'headline', './@content')
        loader.add_value('url', response.url)

        # Publish time normalised to ISO-8601 ('' when unparseable).
        def _to_iso(joined):
            parsed = ciso8601.parse_datetime(joined)
            return '' if parsed is None else parsed.isoformat(sep='T')

        loader.nested_css('meta[property="article:published_time"]').add_xpath(
            'published_time', './@content', Compose(Join(), _to_iso))
        # The section meta holds a comma-separated category list.
        loader.add_xpath(
            'category',
            '//head/meta[@property="article:section"]/@content',
            lambda vals: vals[0].split(',') if vals else None)
        loader.add_value('encoding', response.encoding)
        loader.nested_css('div[class="content-list-component text"]').add_xpath(
            'body', './p//text()')
        return loader.load_item()
예제 #12
0
 def parse_article(self, response):
     """Parse a yule.com.cn (中国娱乐网) article page into a NewsCrawlerItem.

     Fills channel/source metadata, a globally-unique numeric id derived
     from the URL, title, publish time and up to three preview pictures.
     """
     item = NewsCrawlerItem()
     item['channel'] = 3
     item['source'] = u'中国娱乐网'
     item['source_url'] = response.url
     # Extract the numeric article id from the URL. The dot before "html"
     # is escaped so it matches a literal '.' only — the previous pattern's
     # bare '.' matched any character.
     huxiu_article_id = self.unbind_variable(r'/(?P<article_id>\d+)\.html',
                                             'article_id',
                                             item['source_url'])
     # Global id = media-source prefix * 10**7 + the per-site article id.
     item['newsId'] = self.id_prex * 10000000 + int(huxiu_article_id)
     item['id'] = item['newsId']
     title_list = response.xpath(
         '/html/body/article/div[@class="articlecontent"]/h1/text()'
     ).extract()
     # Fall back to the <title> tag when the headline node is missing.
     item['title'] = title_list[0] if title_list else response.xpath(
         '/html/head/title/text()').extract()[0]
     time_str_list = response.xpath(
         '/html/body/article/div[@class="articlecontent"]/div[@class="tm"]/text()'
     ).extract()
     time_str = time_str_list[0].strip() if time_str_list else ''
     # Time line looks like: "日期:2015-06-23 来源:" — parsed by helper.
     item['publishTime'] = self.get_time(time_str)
     big_pic_list = response.xpath(
         '/html/body/article/div[@class="articlecontent"]/div[@class="content"]//'
         'img[contains(@src, "http://news.yule.com.cn/")]/@src').extract()
     # Keep up to three pictures as individual preview fields.
     if big_pic_list:
         item['picOne'] = big_pic_list[0]
     if len(big_pic_list) > 1:
         item['picTwo'] = big_pic_list[1]
     if len(big_pic_list) > 2:
         item['picThr'] = big_pic_list[2]
     pic_list = big_pic_list
     item['picListString'] = ','.join(pic_list)
     item['picList'] = pic_list
     item['isLarge'] = False
     return item
예제 #13
0
    def parse_news_detail_content(self, response):
        """Extract title, author, body text, image and dates from a news page."""
        # Body text is every <p> and <strong> text node, concatenated.
        context = "".join(
            response.xpath('//p/text()|//strong/text()').extract())
        yield NewsCrawlerItem(
            title=response.xpath('//h1/text()').extract_first(),
            author=response.xpath(
                '//div[@class="shareBar__info--author"]/text()').extract_first(),
            context=context,
            news_url=response.url,
            img_url=response.xpath(
                '//figure[@class="photo_center photo-story"]//img/@data-src'
            ).extract_first(),
            post_date=response.xpath(
                '//div[@class="shareBar__info--author"]//span/text()'
            ).extract_first(),
            create_date=datetime.now(),
        )