def parse_article(self, response):
    article = ItemLoader(item=NewsCrawlerItem(), response=response)
    article.add_value('country', 'usa')
    article.add_xpath('language', '//html/@lang')
    article.nested_css('meta[property="og:title"]').add_xpath(
        'headline', './@content')
    article.add_value('url', response.url)

    # Input processor: normalize the published time to ISO 8601.
    # Note: ciso8601 1.x returns None for unparsable input; 2.x raises
    # ValueError instead and would need a try/except here.
    def to_iso8601(value):
        parsed = ciso8601.parse_datetime(value)
        return '' if parsed is None else parsed.isoformat(sep='T')

    time_in = Compose(Join(), to_iso8601)
    article.nested_css(
        'meta[property="og:article:published_time"]').add_xpath(
            'published_time',
            './@content',
            time_in,
        )
    article.add_xpath('keywords',
                      '//head/meta[@property="keywords"]/@content',
                      lambda v: v[0].split(',') if v else None)
    article.add_value('encoding', response.encoding)
    article.nested_css('span[id="article-text"]').add_xpath(
        'body', './p//text()')
    return article.load_item()
def parse_article(self, response):
    # Handle an article page.
    item = NewsCrawlerItem()
    item['channel'] = 4
    item['source'] = u'汽车之家'  # Autohome
    item['source_url'] = response.url
    # Parse the article ID out of the article URL.
    article_id = self.unbind_variable('/(?P<article_id>\\d+).html',
                                      'article_id', item['source_url'])
    # Build a globally unique news ID from the source's ID prefix plus
    # the per-source article ID.
    item['newsId'] = self.id_prex * 10000000 + int(article_id)
    item['id'] = item['newsId']
    title_list = response.xpath(
        '//div[@class="article-title"]/h1/text()').extract()
    item['title'] = title_list[0] if title_list else response.xpath(
        '/html/head/title/text()').extract()[0]
    time_str_list = response.xpath(
        '//div[@class="article-title"]/p/text()').extract()
    time_str = time_str_list[0].strip() if time_str_list else ''
    # Autohome's timestamp line looks like: 类型:原创 2015-07-05 汽车之家 编辑:
    item['publishTime'] = self.get_time(time_str)
    big_pic_list = response.xpath(
        '//div[@class="article-content"]/p/img[contains(@data-src, '
        '".autoimg.cn/")]/@data-src').extract()
    if big_pic_list:
        item['picOne'] = big_pic_list[0]
    if len(big_pic_list) > 1:
        item['picTwo'] = big_pic_list[1]
    if len(big_pic_list) > 2:
        item['picThr'] = big_pic_list[2]
    pic_list = big_pic_list
    item['picListString'] = ','.join(pic_list)
    item['picList'] = pic_list
    item['isLarge'] = True
    return item
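# Several spiders in this section call self.unbind_variable(), but its
# definition is not shown here. A minimal sketch of what it likely does,
# assuming it extracts one named regex group from a string and returns
# None when nothing matches:
import re

def unbind_variable(self, pattern, key, string):
    # e.g. unbind_variable('/(?P<article_id>\\d+).html', 'article_id', url)
    match = re.search(pattern, string)
    return match.group(key) if match else None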
def parse_article(self, response):
    article = ItemLoader(item=NewsCrawlerItem(), response=response)
    article.add_value('country', 'middle east')
    article.add_value('language', 'english')
    article.add_value('stopwords', ['al', 'jazeera'])
    article.nested_css('div.article-body').add_xpath('body', './p//text()')
    article.nested_css('meta[property="og:title"]').add_xpath(
        'headline', './@content')

    # Input processor: normalize the published time to ISO 8601.
    # datetime.strptime() raises ValueError on a mismatch (it never
    # returns None), so guard with try/except instead of an is-None check.
    def to_iso8601(value):
        try:
            return datetime.strptime(
                value, '%a, %d %B %Y %H:%M:%S %Z').isoformat(sep='T')
        except ValueError:
            return ''

    published_time_in = Compose(Join(), to_iso8601)
    article.nested_css('meta[name="LastModifiedDate"]').add_xpath(
        'published_time',
        './@content',
        published_time_in,
    )
    article.nested_css('span.article-topics').add_xpath(
        'category', './/text()')
    article.nested_css('meta[property="ContentType"]').add_xpath(
        'category', './@content')
    article.add_value('url', response.url)
    article.add_value('encoding', response.encoding)
    return article.load_item()
def parse_article(self, response):
    # Handle an article page.
    item = NewsCrawlerItem()
    item['channel'] = 2
    item['source'] = u'虎扑'  # Hupu
    item['source_url'] = response.url
    # Parse the Hupu article ID out of the article URL.
    article_id = self.unbind_variable('/(?P<article_id>\\d+).html',
                                      'article_id', item['source_url'])
    # Build a globally unique news ID from the source's ID prefix plus
    # the per-source article ID.
    item['newsId'] = self.id_prex * 10000000 + int(article_id)
    item['id'] = item['newsId']
    title_list = response.xpath(
        '//header[@class="artical-title"]//h1[@class="headline"]/text()'
    ).extract()
    item['title'] = title_list[0] if title_list else response.xpath(
        '/html/head/title/text()').extract()[0]
    time_str_list = response.xpath(
        '//header[@class="artical-title"]//span[@class="times"]/text()'
    ).extract()
    time_str = time_str_list[0].strip() if time_str_list else ''
    # Hupu timestamps are relative or short-form, e.g. 48分钟前
    # (48 minutes ago), 5小时前 (5 hours ago), or 07-03 19:24.
    item['publishTime'] = self.get_hupu_time(time_str)
    item['tags'] = response.xpath(
        '//section[@class="detail-content"]//div[contains(@class, '
        '"hot-tags")]//div[@class="swiper-slide"]/a/text()').extract()
    big_pic_list = response.xpath(
        '//section[@class="detail-content"]/article[@class='
        '"article-content"]/img/@src').extract()
    if big_pic_list:
        item['picOne'] = big_pic_list[0]
    if len(big_pic_list) > 1:
        item['picTwo'] = big_pic_list[1]
    if len(big_pic_list) > 2:
        item['picThr'] = big_pic_list[2]
    pic_list = big_pic_list
    item['picListString'] = ','.join(pic_list)
    item['picList'] = pic_list
    item['isLarge'] = True
    return item
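# get_hupu_time() is referenced above but not defined in this section.
# A hedged sketch, assuming it converts Hupu's relative/short timestamps
# ('48分钟前', '5小时前', '07-03 19:24') to epoch seconds, matching the
# int(time.mktime(...)) convention used by the Huxiu spider below:
import re
import time
from datetime import datetime, timedelta

def get_hupu_time(self, time_str):
    now = datetime.now()
    minutes = re.match(u'(\\d+)分钟前', time_str)  # 'N minutes ago'
    if minutes:
        then = now - timedelta(minutes=int(minutes.group(1)))
        return int(time.mktime(then.timetuple()))
    hours = re.match(u'(\\d+)小时前', time_str)  # 'N hours ago'
    if hours:
        then = now - timedelta(hours=int(hours.group(1)))
        return int(time.mktime(then.timetuple()))
    try:
        # Short form like '07-03 19:24' omits the year; assume this year.
        parsed = datetime.strptime(time_str, '%m-%d %H:%M')
        return int(time.mktime(parsed.replace(year=now.year).timetuple()))
    except ValueError:
        return int(time.mktime(now.timetuple()))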
def parse_article(self, response):
    article = ItemLoader(item=NewsCrawlerItem(), response=response)
    article.add_value('country', 'usa')
    article.add_value('language', 'english')
    article.nested_css('meta[property="og:title"]').add_xpath(
        'headline', './@content')
    article.add_value('url', response.url)

    # Input processor: normalize the published time to ISO 8601.
    def to_iso8601(value):
        parsed = ciso8601.parse_datetime(value)
        return '' if parsed is None else parsed.isoformat(sep='T')

    time_in = Compose(Join(), to_iso8601)
    article.nested_css('meta[name="pubdate"]').add_xpath(
        'published_time',
        './@content',
        time_in,
    )
    article.add_xpath('category', '//head/meta[@name="section"]/@content')
    article.add_xpath('keywords',
                      '//head/meta[@itemprop="keywords"]/@content',
                      re=r'(.*) -')
    article.add_value('encoding', response.encoding)
    article.nested_css('div.pg-rail-tall__body').nested_css(
        'div.l-container').add_xpath(
            'body', './/div[re:test(@class, "zn-.*")]/text()')
    return article.load_item()
def parse_item(self, response):
    # TODO (item pipeline): add a stage that drops items whose body is
    # too short, and items that are media (e.g. video) pages.
    item = NewsCrawlerItem()
    item["url"] = response.url
    item["content"] = response.body
    yield item
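# The TODO above asks for a pipeline that filters out short or media-only
# items. A minimal sketch under assumed conventions: the class name and
# MIN_BODY_LENGTH threshold are illustrative, and video pages are crudely
# recognized by their URL path.
from scrapy.exceptions import DropItem

class FilterThinItemsPipeline(object):
    MIN_BODY_LENGTH = 200  # assumed threshold; tune per source

    def process_item(self, item, spider):
        content = item.get('content') or b''
        if len(content) < self.MIN_BODY_LENGTH:
            raise DropItem('body too short: %s' % item.get('url'))
        if '/video/' in (item.get('url') or ''):  # crude media check
            raise DropItem('media page: %s' % item.get('url'))
        return item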
def parse(self, response):
    for row in response.xpath(
            r"//div[@class='col-md-6 further-news-container latest-news-padding']"
    ):
        loader = ItemLoader(item=NewsCrawlerItem(), selector=row)
        loader.add_xpath(
            "DateTime",
            r"div/div/span[@class='warmGrey source-and-publishdate']/text()[2]"
        )
        loader.add_xpath("News", r"div/div/a[@class='news-link']/text()")
        yield loader.load_item()
def parse_article(self, response):
    # Handle an article page.
    item = NewsCrawlerItem()
    item['channel'] = 1
    item['source'] = u'虎嗅'  # Huxiu
    # Normalize to the first page of the article.
    item['source_url'] = response.url if response.url.endswith(
        '/1.html') else response.url + '/1.html'
    # Parse the Huxiu article ID out of the article URL.
    huxiu_article_id = self.unbind_variable(
        '/article/(?P<article_id>\\d+)', 'article_id', item['source_url'])
    # Build a globally unique news ID from the source's ID prefix plus
    # the Huxiu article ID.
    item['newsId'] = self.id_prex * 1000000 + int(huxiu_article_id)
    item['id'] = item['newsId']
    title_list = response.xpath(
        '/html/body//div[@id="page_article"]//h1/text()').extract()
    item['title'] = title_list[0] if title_list else response.xpath(
        '/html/head/title/text()').extract()[0]
    time_str_list = response.xpath(
        '/html/body//div[@id="page_article"]//time[@class="pull-left time"]/text()'
    ).extract()
    time_str = time_str_list[0].strip() if time_str_list else ''
    # Huxiu timestamps look like: 2015-06-18 07:55
    item['publishTime'] = int(
        time.mktime(datetime.strptime(time_str, '%Y-%m-%d %H:%M').timetuple())
        if time_str else time.mktime(datetime.now().timetuple()))
    item['tags'] = response.xpath(
        '/html/body//div[@id="page_article"]//ul[@class="pull-left list-inline tag-box"]/li/a/text()'
    ).extract()
    big_pic_list = response.xpath(
        '/html/body//div[@id="page_article"]//div[@class="big-pic"]/img/@src'
    ).extract()
    if big_pic_list:
        item['picOne'] = big_pic_list[0]
    if len(big_pic_list) > 1:
        item['picTwo'] = big_pic_list[1]
    if len(big_pic_list) > 2:
        item['picThr'] = big_pic_list[2]
    pic_list = response.xpath(
        '/html/body//div[@id="page_article"]//img[contains(@src,"huxiu.com/article/content/")]/@src'
    ).extract()
    item['picListString'] = ','.join(pic_list)
    item['picList'] = pic_list
    item['isLarge'] = True
    return item
def parse_article(self, response): article = ItemLoader(item=NewsCrawlerItem(), response=response) article.add_value("country", 'uk') article.add_value("language", 'english') article.nested_css("div.main-content-column").add_xpath("body", './div/p//text()') article.add_xpath("headline", '//head/meta[@property="og:title"]/@content') # Function to parse published time to iso6801 time_in = Compose( Join(), lambda v: '' if (ciso8601.parse_datetime(v) is None) else ciso8601.parse_datetime(v).isoformat(sep='T') ) article.nested_css('meta[property="article:published_time"]').add_xpath( 'published_time', './@content', time_in, ) article.add_xpath("category", '//head/meta[@property="article:section"]/@content') article.add_xpath("keywords", '//head/meta[@name="keywords"]/@content') article.add_value("url", response.url) article.add_value("encoding", response.encoding) return article.load_item()
def parse_article(self, response):
    article = ItemLoader(item=NewsCrawlerItem(), response=response)
    article.add_value('country', 'uk')
    article.add_value('language', 'english')
    article.nested_css('meta[property="og:title"]').add_xpath(
        'headline', './@content', re=r'(.*) - BBC')
    article.add_value('url', response.url)

    # Input processor: normalize the published time to ISO 8601.
    def to_iso8601(value):
        parsed = ciso8601.parse_datetime(value)
        return '' if parsed is None else parsed.isoformat(sep='T')

    time_in = Compose(Join(), to_iso8601)
    # The publish date is embedded in a JSON <script> block, so pull it
    # out with a regex rather than a meta tag.
    article.add_xpath(
        'published_time',
        '//*[@id="responsive-news"]/head/script[1]/text()',
        time_in,
        re=r'"datePublished": "(.*)"',
    )
    article.nested_css('meta[property="article:section"]').add_xpath(
        'category', './@content')
    article.add_value('encoding', response.encoding)
    article.nested_css('div.story-body__inner').add_xpath(
        'body', './p//text()')
    article.nested_css('map-body').add_xpath('body', './p//text()')
    return article.load_item()
def parse_article(self, response):
    article = ItemLoader(item=NewsCrawlerItem(), response=response)
    article.add_value('country', 'usa')
    # Alternatively, derive the country from the URL's TLD:
    # article.add_value('country',
    #                   re.search(r'www\..*\.([a-z]*)/', response.url).group(1))
    article.add_xpath('language', '//html/@lang')
    article.nested_css('meta[property="og:title"]').add_xpath(
        'headline', './@content')
    article.add_value('url', response.url)

    # Input processor: normalize the published time to ISO 8601.
    def to_iso8601(value):
        parsed = ciso8601.parse_datetime(value)
        return '' if parsed is None else parsed.isoformat(sep='T')

    time_in = Compose(Join(), to_iso8601)
    article.nested_css('meta[property="article:published_time"]').add_xpath(
        'published_time',
        './@content',
        time_in,
    )
    article.add_xpath('category',
                      '//head/meta[@property="article:section"]/@content',
                      lambda v: v[0].split(',') if v else None)
    article.add_value('encoding', response.encoding)
    article.nested_css('div[class="content-list-component text"]').add_xpath(
        'body', './p//text()')
    return article.load_item()
def parse_article(self, response):
    # Handle an article page.
    item = NewsCrawlerItem()
    item['channel'] = 3
    item['source'] = u'中国娱乐网'  # yule.com.cn
    item['source_url'] = response.url
    # Parse the article ID out of the article URL.
    article_id = self.unbind_variable('/(?P<article_id>\\d+).html',
                                      'article_id', item['source_url'])
    # Build a globally unique news ID from the source's ID prefix plus
    # the per-source article ID.
    item['newsId'] = self.id_prex * 10000000 + int(article_id)
    item['id'] = item['newsId']
    title_list = response.xpath(
        '/html/body/article/div[@class="articlecontent"]/h1/text()'
    ).extract()
    item['title'] = title_list[0] if title_list else response.xpath(
        '/html/head/title/text()').extract()[0]
    time_str_list = response.xpath(
        '/html/body/article/div[@class="articlecontent"]/div[@class="tm"]/text()'
    ).extract()
    time_str = time_str_list[0].strip() if time_str_list else ''
    # Timestamp line looks like: 日期:2015-06-23 来源:
    item['publishTime'] = self.get_time(time_str)
    big_pic_list = response.xpath(
        '/html/body/article/div[@class="articlecontent"]/div[@class="content"]//'
        'img[contains(@src, "http://news.yule.com.cn/")]/@src').extract()
    if big_pic_list:
        item['picOne'] = big_pic_list[0]
    if len(big_pic_list) > 1:
        item['picTwo'] = big_pic_list[1]
    if len(big_pic_list) > 2:
        item['picThr'] = big_pic_list[2]
    pic_list = big_pic_list
    item['picListString'] = ','.join(pic_list)
    item['picList'] = pic_list
    item['isLarge'] = False
    return item
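# get_time() is referenced by the Autohome and yule.com.cn spiders above
# but not defined in this section. A hedged sketch, assuming it pulls the
# first YYYY-MM-DD date out of lines like '日期:2015-06-23 来源:' or
# '类型:原创 2015-07-05 汽车之家 编辑:' and returns epoch seconds,
# falling back to the current time when no date is found:
import re
import time
from datetime import datetime

def get_time(self, time_str):
    match = re.search(r'(\d{4}-\d{2}-\d{2})', time_str)
    if not match:
        return int(time.mktime(datetime.now().timetuple()))
    parsed = datetime.strptime(match.group(1), '%Y-%m-%d')
    return int(time.mktime(parsed.timetuple()))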
def parse_news_detail_content(self, response):
    news_url = response.url
    title = response.xpath('//h1/text()').extract_first()
    author = response.xpath(
        '//div[@class="shareBar__info--author"]/text()').extract_first()
    context_list = response.xpath('//p/text()|//strong/text()').extract()
    img_url = response.xpath(
        '//figure[@class="photo_center photo-story"]//img/@data-src'
    ).extract_first()
    post_date = response.xpath(
        '//div[@class="shareBar__info--author"]//span/text()'
    ).extract_first()
    create_date = datetime.now()
    context = "".join(context_list)
    news = NewsCrawlerItem(
        title=title,
        author=author,
        context=context,
        news_url=news_url,
        img_url=img_url,
        post_date=post_date,
        create_date=create_date,
    )
    yield news