def parse(self, response):
    """Yield one hot-search item for every hoverable row of the ranking table.

    Each ``<tr action-type="hover">`` row supplies the rank, keyword, link,
    tag text and popularity figure. Any node that is missing falls back to
    an empty string. Rows whose tag text is "荐" take their link from the
    ``href_to`` attribute instead of ``href``.

    Args:
        response: Response whose body contains the ranking table.

    Yields:
        hot_search_newsItem: one populated item per table row.
    """
    source_name = "新浪热搜"
    for row in response.xpath('//tr[@action-type="hover"]'):
        item = hot_search_newsItem()

        rank = row.xpath(
            './/span[@class="search_icon_rankntop"]/em/text() | .//span[@class="search_icon_rankn"]/em/text()'
        ).extract_first("")
        keyword = row.xpath(
            './/p[@class="star_name"]/a/text()').extract_first("")
        tag = row.xpath(
            './/p[@class="star_name"]/i/text()').extract_first("")
        # Rows tagged "荐" read the link from @href_to rather than @href.
        link = row.xpath(
            './/p[@class="star_name"]/a/@href_to'
            if tag == "荐"
            else './/p[@class="star_name"]/a/@href'
        ).extract_first("")
        popularity = row.xpath(
            './/p[@class="star_num"]/span/text()').extract_first("")

        # Fields are assigned in a fixed order, one pair at a time.
        assignments = (
            ('title', keyword),
            ('desc', ""),
            ('search_index', popularity),
            ('create_time',
             datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
            ('news_origin', source_name),
            ('origin_type', "热搜榜"),
            ('message_type', tag),
            ('message_trend', ""),
            ('message_url', "http://s.weibo.com" + link),
            ('ranking', rank),
        )
        for field, value in assignments:
            item[field] = value
        yield item
def parse(self, response):
    """Yield one hot-search item for every ranked row of the data table.

    Rows under ``div.data`` whose rank cell (a ``td`` with "ranktop" in its
    class) has no text are skipped. For the remaining rows the keyword,
    link, popularity figure and tag text are extracted, each defaulting to
    an empty string when the node is missing.

    Args:
        response: Response whose body contains the ranking table.

    Yields:
        hot_search_newsItem: one populated item per ranked row.
    """
    for row in response.xpath('//div[@class="data"]//tbody/tr'):
        rank = row.xpath(
            './td[contains(@class,"ranktop")]//text()').extract_first("")
        if not rank:
            # No rank text in this row: nothing to emit.
            continue

        keyword = row.xpath(
            './td[@class="td-02"]/a/text()').extract_first("")
        link = row.xpath(
            './td[@class="td-02"]/a/@href').extract_first("")
        popularity = row.xpath(
            './td[@class="td-03"]//text()').extract_first("")
        tag = row.xpath(
            './td[@class="td-02"]/i/text()').extract_first("")
        source_name = "新浪热搜"

        item = hot_search_newsItem()
        # Fields are assigned in a fixed order, one pair at a time.
        assignments = (
            ('title', keyword),
            ('desc', ""),
            ('search_index', popularity),
            ('create_time',
             datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
            ('news_origin', source_name),
            ('origin_type', "热搜榜"),
            ('message_type', tag),
            ('message_trend', ""),
            ('message_url', "http://s.weibo.com" + link),
            ('ranking', rank),
        )
        for field, value in assignments:
            item[field] = value
        yield item
def parse_toplist(self, response):
    """Yield one hot-search item per keyword row of a ``list-table`` ranking.

    Rows of ``<table class="list-table">`` that have no keyword link text
    are skipped. For the remaining rows the rank, keyword, link, search
    index, "new" marker and trend are extracted; every extraction defaults
    to an empty string when the node is missing.

    The trend is derived from the class of the span in the last cell:
    "icon-fall" becomes "down", "icon-fair" becomes "fair", any other
    non-empty class becomes "up", and an absent class stays "".

    Args:
        response: Response whose body contains the ranking table. Its
            ``meta["top_name"]`` (empty string when absent) is stored as
            the item's ``origin_type``.

    Yields:
        hot_search_newsItem: one populated item per keyword row.
    """
    # Any non-empty icon class other than these two is reported as "up".
    trend_by_icon = {"icon-fall": "down", "icon-fair": "fair"}

    for row in response.xpath('//table[@class="list-table"]/tr'):
        # Keyword text, evaluated once and used both as guard and as title.
        title = row.xpath(
            './td[@class="keyword"]/a[1]/text()').extract_first("")
        if not title:
            continue

        # Rank
        rank = row.xpath(
            './td[@class="first"]/span/text()').extract_first("")
        # Related URL
        url = row.xpath(
            './td[@class="keyword"]/a[1]/@href').extract_first("")
        # Search index
        search_index = row.xpath(
            './td[@class="last"]/span/text()').extract_first("")
        # Keyword type: any class on the keyword cell's span marks it "新".
        keyword_type = row.xpath(
            './td[@class="keyword"]/span/@class').extract_first("")
        if keyword_type:
            keyword_type = "新"
        # Trend
        trend = row.xpath(
            './td[@class="last"]/span/@class').extract_first("")
        if trend:
            trend = trend_by_icon.get(trend, "up")

        # Built only for rows that pass the guard above.
        item = hot_search_newsItem()
        item['origin_type'] = response.meta.get("top_name", "")
        item['title'] = title
        item['desc'] = ""
        item['news_origin'] = "百度热搜"
        item['create_time'] = datetime.datetime.now().strftime(
            "%Y-%m-%d %H:%M:%S")
        item['message_url'] = url
        item['ranking'] = rank
        item['search_index'] = search_index
        item['message_type'] = keyword_type
        item['message_trend'] = trend
        yield item
def parse(self, response):
    """Yield one hot-search item per entry of the ``pub-list`` ranking.

    The text of the currently selected tab (``div.snb > a.cur``) is read
    once and stored on every item as ``message_type``. Each ``<li>``
    supplies the rank, title, link, description and search index, all
    defaulting to an empty string when the node is missing.

    Args:
        response: Response whose body contains the ranking list.

    Yields:
        hot_search_newsItem: one populated item per list entry.
    """
    entries = response.xpath('//ul[@class="pub-list"]/li')
    current_tab = response.xpath(
        '//div[@class="snb"]/a[@class="cur"]/text()').extract_first("")

    # NOTE(review): 'origin_type' and 'message_trend' are never set on the
    # item here -- confirm downstream consumers tolerate the missing keys.
    for entry in entries:
        item = hot_search_newsItem()

        rank = entry.xpath(
            './span[@class="s1"]//i/text()').extract_first("")
        headline = entry.xpath(
            './span[@class="s2"]/p/a/text()').extract_first("")
        link = entry.xpath(
            './span[@class="s2"]/p[1]/a/@href').extract_first("")
        description = entry.xpath(
            './span[@class="s2"]/p[2]/text()').extract_first("")
        index_text = entry.xpath(
            './span[@class="s3"]/text()').extract_first("")

        # Fields are assigned in a fixed order, one pair at a time.
        assignments = (
            ('title', headline),
            ('desc', description),
            ('news_origin', "搜狗热搜"),
            ('create_time',
             datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
            ('message_url', link),
            ('ranking', rank),
            ('message_type', current_tab),
            ('search_index', index_text),
        )
        for field, value in assignments:
            item[field] = value
        yield item
def parse(self, response):
    """Yield one hot-search item per entry of the ``pub-list`` ranking.

    The text of the currently selected tab (``div.snb > a.cur``) is read
    once and stored on every item as ``origin_type``. Each ``<li>``
    supplies the rank, title, link, description, search index and the
    class of the trend icon, all defaulting to an empty string when the
    node is missing. ``create_time`` is copied from
    ``response.meta["create_time"]`` (``None`` when absent).

    Args:
        response: Response whose body contains the ranking list.

    Yields:
        hot_search_newsItem: one populated item per list entry.
    """
    entries = response.xpath('//ul[@class="pub-list"]/li')
    current_tab = response.xpath(
        '//div[@class="snb"]/a[@class="cur"]/text()').extract_first("")

    for entry in entries:
        item = hot_search_newsItem()

        rank = entry.xpath(
            './span[@class="s1"]//i/text()').extract_first("")
        headline = entry.xpath(
            './span[@class="s2"]/p/a/text()').extract_first("")
        link = entry.xpath(
            './span[@class="s2"]/p[1]/a/@href').extract_first("")
        description = entry.xpath(
            './span[@class="s2"]/p[2]/text()').extract_first("")
        index_text = entry.xpath(
            './span[@class="s3"]/text()').extract_first("")
        # The raw class attribute of the icon is stored as the trend.
        trend_class = entry.xpath(
            './span[@class="s3"]/i/@class').extract_first("")

        # Fields are assigned in a fixed order, one pair at a time.
        assignments = (
            ('title', headline),
            ('desc', description),
            ('news_origin', "搜狗热搜"),
            ('create_time', response.meta.get("create_time")),
            ('message_url', link),
            ('ranking', rank),
            ('origin_type', current_tab),
            ('search_index', index_text),
            ('message_type', ""),
            ('message_trend', trend_class),
        )
        for field, value in assignments:
            item[field] = value
        yield item