Exemplo n.º 1
0
    def parse(self, response):
        """Yield one ``hot_search_newsItem`` per Weibo hot-search row.

        Rows are the ``<tr action-type="hover">`` elements of *response*.
        Every scraped value falls back to ``""`` when its XPath matches
        nothing. ``message_url`` is the scraped link prefixed with
        ``http://s.weibo.com``.
        """

        def first(node, query):
            # First result of *query* evaluated on *node*, or "" if none.
            return node.xpath(query).extract_first("")

        for row in response.xpath('//tr[@action-type="hover"]'):
            item = hot_search_newsItem()
            source = "新浪热搜"
            rank = first(
                row,
                './/span[@class="search_icon_rankntop"]/em/text() | .//span[@class="search_icon_rankn"]/em/text()'
            )
            keyword = first(row, './/p[@class="star_name"]/a/text()')
            tag = first(row, './/p[@class="star_name"]/i/text()')
            # Rows tagged "荐" take their link from the ``href_to``
            # attribute instead of ``href``.
            if tag == "荐":
                link = first(row, './/p[@class="star_name"]/a/@href_to')
            else:
                link = first(row, './/p[@class="star_name"]/a/@href')
            popularity = first(row, './/p[@class="star_num"]/span/text()')
            # Naive local timestamp, taken once per row.
            now = datetime.datetime.now()

            item['title'] = keyword
            item['desc'] = ""
            item['search_index'] = popularity
            item['create_time'] = now.strftime("%Y-%m-%d %H:%M:%S")
            item['news_origin'] = source
            item['origin_type'] = "热搜榜"
            item['message_type'] = tag
            item['message_trend'] = ""
            item['message_url'] = "http://s.weibo.com" + link
            item['ranking'] = rank

            yield item
Exemplo n.º 2
0
 def parse(self, response):
     """Yield one ``hot_search_newsItem`` per ranked Weibo hot-search row.

     Rows come from ``//div[@class="data"]//tbody/tr``. A row is emitted
     only when its ``ranktop`` cell yields non-empty text; all other rows
     are skipped. Missing values default to ``""``.
     """
     for row in response.xpath('//div[@class="data"]//tbody/tr'):
         rank = row.xpath(
             './td[contains(@class,"ranktop")]//text()').extract_first("")
         if not rank:
             # No rank text in this row: nothing to emit.
             continue

         keyword_cell = row.xpath('./td[@class="td-02"]')
         keyword = keyword_cell.xpath('./a/text()').extract_first("")
         link = keyword_cell.xpath('./a/@href').extract_first("")
         tag = keyword_cell.xpath('./i/text()').extract_first("")
         popularity = row.xpath(
             './td[@class="td-03"]//text()').extract_first("")
         source = "新浪热搜"
         stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

         # Ordered (field, value) pairs; assignment order matches the
         # order in which the fields are populated on the item.
         pairs = (
             ('title', keyword),
             ('desc', ""),
             ('search_index', popularity),
             ('create_time', stamp),
             ('news_origin', source),
             ('origin_type', "热搜榜"),
             ('message_type', tag),
             ('message_trend', ""),
             ('message_url', "http://s.weibo.com" + link),
             ('ranking', rank),
         )
         item = hot_search_newsItem()
         for field, value in pairs:
             item[field] = value
         yield item
Exemplo n.º 3
0
 def parse_toplist(self, response):
     """Yield one ``hot_search_newsItem`` per keyword row of a Baidu top list.

     Rows come from ``//table[@class="list-table"]/tr``. A row is emitted
     only when its keyword cell has non-empty link text; other rows are
     skipped without building an item. ``origin_type`` is read from
     ``response.meta["top_name"]`` (``""`` when absent). Missing scraped
     values default to ``""``.
     """
     # CSS class of the trend icon -> normalised trend label. Any other
     # non-empty class is treated as "up".
     trend_labels = {"icon-fall": "down", "icon-fair": "fair"}

     for row in response.xpath('//table[@class="list-table"]/tr'):
         # Keyword text; queried once and reused as the "is this a
         # keyword row" check.
         title = row.xpath(
             './td[@class="keyword"]/a[1]/text()').extract_first("")
         if not title:
             continue

         # Rank
         rank = row.xpath(
             './td[@class="first"]/span/text()').extract_first("")
         # Related URL
         url = row.xpath(
             './td[@class="keyword"]/a[1]/@href').extract_first("")
         # Search index
         popularity = row.xpath(
             './td[@class="last"]/span/text()').extract_first("")
         # Keyword type: any class on the marker span is reported as "新".
         type_class = row.xpath(
             './td[@class="keyword"]/span/@class').extract_first("")
         keyword_type = "新" if type_class else ""
         # Trend: empty when the row has no trend icon class.
         trend_class = row.xpath(
             './td[@class="last"]/span/@class').extract_first("")
         trend = trend_labels.get(trend_class, "up") if trend_class else ""

         item = hot_search_newsItem()
         item['origin_type'] = response.meta.get("top_name", "")
         item['title'] = title
         item['desc'] = ""
         item['news_origin'] = "百度热搜"
         item['create_time'] = datetime.datetime.now().strftime(
             "%Y-%m-%d %H:%M:%S")
         item['message_url'] = url
         item['ranking'] = rank
         item['search_index'] = popularity
         item['message_type'] = keyword_type
         item['message_trend'] = trend
         yield item
Exemplo n.º 4
0
    def parse(self, response):
        """Yield one ``hot_search_newsItem`` per entry of a Sogou list page.

        Entries are the ``<li>`` children of ``ul.pub-list``. The text of
        the currently selected ``div.snb`` link is read once per page and
        stored in every item's ``message_type``. Missing values default
        to ``""``.

        NOTE(review): this method does not populate ``origin_type`` or
        ``message_trend`` -- confirm that consumers of the item do not
        require them.
        """
        page_category = response.xpath(
            '//div[@class="snb"]/a[@class="cur"]/text()').extract_first("")

        for entry in response.xpath('//ul[@class="pub-list"]/li'):
            item = hot_search_newsItem()

            rank_text = entry.xpath(
                './span[@class="s1"]//i/text()').extract_first("")
            headline = entry.xpath(
                './span[@class="s2"]/p/a/text()').extract_first("")
            link = entry.xpath(
                './span[@class="s2"]/p[1]/a/@href').extract_first("")
            summary = entry.xpath(
                './span[@class="s2"]/p[2]/text()').extract_first("")
            index_text = entry.xpath(
                './span[@class="s3"]/text()').extract_first("")
            stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

            # Ordered (field, value) pairs; assigned in this order.
            pairs = (
                ('title', headline),
                ('desc', summary),
                ('news_origin', "搜狗热搜"),
                ('create_time', stamp),
                ('message_url', link),
                ('ranking', rank_text),
                ('message_type', page_category),
                ('search_index', index_text),
            )
            for field, value in pairs:
                item[field] = value

            yield item
Exemplo n.º 5
0
    def parse(self, response):
        """Yield one ``hot_search_newsItem`` per entry of a Sogou list page.

        Entries are the ``<li>`` children of ``ul.pub-list``. The text of
        the currently selected ``div.snb`` link becomes each item's
        ``origin_type``; ``create_time`` is taken from
        ``response.meta["create_time"]`` (``None`` when the key is absent).
        Missing scraped values default to ``""``.
        """

        def first(node, query):
            # First result of *query* evaluated on *node*, or "" if none.
            return node.xpath(query).extract_first("")

        category = first(response, '//div[@class="snb"]/a[@class="cur"]/text()')

        for entry in response.xpath('//ul[@class="pub-list"]/li'):
            item = hot_search_newsItem()

            rank_text = first(entry, './span[@class="s1"]//i/text()')
            headline = first(entry, './span[@class="s2"]/p/a/text()')
            link = first(entry, './span[@class="s2"]/p[1]/a/@href')
            summary = first(entry, './span[@class="s2"]/p[2]/text()')
            index_text = first(entry, './span[@class="s3"]/text()')
            # Raw CSS class of the icon inside the index cell; stored as-is.
            trend_class = first(entry, './span[@class="s3"]/i/@class')

            item['title'] = headline
            item['desc'] = summary
            item['news_origin'] = "搜狗热搜"
            item['create_time'] = response.meta.get("create_time")
            item['message_url'] = link
            item['ranking'] = rank_text
            item['origin_type'] = category
            item['search_index'] = index_text
            item['message_type'] = ""
            item['message_trend'] = trend_class

            yield item