Exemplo n.º 1
0
 def parse(self, response):
     """Yield one ``JrcgItem`` per row of the ``rank_data`` table.

     Rows are ranked by their position in the table, starting at 1. The
     title and link are read from the anchor inside the first cell of each
     row, and every item is tagged with this spider's ``name``.
     """
     rows = response.xpath('.//table[@id="rank_data"]/tbody/tr')
     for position, row in enumerate(rows, start=1):
         title = row.xpath('.//td[1]/span/a/text()').extract_first()
         href = row.xpath('.//td[1]/span/a/@href').extract_first()

         entry = JrcgItem()
         entry['rank'] = position
         entry['title'] = title
         entry['link'] = href
         entry['name'] = self.name
         yield entry
Exemplo n.º 2
0
 def parse(self, response):
     """Decode the JSON response body and yield a ``JrcgItem`` per element.

     Each element is read through its ``title``, ``url`` and ``replies``
     keys; the rank is the element's 1-based position in the decoded
     sequence.
     """
     topics = json.loads(response.text)
     for position, topic in enumerate(topics, start=1):
         entry = JrcgItem()
         entry['rank'] = position
         entry['title'] = topic['title']
         entry['link'] = topic['url']
         entry['count'] = topic['replies']
         entry['name'] = 'v2ex_zuirezhuti'
         yield entry
Exemplo n.º 3
0
 def parse(self, response):
     """Yield a ``JrcgItem`` for every ``<li>`` of the league content list.

     The rank is parsed as an integer from the entry's ``<span>`` text, the
     title is the stripped anchor text, and the link is the response URL
     minus its last character followed by the anchor's ``href``.
     """
     entries = response.xpath(
         '//div[@class="index_content_right_league_content"]/ul/li')
     # Dropping the final character of the URL before appending the href
     # presumably avoids a doubled "/" -- TODO confirm against the site.
     base_url = response.url[:-1]
     for node in entries:
         rank_text = node.xpath('.//span/text()').extract_first()
         title_text = node.xpath('.//a/text()').extract_first()
         href = node.xpath('.//a/@href').extract_first()

         entry = JrcgItem()
         entry['rank'] = int(rank_text)
         entry['title'] = title_text.strip()
         entry['link'] = base_url + href
         entry['name'] = 'meizhuangtoutiao'
         yield entry
Exemplo n.º 4
0
 def parse(self, response):
     """Yield a ``JrcgItem`` per topic found in the JSON response body.

     Topics are read from ``data -> bang_topic -> topic_list`` and ranked by
     their 1-based position in that list. Name, URL and discussion count
     come from the ``topic_name``, ``topic_url`` and ``discuss_num`` keys.
     """
     payload = json.loads(response.text)
     topics = payload['data']['bang_topic']['topic_list']
     for position, topic in enumerate(topics, start=1):
         entry = JrcgItem()
         entry['rank'] = position
         entry['title'] = topic['topic_name']
         entry['link'] = topic['topic_url']
         entry['count'] = topic['discuss_num']
         entry['name'] = 'baidu_tiebareyibang'
         yield entry
Exemplo n.º 5
0
 def parse(self, response):
     """Extract the JSON object embedded in the body and yield its entries.

     The text from the first ``{`` to the last ``}`` on a line is decoded as
     JSON, and entries are read from ``data -> 137170 -> list``. The first
     entry is skipped; the remaining ones are ranked by their index in the
     list (so ranks start at 1) and their titles have leading and trailing
     ``#`` characters removed.
     """
     json_text = re.search('{.*}', response.text).group()
     payload = json.loads(json_text)
     records = payload['data']['137170']['list']
     for position, record in enumerate(records):
         if not position:
             # Index 0 is never emitted.
             continue
         entry = JrcgItem()
         entry['rank'] = position
         entry['title'] = record['title'].strip('#')
         entry['link'] = record['link']
         entry['name'] = 'mogujie'
         yield entry
Exemplo n.º 6
0
 def parse(self, response):
     """Yield a ``JrcgItem`` for each row of the ``pl_top_realtimehot`` table.

     The first row is given rank 0; every other row takes its rank from the
     text of its first cell. Title, link and count come from the second
     cell and the state label from the third. Missing values fall back to
     ``0`` (rank, count), ``'-'`` (title, link) or ``''`` (state).
     """
     tr_list = response.xpath("//*[@id='pl_top_realtimehot']/table/tbody/tr")
     for index, tr in enumerate(tr_list):
         jrcg = JrcgItem()
         if index == 0:
             jrcg['rank'] = 0
         else:
             # NOTE: int() raises ValueError if the cell text is not numeric.
             jrcg['rank'] = int(
                 tr.xpath(".//td[position()=1]/text()").extract_first(default=0))
         jrcg['title'] = tr.xpath(
             ".//td[position()=2]/a/text()").extract_first(default='-')
         href = tr.xpath(".//td[position()=2]/a/@href").extract_first()
         # Prefix the host only when an href exists. Concatenating it with
         # the '-' placeholder would yield the bogus URL "https://s.weibo.com-".
         if href is not None:
             jrcg['link'] = "https://s.weibo.com" + href
         else:
             jrcg['link'] = '-'
         jrcg['count'] = int(
             tr.xpath(".//td[position()=2]/span/text()").extract_first(default=0))
         jrcg['state'] = tr.xpath(
             ".//td[position()=3]/i/text()").extract_first(default='')
         jrcg['name'] = 'weibo'
         yield jrcg
Exemplo n.º 7
0
 def parse(self, response):
     """Yield items for the hot-post list embedded in an inline script.

     The text of the ``<script>`` under ``div#list-hotposts/ol`` is read,
     the part between its first and last single quote on a line is taken,
     URL-unquoted, and parsed as markup. Each ``<li>`` of that markup
     becomes a ``JrcgItem`` ranked by its 1-based position.
     """
     script_text = response.xpath(
         './/div[@id="list-hotposts"]/ol/script/text()').extract_first()
     quoted_markup = re.search(r'\'(.*)\'', script_text.strip()).group(1)
     markup = urllib.parse.unquote(quoted_markup)
     nodes = Selector(text=markup).xpath('.//li')
     for position, node in enumerate(nodes, start=1):
         entry = JrcgItem()
         entry['rank'] = position
         entry['title'] = node.xpath('./a/text()').extract_first()
         entry['link'] = node.xpath('./a/@href').extract_first()
         entry['name'] = self.name
         yield entry
Exemplo n.º 8
0
 def parse(self, response):
     """Yield a ``JrcgItem`` per child ``<div>`` of the ``hotlist-main`` box.

     Entries are ranked from 1 in document order. The first two entries are
     read from their second direct ``<a>`` child, and their link is the
     response URL minus its last character followed by the href. All later
     entries are read from the anchor in their second child ``<div>`` and
     their href is used unchanged.
     """
     blocks = response.xpath('.//div[@class="hotlist-main"]/div')
     for position, block in enumerate(blocks, start=1):
         entry = JrcgItem()
         entry['rank'] = position
         if position > 2:
             entry['title'] = block.xpath('./div[2]/a/text()').extract_first()
             entry['link'] = block.xpath('./div[2]/a/@href').extract_first()
         else:
             # The top two entries use a different markup layout.
             entry['title'] = block.xpath('./a[2]/p/text()').extract_first()
             href = block.xpath('./a[2]/@href').extract_first()
             entry['link'] = response.url[:-1] + href
         entry['name'] = self.name
         yield entry
Exemplo n.º 9
0
 def parse(self, response):
     """Yield a ``JrcgItem`` for the rows of the ``list-table`` table.

     The response is first re-created with ``gb18030`` encoding. Rows at
     indexes 0, 2, 4 and 6 are skipped; for every other row the rank,
     title, link and count are taken verbatim (as text) from the cells
     with classes ``first``, ``keyword`` and ``last``.
     """
     decoded = response.replace(encoding='gb18030')
     rows = decoded.xpath('//table[@class="list-table"]//tr')
     # Presumably header/spacer rows that carry no entry -- TODO confirm.
     skipped_rows = {0, 2, 4, 6}
     for row_number, row in enumerate(rows):
         if row_number in skipped_rows:
             continue
         rank = row.xpath('.//td[@class="first"]/span/text()').extract_first()
         title = row.xpath('.//td[@class="keyword"]/a/text()').extract_first()
         href = row.xpath('.//td[@class="keyword"]/a/@href').extract_first()
         count = row.xpath('.//td[@class="last"]/span/text()').extract_first()

         entry = JrcgItem()
         entry['rank'] = rank
         entry['title'] = title
         entry['link'] = href
         entry['count'] = count
         entry['name'] = 'baidu_shishirebang'
         yield entry
Exemplo n.º 10
0
 def parse(self, response):
     """Yield a ``JrcgItem`` for every ``<li>`` of the ``rank-list`` list.

     Rank, title and count are stored as the raw text extracted from the
     page. The link is the anchor's ``href`` prefixed with ``https:``.
     Every item is tagged with this spider's ``name``.
     """
     for node in response.xpath('.//ul[@class="rank-list"]/li'):
         entry = JrcgItem()

         rank_text = node.xpath('./div[@class="num"]/text()').extract_first()
         entry['rank'] = rank_text

         title_text = node.xpath(
             './div[@class="content"]/div[@class="info"]/a/text()'
         ).extract_first()
         entry['title'] = title_text

         href = node.xpath(
             './div[@class="content"]/div[@class="info"]/a/@href'
         ).extract_first()
         entry['link'] = 'https:' + href

         count_text = node.xpath(
             './div[@class="content"]/div[@class="info"]/div[@class="detail"]/span/text()'
         ).extract_first()
         entry['count'] = count_text

         entry['name'] = self.name
         yield entry
Exemplo n.º 11
0
 def parse(self, response):
     """Yield a ``JrcgItem`` for every ``<li>`` of the ``pub-list`` list.

     For the first three entries the rank text sits inside an ``<a>`` under
     the ``s1`` span; for the rest it sits directly under that span. Title
     and link come from the anchor in the ``s2`` span and the count from
     the ``s3`` span. All values are stored as extracted text.
     """
     nodes = response.xpath("//ul[@class='pub-list']/li")
     for position, node in enumerate(nodes):
         # Pick the rank location that matches this entry's markup.
         if position < 3:
             rank_nodes = node.xpath(".//span[@class='s1']/a/i/text()")
         else:
             rank_nodes = node.xpath(".//span[@class='s1']/i/text()")

         entry = JrcgItem()
         entry['rank'] = rank_nodes.extract_first()
         entry['title'] = node.xpath(
             ".//span[@class='s2']/p/a/text()").extract_first()
         entry['link'] = node.xpath(
             ".//span[@class='s2']/p/a/@href").extract_first()
         entry['count'] = node.xpath(
             ".//span[@class='s3']/text()").extract_first()
         entry['name'] = 'sogou_shishiredian'
         yield entry