def parse(self, response):
    """Yield one ``JrcgItem`` per row of the ``rank_data`` table.

    The row's 1-based position is stored as ``rank``; ``title`` and
    ``link`` come from the anchor inside the row's first ``<td>``. A field
    whose node is absent is stored as ``None``. ``name`` is the spider's
    own ``self.name``.
    """
    rows = response.xpath('.//table[@id="rank_data"]/tbody/tr')
    for position, row in enumerate(rows, start=1):
        title = row.xpath('.//td[1]/span/a/text()').extract_first()
        link = row.xpath('.//td[1]/span/a/@href').extract_first()

        item = JrcgItem()
        item['rank'] = position
        item['title'] = title
        item['link'] = link
        item['name'] = self.name
        yield item
def parse(self, response):
    """Yield one ``JrcgItem`` per element of the JSON array in the body.

    The response text is decoded as JSON and iterated in order; the
    1-based position becomes ``rank``. Every element must provide the
    ``title``, ``url`` and ``replies`` keys (a missing key raises
    ``KeyError``).
    """
    # (item field, key in the decoded JSON element), in assignment order.
    copied_fields = (('title', 'title'), ('link', 'url'), ('count', 'replies'))

    topics = json.loads(response.text)
    for position, topic in enumerate(topics, start=1):
        entry = JrcgItem()
        entry['rank'] = position
        for field, source_key in copied_fields:
            entry[field] = topic[source_key]
        entry['name'] = 'v2ex_zuirezhuti'
        yield entry
def parse(self, response):
    """Yield one ``JrcgItem`` per ``<li>`` of the league-content list.

    ``rank`` is parsed from the row's ``<span>`` text. When that text is
    missing or is not an integer, the row's 1-based position in the list
    is used instead. ``title`` is the whitespace-stripped anchor text and
    ``link`` is the anchor href appended to the page URL.

    Rows that have no anchor text or no href are skipped, so a single
    malformed row no longer aborts the rest of the page with a
    ``TypeError``/``AttributeError``.
    """
    rows = response.xpath(
        '//div[@class="index_content_right_league_content"]/ul/li')

    # Hrefs are appended to the page URL without its trailing slash. Only
    # drop the last character when it really is a slash; blindly slicing
    # would eat a real character of the URL otherwise.
    page_url = response.url
    base_url = page_url[:-1] if page_url.endswith('/') else page_url

    for position, row in enumerate(rows, start=1):
        title = row.xpath('.//a/text()').extract_first()
        href = row.xpath('.//a/@href').extract_first()
        if title is None or href is None:
            continue

        rank_text = row.xpath('.//span/text()').extract_first()
        try:
            rank = int(rank_text)
        except (TypeError, ValueError):
            # No usable rank label: fall back to the row's position.
            rank = position

        item = JrcgItem()
        item['rank'] = rank
        item['title'] = title.strip()
        item['link'] = base_url + href
        item['name'] = 'meizhuangtoutiao'
        yield item
def parse(self, response):
    """Yield one ``JrcgItem`` per topic in the JSON response body.

    Topics are read from ``data -> bang_topic -> topic_list`` of the
    decoded JSON. The 1-based position becomes ``rank``; ``topic_name``,
    ``topic_url`` and ``discuss_num`` are copied to ``title``, ``link``
    and ``count``. Missing keys raise ``KeyError``.
    """
    payload = json.loads(response.text)
    topics = payload['data']['bang_topic']['topic_list']

    for position, topic in enumerate(topics, start=1):
        title = topic['topic_name']
        link = topic['topic_url']
        discussions = topic['discuss_num']

        entry = JrcgItem()
        entry['rank'] = position
        entry['title'] = title
        entry['link'] = link
        entry['count'] = discussions
        entry['name'] = 'baidu_tiebareyibang'
        yield entry
def parse(self, response):
    """Yield one ``JrcgItem`` per entry of a JSON object embedded in the body.

    The body is searched for the outermost ``{...}`` span (so a wrapper
    such as a JSONP callback around the object is tolerated), which is
    decoded as JSON. Entries are read from ``data -> 137170 -> list``.
    The first entry is never emitted; every later entry uses its index in
    the list as ``rank`` and its ``title`` with leading/trailing ``#``
    characters removed.

    Raises:
        ValueError: if the body contains no ``{...}`` span, or the span is
            not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
        KeyError: if the decoded object lacks an expected key.
    """
    # DOTALL lets the object span several lines; without it a pretty-printed
    # payload is cut at the first newline and fails to decode. The braces
    # are escaped so they are unambiguously literal.
    match = re.search(r'\{.*\}', response.text, re.DOTALL)
    if match is None:
        raise ValueError('no JSON object found in response body')

    payload = json.loads(match.group())
    entries = payload['data']['137170']['list']
    for rank, entry in enumerate(entries):
        if rank == 0:
            # NOTE(review): the first entry is skipped, presumably a
            # header/banner record rather than a ranked item; confirm.
            continue
        item = JrcgItem()
        item['rank'] = rank
        item['title'] = entry['title'].strip('#')
        item['link'] = entry['link']
        item['name'] = 'mogujie'
        yield item
def parse(self, response):
    """Yield one ``JrcgItem`` per row of the ``pl_top_realtimehot`` table.

    The first row always gets ``rank`` 0; for every other row ``rank`` is
    the integer text of the first cell. ``title``, ``link`` and ``count``
    come from the second cell (anchor text, anchor href, ``<span>`` text)
    and ``state`` from the ``<i>`` in the third cell.

    ``rank`` and ``count`` fall back to 0 when the cell is missing or does
    not contain an integer, so one non-numeric cell no longer raises
    ``ValueError`` and aborts all remaining rows.
    """

    def to_int(text):
        """Return ``int(text)``, or 0 when *text* is missing/non-numeric."""
        try:
            return int(text)
        except (TypeError, ValueError):
            return 0

    rows = response.xpath("//*[@id='pl_top_realtimehot']/table/tbody/tr")
    for index, row in enumerate(rows):
        item = JrcgItem()
        if index == 0:
            item['rank'] = 0
        else:
            item['rank'] = to_int(
                row.xpath(".//td[position()=1]/text()").extract_first())
        item['title'] = row.xpath(
            ".//td[position()=2]/a/text()").extract_first(default='-')
        # NOTE(review): a row without an href still produces
        # "https://s.weibo.com-"; kept as-is so stored links do not change.
        item['link'] = "https://s.weibo.com" + row.xpath(
            ".//td[position()=2]/a/@href").extract_first(default='-')
        item['count'] = to_int(
            row.xpath(".//td[position()=2]/span/text()").extract_first())
        item['state'] = row.xpath(
            ".//td[position()=3]/i/text()").extract_first(default='')
        item['name'] = 'weibo'
        yield item
def parse(self, response):
    """Yield one ``JrcgItem`` per ``<li>`` found in an inline script string.

    The text of the ``<script>`` under ``#list-hotposts > ol`` is read, the
    part between its first and last single quote is taken, percent-decoded
    and parsed as markup. Each ``<li>`` of that markup yields an item with
    its 1-based position as ``rank`` and the text/href of its direct
    ``<a>`` child as ``title``/``link``.

    Raises ``AttributeError`` when the script node or the quoted string is
    not present.
    """
    script_text = response.xpath(
        './/div[@id="list-hotposts"]/ol/script/text()').extract_first()
    quoted = re.search(r'\'(.*)\'', script_text.strip())
    markup = urllib.parse.unquote(quoted.group(1))

    entries = Selector(text=markup).xpath('.//li')
    for position, entry in enumerate(entries, start=1):
        title = entry.xpath('./a/text()').extract_first()
        link = entry.xpath('./a/@href').extract_first()

        item = JrcgItem()
        item['rank'] = position
        item['title'] = title
        item['link'] = link
        item['name'] = self.name
        yield item
def parse(self, response):
    """Yield one ``JrcgItem`` per child ``<div>`` of ``hotlist-main``.

    ``rank`` is the 1-based position. The first two entries use a
    different layout: title and href come from the second ``<a>`` child,
    and the href is appended to the page URL. All later entries take the
    anchor inside the second child ``<div>`` and use its href unchanged.

    A missing title or href is stored as ``None`` in both layouts; the
    first-two layout used to raise ``TypeError`` on a missing href, which
    aborted every remaining entry.
    """
    blocks = response.xpath('.//div[@class="hotlist-main"]/div')

    # Hrefs of the first two entries are appended to the page URL without
    # its trailing slash. Only drop the last character when it really is
    # a slash.
    page_url = response.url
    base_url = page_url[:-1] if page_url.endswith('/') else page_url

    for rank, block in enumerate(blocks, start=1):
        item = JrcgItem()
        item['rank'] = rank
        if rank in (1, 2):
            item['title'] = block.xpath('./a[2]/p/text()').extract_first()
            href = block.xpath('./a[2]/@href').extract_first()
            item['link'] = None if href is None else base_url + href
        else:
            item['title'] = block.xpath('./div[2]/a/text()').extract_first()
            item['link'] = block.xpath('./div[2]/a/@href').extract_first()
        item['name'] = self.name
        yield item
def parse(self, response):
    """Yield one ``JrcgItem`` per data row of the ``list-table`` table.

    The response is re-decoded as gb18030 before querying. Rows at
    positions 0, 2, 4 and 6 are not emitted. For every other row the
    ``rank``, ``title``, ``link`` and ``count`` fields are the raw strings
    (or ``None``) found in the ``first``, ``keyword`` and ``last`` cells.
    """
    decoded = response.replace(encoding='gb18030')
    rows = decoded.xpath('//table[@class="list-table"]//tr')

    # NOTE(review): presumably the header row and expanded-detail rows
    # that carry no ranking data; confirm against the page markup.
    skipped_positions = (0, 2, 4, 6)

    for position, row in enumerate(rows):
        if position in skipped_positions:
            continue

        keyword_cell = './/td[@class="keyword"]/a/'
        rank = row.xpath('.//td[@class="first"]/span/text()').extract_first()
        title = row.xpath(keyword_cell + 'text()').extract_first()
        link = row.xpath(keyword_cell + '@href').extract_first()
        count = row.xpath('.//td[@class="last"]/span/text()').extract_first()

        item = JrcgItem()
        item['rank'] = rank
        item['title'] = title
        item['link'] = link
        item['count'] = count
        item['name'] = 'baidu_shishirebang'
        yield item
def parse(self, response):
    """Yield one ``JrcgItem`` per ``<li>`` of the ``rank-list`` list.

    ``rank`` is the text of the ``num`` div; ``title``, ``link`` and
    ``count`` are read from the ``info`` div inside the ``content`` div
    (anchor text, anchor href, and the ``detail`` span text). All values
    are stored as raw strings, or ``None`` when the node is absent.

    The href is prefixed with ``https:``. A row without an href now gets
    ``link = None`` instead of raising ``TypeError``, which used to abort
    every remaining row.
    """
    for row in response.xpath('.//ul[@class="rank-list"]/li'):
        href = row.xpath(
            './div[@class="content"]/div[@class="info"]/a/@href'
        ).extract_first()

        item = JrcgItem()
        item['rank'] = row.xpath(
            './div[@class="num"]/text()').extract_first()
        item['title'] = row.xpath(
            './div[@class="content"]/div[@class="info"]/a/text()'
        ).extract_first()
        item['link'] = None if href is None else 'https:' + href
        item['count'] = row.xpath(
            './div[@class="content"]/div[@class="info"]/div[@class="detail"]/span/text()'
        ).extract_first()
        item['name'] = self.name
        yield item
def parse(self, response):
    """Yield one ``JrcgItem`` per ``<li>`` of the ``pub-list`` list.

    The rank label sits in the ``s1`` span: for the first three rows it
    is read from an ``<i>`` nested inside an ``<a>``, for all later rows
    from an ``<i>`` directly under the span. ``title``/``link`` come from
    the anchor in the ``s2`` span and ``count`` from the ``s3`` span text.
    All values are raw strings, or ``None`` when the node is absent.
    """
    entries = response.xpath("//ul[@class='pub-list']/li")
    for position, entry in enumerate(entries):
        # The first three rows use a different markup for the rank label.
        if position < 3:
            rank_query = ".//span[@class='s1']/a/i/text()"
        else:
            rank_query = ".//span[@class='s1']/i/text()"

        item = JrcgItem()
        item['rank'] = entry.xpath(rank_query).extract_first()
        item['title'] = entry.xpath(
            ".//span[@class='s2']/p/a/text()").extract_first()
        item['link'] = entry.xpath(
            ".//span[@class='s2']/p/a/@href").extract_first()
        item['count'] = entry.xpath(
            ".//span[@class='s3']/text()").extract_first()
        item['name'] = 'sogou_shishiredian'
        yield item