def parse(self, response):
    """Collect joke texts and their detail-page links from the listing page.

    Yields a single FirstItem whose fields hold every match found on
    the page (parallel lists of texts and hrefs).
    """
    joke_item = FirstItem()
    # Each joke's text lives in a <span> under div.content.
    texts = response.xpath('//div[@class="content"]/span/text()')
    # Each joke's detail page is linked via a.contentHerf.
    links = response.xpath('//a[@class="contentHerf"]/@href')
    joke_item["content"] = texts.extract()
    joke_item["link"] = links.extract()
    yield joke_item
def parse(self, response):
    """Scrape joke bodies and detail links into one item.

    The item carries two parallel lists: all joke texts on the page and
    the matching detail-page hrefs.
    """
    result = FirstItem()
    result["content"] = (
        response.xpath("//div[@class='content']/span/text()").extract()
    )
    result["link"] = (
        response.xpath("//a[@class='contentHerf']/@href").extract()
    )
    yield result
def parse_list(self, response):
    """Extract the inlined product JSON from the listing page and yield items.

    The page embeds its product data as a JSON blob inside the
    ``:products`` attribute of a <product-frontend-list> tag; slice it out
    of the raw body, clean it with ``validateStr``, and emit one FirstItem
    per product entry.
    """
    json_str = ''
    try:
        # The JSON payload sits between these two fixed markers in the HTML.
        # NOTE(review): under Python 3 / Scrapy 2, response.body is bytes and
        # would need decoding before split — confirm target runtime.
        json_str = response.body.split(
            '<product-frontend-list :products="')[1].split(
            '" :per-page="36"></product-frontend-list>')[0]
    except Exception as err:
        # Markers missing (layout change, empty page): log and fall through.
        print(err)  # fixed: was a Python 2 print statement
    if json_str != '':
        json_str = self.validateStr(json_str)
        element_list = json.loads(json_str)
        for element in element_list['data']:
            item = FirstItem()
            item['sku'] = element['sku']
            # Category is inferred from the SKU text; stays '' if no match.
            item['category'] = ''
            if 'Fabric' in element['sku']:
                item['category'] = 'Fabric'
            if 'Wallpaper' in element['sku']:
                item['category'] = 'Wallpaper'
            item['name'] = element['name']
            item['price'] = element['our_price']['decimal']
            yield item
def parse(self, response):
    """Spider body: pull recommended-content titles and hrefs into one item."""
    entry = FirstItem()
    # Titles of the recommended entries.
    entry["content"] = (
        response.xpath("//a[@class='recmd-content']/text()").extract()
    )
    # Hrefs of the same anchors, in matching order.
    entry["link"] = (
        response.xpath("//a[@class='recmd-content']/@href").extract()
    )
    yield entry
def parse(self, response):
    """Parse one page of the judge-status table, then walk to the next page.

    Every even/odd table row becomes one submission record; after the
    rows are emitted, the next page is requested until the offset cap.
    """
    # (field name, relative xpath) in the exact column order of the table.
    columns = (
        ('runid', "./td[1]/text()"),
        ('username', "./td[2]/a/text()"),
        ('problem_id', "./td[3]/div/a/text()"),
        ('result', "./td[4]/a/text()"),
        ('memory', "./td[5]/div/text()"),
        ('timer', "./td[6]/div/text()"),
        ('lang', "./td[7]/text()"),
        ('codel', "./td[8]/text()"),
        ('sub_time', "./td[9]/text()"),
    )
    for row in response.xpath(
            "//tr[@class='evenrow'] | //tr[@class='oddrow']"):
        record = FirstItem()
        for field, path in columns:
            record[field] = row.xpath(path).extract()[0]
        yield record
    if self.offset < 1809510:
        self.offset += 99
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Return a single hard-coded item wrapped in a list (smoke-test spider)."""
    fixed = FirstItem()
    fixed["name"] = "qzy"
    fixed["money"] = "12345"
    return [fixed]
def parse(self, response):
    """Build an item from the JSON response body.

    When the payload reports zero pages the item is returned with no
    fields set (matching the original behaviour); otherwise it carries
    the 6-character code taken from the tail of the URL plus the
    payload's ``data`` section.
    """
    item = FirstItem()
    payload = json.loads(response.text)  # parse once instead of twice
    if payload['pages'] != 0:
        # The last 6 characters of the URL identify this dataset.
        item['dm'] = response.url[-6:]
        item['data'] = payload['data']
    return item
def parse(self, response):
    """Grab the page <title> text into an item.

    Standard extraction pattern:
        content = response.xpath("...").extract()
    """
    title_item = FirstItem()
    title_item['content'] = (
        response.xpath("/html/head/title/text()").extract()
    )
    yield title_item
def parse(self, response):
    """Emit one item per job row on the Tencent HR listing page."""
    print("##############")
    for row in response.xpath("//tr[@class='even']|//tr[@class='odd']"):
        job = FirstItem()
        job['positionName'] = row.xpath('./td[1]/a/text()').extract()[0]
        # Hrefs in the table are site-relative; prefix the host.
        href = row.xpath('./td[1]/a/@href').extract()[0]
        job['positionLink'] = "https://hr.tencent.com/" + href
        job['positionType'] = row.xpath('./td[2]/text()').extract()[0]
        # Handed on to the pipelines' process_item.
        yield job
def parse(self, response):
    """Scrape Tieba thread listings: title, author, and reply count per post."""
    for post in response.xpath('//li[@class=" j_thread_list clearfix"]'):
        thread = FirstItem()
        # Thread title anchor text.
        thread['title'] = post.xpath(
            './/div[contains(@class,"threadlist_title pull_left j_th_tit ")]/a/text()'
        ).extract()
        # Original poster's display name.
        thread['author'] = post.xpath(
            './/div[contains(@class,"threadlist_author pull_right")]//span[contains(@class,"frs-author-name-wrap")]/a/text()'
        ).extract()
        # Reply counter shown in the left column.
        thread['reply'] = post.xpath(
            './/div[contains(@class,"col2_left j_threadlist_li_left")]/span/text()'
        ).extract()
        yield thread
def parse(self, response):
    """Yield one item per Zhihu answer, numbered from 1 in page order.

    Uses enumerate instead of the original hand-rolled counter; the
    numbering and field contents are unchanged.
    """
    for index, node in enumerate(
            response.xpath('//li[@class="List-item"]'), start=1):
        item = FirstItem()
        item['answer'] = index  # 1-based position of the answer on the page
        item['title'] = node.xpath(
            './/div[contains(@class,"ContentItem AnswerItem")]//h2/text()'
        ).extract()
        item['word'] = node.xpath(
            './/div[contains(@class,"ContentItem AnswerItem")]//p/text()'
        ).extract()
        yield item
def parse(self, response):
    """Yield an (author, content) item for every article on the page.

    The content spans are joined into one string; the author field keeps
    the raw extracted list.
    """
    articles = response.xpath(
        "//div[@id='content']//div[contains(@class, 'article')]")
    for article in articles:
        post = FirstItem()
        post['author'] = article.xpath(
            "./div[contains(@class, 'author')]//h2/text()").extract()
        # Flatten the text fragments of the body into a single string.
        fragments = article.xpath(
            "./a[contains(@class, 'contentHerf')]/div/span//text()").extract()
        post['content'] = ''.join(fragments)
        yield post
def parse(self, response):
    """Emit job rows from the Tencent HR listing, then page forward.

    After yielding every row on the current page, schedules the next
    page (offset stepped by 10) until the offset limit is reached.
    """
    for row in response.xpath("//tr[@class='even']|//tr[@class='odd']"):
        job = FirstItem()
        job['positionName'] = row.xpath('./td[1]/a/text()').extract()[0]
        # The href is site-relative; prefix the host.
        job['positionLink'] = ("https://hr.tencent.com/"
                               + row.xpath('./td[1]/a/@href').extract()[0])
        job['positionType'] = row.xpath('./td[2]/text()').extract()[0]
        # Delivered to the pipelines' process_item.
        yield job
    # Guard clause: stop paging once the offset limit is hit.
    if self.offset >= 540:
        return
    self.offset += 10
    yield scrapy.Request(self.url + str(self.offset) + "#a",
                         callback=self.parse)
def parse(self, response):
    """Scrape proxy entries from the xici #ip_list table.

    Returns a list of FirstItem records, one per data row (the header
    row is skipped).
    """
    ip_table = response.xpath('//table[@id="ip_list"]')
    rows = ip_table[0].xpath('tr')
    items = []
    for row in rows[1:]:  # rows[0] is the table header
        pre_item = FirstItem()
        pre_item['IP'] = row.xpath('td[2]/text()')[0].extract()
        pre_item['PORT'] = row.xpath('td[3]/text()')[0].extract()
        # string(td[4]) flattens nested markup into plain text.
        pre_item['POSITION'] = row.xpath('string(td[4])')[0].extract().strip()
        pre_item['TYPE'] = row.xpath('td[6]/text()').extract()
        # Pull the numeric part out of the bar's title attribute;
        # presumably a decimal like "0.123" — confirm against live markup.
        pre_item['SPEED'] = row.xpath('td[7]/div[@class="bar"]/@title').re(
            r'\d{0,2}\.\d{0,}')[0]  # raw string avoids invalid-escape warning
        pre_item['LAST_CHECK_TIME'] = row.xpath('td[9]/text()')[0].extract()
        print(pre_item)  # fixed: was a Python 2 print statement
        items.append(pre_item)
    return items
def parse(self, response):
    """Collect CSDN article titles and links into a single item."""
    post = FirstItem()
    # All headline anchors share the tracking class on their <h2>.
    headline = '//h2[@class="csdn-tracking-statistics"]/a'
    post['title'] = response.xpath(headline + '/text()').extract()
    post['link'] = response.xpath(headline + '/@href').extract()
    yield post
def parse(self, response):
    """Extract the page title text into a single item."""
    page = FirstItem()
    page['content'] = response.xpath('/html/head/title/text()').extract()
    yield page
def parse(self, response):
    """Pull the document title into an item.

    The XPath string carries a leading space; XPath tolerates leading
    whitespace, so it is kept verbatim.
    """
    title = FirstItem()
    title["content"] = response.xpath(" /html/head/title/text()").extract()
    yield title
def parse(self, response):
    """Extract and echo the page title, then yield it as an item."""
    page = FirstItem()
    page['title'] = response.xpath('/html/head/title/text()').extract()
    # Echo for quick visual confirmation while crawling.
    print(page['title'])
    yield page