def parse(self, response):
    """Yield one item per ``<li>`` under any ``<ul>`` on the page.

    Bug fix: the original used ``return item`` inside the loop, which
    ended the callback after the first ``<li>`` and silently dropped
    every remaining entry; a Scrapy callback must ``yield`` each item.
    """
    for sel in response.xpath('//ul/li'):
        item = HelloItem()
        item['title'] = sel.xpath('a/text()').extract()
        item['link'] = sel.xpath('a/@href').extract()
        item['desc'] = sel.xpath('text()').extract()
        yield item
def parse(self, response):
    """Scrape every quote block on the page via the project item loader."""
    for quote_sel in response.css('div.quote'):
        loader = MainLoader(item=HelloItem(), selector=quote_sel,
                            response=response)
        loader.add_css('text', 'span.text::text')
        loader.add_css('author', '.quote small.author::text')
        loader.add_css('tags', 'div.tags a.tag::text')
        yield loader.load_item()
def parse(self, response):
    """Follow every ``<h3>`` link on the page, then scrape movie metadata.

    Fixes over the original:
      * the nested ``for h3: for a:`` loops re-selected ALL hrefs once per
        ``<h3>`` element, yielding each request len(h3) times (accidental
        O(n^2) duplicates); a single loop yields each href exactly once,
      * the local ``type`` no longer shadows the builtin,
      * debug ``print(response.body)`` and the commented-out field stubs
        are removed.
    """
    sel = scrapy.Selector(response)

    # Follow each detail-page link; self.url is the spider's base URL.
    for href in sel.xpath('//h3/a/@href').extract():
        yield scrapy.Request(self.url + href)

    title = sel.xpath('//h1/text()').extract()
    type1 = sel.xpath('//span[@class="span_block"]/a/text()').extract()
    actor = sel.xpath(
        '//*[@id="minfo"]/div[2]/span[4]/span/a/text()').extract()
    desc = sel.xpath(
        '//*[@id="movie_content"]/text()[1]').extract()
    link = sel.xpath(
        '//*[@id="myform"]/ul/li[2]/span[2]/a/@href').extract()
    yield HelloItem(title=title, type1=type1, actor=actor,
                    desc=desc, link=link)
def parse_item(self, response):
    """Yield one item per post-summary block on the listing page."""
    for post in response.css('div.post-desc'):
        anchor = post.css('div.post-title h2.entry-title a')
        item = HelloItem()
        item['title'] = anchor.xpath('text()').extract()
        item['text'] = post.css('div.post-excerpt::text').extract()
        item['link'] = anchor.xpath('@href').extract()
        yield item
def parse(self, response):
    """Yield one item per anchor tag found anywhere on the page."""
    for anchor in response.xpath('//a'):
        item = HelloItem()
        item['title'] = anchor.xpath('text()').extract()
        item['link'] = anchor.xpath('@href').extract()
        yield item
def parse_item(self, response):
    """Scrape app metadata from the response and comments via a browser.

    The static fields come straight from the downloaded response; the
    comment count and comment list are rendered by JavaScript, so a
    headless browser re-fetches the URL and clicks "show more" until the
    button disappears.

    Fixes over the original:
      * ``print "error"`` (Python 2 syntax, a SyntaxError in Python 3,
        which the rest of the file targets) -> ``print("error")``,
      * bare ``except:`` narrowed to ``except Exception``,
      * ``extract()[0]`` -> ``extract_first()`` so a missing node yields
        ``None`` instead of raising IndexError.
    """
    self.browser = webdriver.PhantomJS()
    self.browser.implicitly_wait(10)
    self.browser.get(response.url)

    item = HelloItem()
    item['app_name'] = response.xpath(
        '//div[@id="J_DetDataContainer"]/div/div[1]/div[2]/div[1]/div[1]/text()'
    ).extract_first()
    item['score'] = response.xpath(
        '//*[@id="J_DetDataContainer"]/div/div[1]/div[2]/div[2]/div[2]/text()'
    ).extract_first()
    # string(.) flattens the description node's text content.
    item['detail'] = response.xpath(
        '//*[@id="J_DetAppDataInfo"]/div[1]').xpath('string(.)').extract_first()
    item['down_num'] = response.xpath(
        '//*[@id="J_DetDataContainer"]/div/div[1]/div[2]/div[3]/div[1]/text()'
    ).extract_first()
    item['category'] = response.xpath(
        '//*[@id="J_DetCate"]/text()').extract_first()

    item['com_num'] = self.browser.find_element_by_xpath(
        '//*[@id="J_CommentCount"]').text

    # Expand up to 100 pages of comments; the click raises once the
    # "show more" button disappears, which ends the loop.
    try:
        for _ in range(100):
            self.browser.find_element_by_id(
                "J_DetCommentShowMoreBtn").click()
    except Exception:  # NoSuchElementException once all comments are shown
        print("error")

    item['com_name'] = self.browser.find_element_by_id(
        "J_DetShowCommentList").text
    yield item

def __del__(self):
    """Close the headless browser when the spider object is collected."""
    self.browser.close()
def parse(self, response):
    """Scrape every quote on the page, then follow the pagination link.

    Fixes over the original:
      * the local ``next`` no longer shadows the builtin,
      * the next-page request is yielded only when a next link actually
        exists -- the original called urljoin/Request unconditionally,
        which fails on the last page where the selector returns None,
      * debug ``print(url)`` removed.
    """
    for quote in response.css('.quote'):
        item = HelloItem()
        # extract_first() -> the first match; extract() -> all matches.
        item['text'] = quote.css('.text::text').extract_first()
        item['author'] = quote.css('.author::text').extract_first()
        item['tags'] = quote.css('.tags .tag::text').extract()
        yield item

    next_href = response.css('.pager .next a::attr(href)').extract_first()
    if next_href is not None:
        # urljoin resolves the relative href against the page URL.
        yield scrapy.Request(url=response.urljoin(next_href),
                             callback=self.parse)
def parse(self, response):
    """Load a single item from the page titles via ItemLoader.

    NOTE(review): both ``add_css`` calls target the same ``'text'``
    field, so the anchor hrefs are appended to the same field as the
    link text.  The second call presumably should target a separate
    link/url field of HelloItem -- confirm against the item definition
    before changing it.
    """
    l = ItemLoader(item=HelloItem(), response=response)
    l.add_css('text', 'h2.entry-title a::text')
    l.add_css('text', 'h2.entry-title a::attr(href)')
    return l.load_item()