def parse(self, response):
    """Parse a listing page: yield image-URL items, then follow pagination.

    response: the listing-page response to scrape.
    Yields DoutuItem(image_urls=[...]) per list group, then a Request per
    next-page href found at li[15] of the pager.
    """
    for group in response.xpath("//ul[@class='list-group']"):
        img_urls = group.xpath(".//a/img/@data-original").extract()
        # Resolve relative / protocol-relative URLs against the page URL.
        urls = [response.urljoin(u) for u in img_urls]
        yield DoutuItem(image_urls=urls)

    # Next-page link lives at li[15] of the pager.
    next_links = response.xpath(
        '//*[@id="pic-detail"]/div/div[2]/div[3]/ul/li[15]/a/@href'
    ).extract()
    # BUG FIX: the original called response.urljoin() on the LIST returned by
    # .extract(), which raises at runtime; and `is not None` was always true
    # for a list. Test emptiness and join each href individually instead.
    if next_links:
        for href in next_links:
            yield scrapy.Request("http://www.doutula.com" + href,
                                 callback=self.parse)
    else:
        print("---" * 15)
def parse_link(self, response):
    """Yield one DoutuItem per image paragraph in the post body.

    The item title comes from the 'info' entry of the request meta.
    """
    page_title = response.meta.get('info')
    for paragraph in response.xpath("//div[@id='post_content']/p"):
        src = paragraph.xpath(".//img/@src").get()
        print(src)
        yield DoutuItem(pic_url=src, title=page_title)
def parse_img(self, response):
    """Yield one item per article block, carrying all its image URLs."""
    blocks = response.xpath(
        "//div[@class='pic-content']/div[@class='artile_des']")
    for block in blocks:
        entry = DoutuItem()
        entry["image_urls"] = block.xpath(".//img/@src").extract()
        print(entry)
        yield entry
def parse_item(self, response):
    """Extract image URLs and names from a detail page and return the item.

    response: the detail-page response to scrape.
    Returns the populated DoutuItem so the engine passes it to the pipelines.
    """
    i = DoutuItem()
    # Fields feeding the image pipeline. NOTE(review): the stock
    # ImagesPipeline keys on 'image_urls' (plural) — confirm the project
    # pipeline really reads 'image_url'.
    i['image_url'] = response.xpath(
        ".//div[@class='pic-content']//img/@src").extract()
    i['image_name'] = response.xpath(
        ".//div[@class='pic-title']//a/text()").extract()
    print(i)
    # BUG FIX: the original built the item but never returned it, so it was
    # silently dropped.
    return i
def parse_img(self, response):
    """Yield an item whose image_urls holds the first slide's image URL.

    NOTE(review): 'image_urls' is assigned a single string here, not a list —
    confirm the downstream pipeline expects that.
    """
    srcs = response.xpath(
        '//div[@class="swiper-slide"]//img/@src').extract()
    entry = DoutuItem()
    entry['image_urls'] = srcs[0]
    yield entry
def parse(self, response):
    """Yield one item (image URL + display name) per entry on the page."""
    anchors = '//*[@id="pic-detail"]/div/div[1]/div[1]/ul/li/div/div/a'
    for anchor in response.xpath(anchors):
        entry = DoutuItem()
        entry['img_url'] = anchor.xpath('./img/@data-original').extract_first()
        entry['name'] = anchor.xpath('./p/text()').extract_first()
        yield entry
def parse(self, response):
    """Yield image items from this page, then request the remaining pages.

    NOTE(review): range(2, MAX_PAGES) stops at MAX_PAGES - 1 — confirm that
    the last page is meant to be excluded.
    """
    for anchor in response.xpath(
            '//div[@class="page-content text-center"]//a'):
        entry = DoutuItem()
        entry['name'] = anchor.xpath("./img/@alt").extract_first()
        entry['image_urls'] = anchor.xpath(
            "./img/@data-original").extract_first()
        yield entry
    max_pages = self.settings['MAX_PAGES']
    page = 2
    while page < max_pages:
        yield scrapy.Request(
            "https://www.doutula.com/photo/list/?page=%d" % page,
            callback=self.parse)
        page += 1
def parse(self, response):
    """Parse a listing page: yield image-URL items, then follow pagination.

    response: the listing-page response to scrape.
    Yields DoutuItem(image_urls=[...]) per list group, then a Request per
    next-page href found at li[13] of the pager.
    """
    for group in response.xpath("//ul[@class='list-group']"):
        img_urls = group.xpath(".//a/img/@data-original").extract()
        # Resolve relative / protocol-relative URLs against the page URL.
        urls = [response.urljoin(u) for u in img_urls]
        yield DoutuItem(image_urls=urls)

    next_links = response.xpath(
        '//*[@id="pic-detail"]/div/div[2]/div[3]/ul/li[13]/a/@href'
    ).extract()
    print(next_links)
    # BUG FIX: .extract() always returns a list, so `is not None` was always
    # true; test for emptiness instead. Also dropped a dead DoutuItem()
    # instantiation that was immediately overwritten.
    if next_links:
        for href in next_links:
            yield scrapy.Request("http://www.doutula.com" + href,
                                 callback=self.parse)
def parse(self, response):
    """Yield one item (image URL + display name) per thumbnail anchor.

    response: the listing-page response; thumbnails are the
    <a class="col-xs-6 col-sm-3"> links.
    """
    # Removed: an unused `items_list` accumulator and a commented-out
    # requests-based download slab (dead code with a malformed headers
    # literal) — downloading belongs in a pipeline, not the spider.
    for content in response.xpath('//a[@class="col-xs-6 col-sm-3"]'):
        item = DoutuItem()
        item['img_url'] = content.xpath('./img/@data-original').extract_first()
        item['name'] = content.xpath('./p/text()').extract_first()
        yield item
def parse_item(self, response):
    """Collect every data-original URL on the detail page into file_urls."""
    self.logger.info('hi,this is an item page! %s', response.url)
    entry = DoutuItem()
    entry['file_urls'] = response.xpath(
        '//*[@id="pic-detail"]/div/div/div//@data-original').extract()
    return entry