def parse(self, response):
    """Decode the obfuscated image hashes on a jandan comment page, download
    each image to disk, and follow the previous-comment-page link (max 20 pages).
    """
    item = JiandanItem()
    hash_list = response.css('.img-hash::text').extract()
    print(len(hash_list))
    for index, img_hash in enumerate(hash_list):
        # util.parse turns the obfuscated hash + site key into a
        # protocol-relative URL; prepend the scheme.
        url = 'http:' + util.parse(img_hash, 'AGs7jEYU8SYmahnebE6Mvg6RCZsFysC9')
        # Sina image host serves size-variant paths; swap the size segment
        # for 'large' to fetch the full-resolution GIF.
        replace = re.match(r'(.*\.sinaimg\.cn\/)(\w+)(\/.+\.gif)', url)
        if replace:
            url = replace.group(1) + 'large' + replace.group(3)
        e = re.match(r'.*(\.\w+)', url)
        # FIX: guard against a URL with no extension instead of crashing
        # with AttributeError on e.group(1) when the regex does not match.
        if e is None:
            continue
        extension_name = e.group(1)
        # NOTE(review): hard-coded Windows download directory — consider
        # making this a spider setting.
        file_path = os.path.join(
            "F:\\jiandan",
            str(self.size) + '-' + str(index) + extension_name)
        urllib.request.urlretrieve(url, file_path)
        print(url)
    yield item
    new_url = response.xpath(
        '//a[@class="previous-comment-page"]//@href').extract_first()  # next page
    if new_url:
        self.size += 1
        if self.size < 20:
            # The href is protocol-relative, so prepend the scheme.
            yield scrapy.Request('http:' + new_url, callback=self.parse)
def parse(self, response):
    """Yield one item holding every <img> URL on the page, then follow
    the "Older Comments" link using the spider's request headers.
    """
    item = JiandanItem()
    item['image_urls'] = response.xpath('//img//@src').extract()
    yield item
    older = response.xpath('//a[@title="Older Comments"]/@href').extract_first()
    if older:
        yield Request(older, headers=self.headers)
def parse(self, response):
    """Yield every <img> URL on the page, then follow the previous-page link."""
    item = JiandanItem()
    item['image_urls'] = response.xpath('//img//@src').extract()  # image links
    print('image_urls', item['image_urls'])
    yield item
    new_url = response.xpath(
        '//a[@class="previous-comment-page"]//@href').extract_first()  # next page
    print('new_url', new_url)
    if new_url:
        # FIX: the pager href may be relative or protocol-relative;
        # urljoin resolves it against the current page so Scrapy always
        # receives an absolute URL (absolute hrefs pass through unchanged).
        yield scrapy.Request(response.urljoin(new_url), callback=self.parse)
def parse(self, response):
    """Collect lazily-loaded image URLs, then follow the numbered pager."""
    item = JiandanItem()
    # Images are lazy-loaded: the real URL lives in data-lazy-src, not src.
    item['image_urls'] = response.xpath('//img//@data-lazy-src').extract()
    yield item
    next_page = response.xpath(
        "//a[@class='page-numbers']//@href").extract_first()
    if next_page:
        yield scrapy.Request(next_page, callback=self.parse)
def parse(self, response):
    """Yield the full-size image links on the page, then follow "Older Comments"."""
    # response.xpath() is equivalent to building a Selector(response) by hand.
    item = JiandanItem()
    item['image_urls'] = response.xpath(
        '//a[@class="view_img_link"]/@href').extract()
    yield item
    older = response.xpath('//a[@title="Older Comments"]/@href').extract()
    if older:
        yield scrapy.Request(older[0], callback=self.parse)
def parse(self, response):
    """Yield every <img> URL on the page, then follow the #page-next link."""
    item = JiandanItem()
    item['image_urls'] = response.xpath('//img//@src').extract()  # image links
    # FIX: use the print() function — the rest of this file is Python 3
    # and the Python 2 print statement is a syntax error there.
    print('image_urls', item['image_urls'])
    yield item
    new_url = response.xpath('//*[@id="page-next"]//@href').extract_first()
    print('new_url', new_url)
    if new_url:
        yield scrapy.Request(new_url, callback=self.parse)
def parse(self, response):
    """Yield image links from the current page, then enqueue listing pages 2..90."""
    item = JiandanItem()
    item['image_urls'] = response.xpath('//img//@src').extract()  # image links
    yield item
    # FIX: the original `while counter <= n` loop never incremented
    # `counter`, so it yielded the same URL forever (infinite loop).
    # Iterate the page numbers directly instead.
    for page in range(2, 91):
        new_url = "http://www.meizitu.com" + "/a/more_" + str(page) + ".html"
        yield scrapy.Request(new_url, callback=self.parse)
def parse(self, response):
    """Yield image URLs found inside paragraphs, then recurse onto the
    previous comment page.
    """
    item = JiandanItem()
    item['image_urls'] = response.xpath('//p//img//@src').extract()
    yield item
    prev_page = response.xpath(
        '//a[@class="previous-comment-page"]//@href').extract_first()
    if prev_page:
        # Resolve relative / protocol-relative hrefs against the current URL.
        yield scrapy.Request(response.urljoin(prev_page), callback=self.parse)
def parse(self, response):
    """Yield every <img> URL on the page, then follow the previous-page link."""
    item = JiandanItem()
    item['image_urls'] = response.xpath('//img//@src').extract()  # image links
    yield item
    prev_href = response.xpath(
        '//a[@class="previous-comment-page"]//@href').extract_first()
    print(prev_href)
    if prev_href:
        yield scrapy.Request(prev_href, callback=self.parse)
def parse_detail(self, response):
    """Yield one picture item for this detail page, then follow the
    next-picture link, propagating the album title through request meta.
    """
    title = response.meta.get('title')
    item = JiandanItem()
    item['title'] = title
    item['pic'] = response.xpath(
        "//div[@class='articleV4Body']/p/a[1]/img/@src").extract()
    yield item
    next_pic = response.xpath(
        "//div[@class='page-tag oh']//li[@id='nl']/a/@href").extract_first()
    if next_pic:
        yield scrapy.Request(parse.urljoin(response.url, next_pic),
                             meta={"title": title},
                             callback=self.parse_detail)
def parse(self, response):
    """Yield every <img> URL, then follow the previous comment page while
    carrying the page just left as the Referer header.
    """
    item = JiandanItem()
    item['image_urls'] = response.xpath('//img//@src').extract()  # image links
    yield item
    # Remember the URL of the page we are leaving and send it as the
    # Referer of the next request.
    # FIX: `is not None` instead of `!= None` (identity check for None).
    if self.new_url is not None:
        self.headers['Referer'] = self.new_url
    self.new_url = response.xpath(
        '//a[@class="previous-comment-page"]//@href').extract_first()  # next page
    # FIX: Python 3 print() instead of the Python 2 print statement.
    print('----new_url---', self.new_url)
    if self.new_url:
        yield scrapy.Request(self.new_url,
                             headers=self.headers,
                             callback=self.parse)
def parse(self, response):
    """Yield data-original image URLs, then follow the site's "down page" link."""
    item = JiandanItem()
    item["image_urls"] = response.xpath(
        '//div[@class="swipeboxEx"]/div[@class="list"]/a/img/@data-original'
    ).extract()
    yield item
    next_href = response.xpath(
        '//*[@id="wrapper"]/div[3]/a[@class="downPage"]//@href').extract_first()
    # FIX: check the extracted href BEFORE concatenating. The original built
    # the string first, so on the last page (extract_first() -> None) it
    # produced the truthy, invalid URL "http://699pic.comNone" and requested it.
    if next_href:
        new_url = '%s%s' % ("http://699pic.com", next_href)
        print(new_url)
        yield scrapy.Request(new_url, callback=self.parse)
def parse(self, response):
    """Yield one item per comment's images, then follow the "next page" link."""
    # FIX: extract_first() instead of [0].extract() — the original raised
    # IndexError on the last page, where no next-page link exists, instead
    # of finishing cleanly.
    next_page = response.xpath(
        '//*[@id="comments"]//div/a[contains(text(),"下一页")]//@href'
    ).extract_first()
    for comment in response.xpath('//*[contains(@id,"comment-")]'):
        item = JiandanItem()
        item['image_urls'] = comment.xpath('.//div//img//@src').extract()
        yield item
    if next_page:
        # The href is protocol-relative, so prepend the scheme;
        # dont_filter allows revisiting URLs Scrapy has already seen.
        yield Request(url="http:" + next_page,
                      dont_filter=True,
                      callback=self.parse)
def parse(self, response):
    """Render the page with Selenium and yield the original-image links,
    then paginate up to page 10.
    """
    # NOTE(review): hard-coded chromedriver path — consider a setting.
    browser = webdriver.Chrome(
        "C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
    )
    try:
        browser.maximize_window()
        browser.get(response.url)
        browser.implicitly_wait(15)
        for link in browser.find_elements_by_link_text("[查看原图]"):
            item = JiandanItem()
            print(link.get_attribute('href'))
            item['img_url'] = link.get_attribute('href')
            yield item
    finally:
        # FIX: always shut the browser down — the original leaked one
        # Chrome process per parsed page.
        browser.quit()
    if self.page < 10:
        self.page += 1
        yield scrapy.Request(self.url + str(self.page) + "#comments", self.parse)
def parse_detail(self, response):
    """Build an item from an article page: title, author, date and body text."""
    item = JiandanItem()
    item["title"] = response.xpath(
        "//h1[@class='thetitle']/a/text()").extract_first()
    # The last postinfo text node has the shape "<author> @ <date> ...";
    # extract it once and split on '@' for both fields.
    postinfo = response.xpath("//div[@class='postinfo']/text()").extract()[-1]
    item["name"] = postinfo.split("@")[0].strip()
    item["date"] = postinfo.split("@")[1].strip()
    item["content"] = ''.join(
        response.xpath("//div[@class='entry']/p/text()").extract()
    ).replace("\n", "").strip()
    # Posts whose title contains '无聊图' carry their payload as images,
    # so the content becomes the joined image URLs instead.
    if '无聊图' in item["title"]:
        item["content"] = ';'.join(
            response.xpath(
                "//div[@class='entry']/p/img/@data-original").extract())
    print(item)
    yield item
def parse(self, response):
    """Yield a single item containing every <img> URL on the page."""
    item = JiandanItem()
    item['image_urls'] = response.xpath('//img//@src').extract()  # image links
    # FIX: Python 3 print() instead of the Python 2 print statement,
    # matching the rest of this file.
    print('image_urls', item['image_urls'])
    yield item
def parse_item(self, response):
    """Load the full-size image links into a JiandanItem via an ItemLoader."""
    loader = ItemLoader(item=JiandanItem(), response=response)
    loader.add_xpath('image_urls', '//a[@class="view_img_link"]/@href')
    return loader.load_item()
def __init__(self):
    # Single item instance stored on the spider.
    # NOTE(review): reusing one item across pages can leak state between
    # them — confirm this is intended by the callers of self.item.
    self.item = JiandanItem()