示例#1
0
    def parse(self, response):
        """Download every hashed image on the page, then follow pagination.

        Each ``.img-hash`` text node is passed to ``util.parse`` and the
        result is prefixed with ``http:`` to form the image URL. The image
        is saved under ``F:\\jiandan`` as ``<self.size>-<index><ext>``.
        An unpopulated ``JiandanItem`` is yielded once per page, and the
        "previous-comment-page" link is followed while ``self.size`` stays
        below 20.

        NOTE(review): ``urllib.request.urlretrieve`` is a synchronous
        download, so it runs inline in this callback.
        """
        item = JiandanItem()
        hashList = response.css('.img-hash::text').extract()
        print(len(hashList))

        for index, imgHash in enumerate(hashList):
            url = 'http:' + util.parse(imgHash,
                                       'AGs7jEYU8SYmahnebE6Mvg6RCZsFysC9')
            # For sinaimg GIFs, swap the path segment after the host for
            # 'large'.
            replace = re.match(r'(.*\.sinaimg\.cn\/)(\w+)(\/.+\.gif)', url)
            if replace:
                url = replace.group(1) + 'large' + replace.group(3)
            e = re.match(r'.*(\.\w+)', url)
            if e is None:
                # No ".ext" anywhere in the URL: the original crashed here
                # with AttributeError on None and lost the rest of the page.
                print('skip (no file extension): ' + url)
                continue
            extensionName = e.group(1)
            file_path = os.path.join(
                "F:\\jiandan",
                str(self.size) + '-' + str(index) + extensionName)
            urllib.request.urlretrieve(url, file_path)
            print(url)

        yield item

        new_url = response.xpath(
            '//a[@class="previous-comment-page"]//@href').extract_first()  # pagination

        if new_url:
            # self.size doubles as the page counter and the filename prefix.
            self.size += 1
            if (self.size < 20):
                yield scrapy.Request('http:' + new_url, callback=self.parse)
示例#2
0
    def parse(self, response):
        """Yield one item holding every ``<img>`` src, then the older page.

        The follow-up request targets the first href of the anchor titled
        "Older Comments" and is sent with ``self.headers``.
        """
        page_item = JiandanItem()
        page_item['image_urls'] = response.xpath('//img//@src').extract()
        yield page_item

        older_links = response.xpath(
            '//a[@title="Older Comments"]/@href').extract()
        if not older_links:
            # No "Older Comments" anchor on this page: stop here.
            return
        yield Request(older_links[0], headers=self.headers)
示例#3
0
 def parse(self, response):
     """Yield the page's image URLs, then request the previous comment page.

     One ``JiandanItem`` is yielded per response with ``image_urls`` set to
     every ``<img>`` src attribute. If a "previous-comment-page" link
     exists, a request for it is yielded with this method as callback.
     """
     item = JiandanItem()
     item['image_urls'] = response.xpath('//img//@src').extract()  # image links
     print('image_urls', item['image_urls'])
     yield item
     new_url = response.xpath(
         '//a[@class="previous-comment-page"]//@href').extract_first()  # pagination
     print('new_url', new_url)
     if new_url:
         # scrapy.Request rejects URLs without a scheme; urljoin resolves
         # relative and protocol-relative hrefs against the response URL
         # and leaves absolute URLs unchanged.
         yield scrapy.Request(response.urljoin(new_url),
                              callback=self.parse)
示例#4
0
 def parse(self, response):
     """Collect lazy-loaded image URLs and follow the page-number link.

     Yields one ``JiandanItem`` whose ``image_urls`` holds every
     ``data-lazy-src`` attribute found under an ``<img>``. A request for
     the first ``page-numbers`` href follows when one is present.
     """
     page_item = JiandanItem()
     lazy_sources = response.xpath('//img//@data-lazy-src').extract()
     page_item['image_urls'] = lazy_sources
     yield page_item

     next_href = response.xpath(
         "//a[@class='page-numbers']//@href").extract_first()
     if not next_href:
         # No pagination link found: nothing more to request.
         return
     yield scrapy.Request(next_href, callback=self.parse)
示例#5
0
    def parse(self, response):
        """Yield the ``view_img_link`` hrefs, then request the older page.

        The hrefs of all ``a.view_img_link`` anchors are stored in one
        ``JiandanItem``. The first "Older Comments" href, when present, is
        requested with this method as callback.
        """
        sel = Selector(response)
        view_links = sel.xpath('//a[@class="view_img_link"]/@href').extract()
        page_item = JiandanItem()
        page_item['image_urls'] = view_links
        yield page_item

        older_links = sel.xpath('//a[@title="Older Comments"]/@href').extract()
        if not older_links:
            return
        yield scrapy.Request(older_links[0], callback=self.parse)
示例#6
0
 def parse(self, response):
     """Yield the page's image URLs, then request the ``page-next`` link.

     One ``JiandanItem`` is yielded with ``image_urls`` set to every
     ``<img>`` src attribute. The first href under the element with id
     ``page-next`` is requested when present.
     """
     item = JiandanItem()
     item['image_urls'] = response.xpath('//img//@src').extract()  # image links
     # Single-argument print() of a formatted string emits the same text on
     # Python 2 and 3; the old print statement is a SyntaxError on Python 3.
     print('image_urls %s' % (item['image_urls'],))
     yield item
     new_url = response.xpath('//*[@id="page-next"]//@href').extract_first()
     print('new_url %s' % (new_url,))
     if new_url:
         yield scrapy.Request(new_url, callback=self.parse)
示例#7
0
 def parse(self, response):
     """Yield the page's image URLs, then request listing pages 2 to 90.

     One ``JiandanItem`` is yielded with ``image_urls`` set to every
     ``<img>`` src attribute, followed by one request per
     ``/a/more_<n>.html`` listing page.

     NOTE(review): every response re-yields the same 89 requests; this
     presumably relies on Scrapy's default duplicate filter to drop the
     repeats. Confirm it is not disabled in settings.
     """
     item = JiandanItem()
     item['image_urls'] = response.xpath('//img//@src').extract()  # image links
     yield item
     # The original `while counter <= n` never incremented `counter`, so it
     # yielded the page-2 request forever. A bounded range visits each page
     # number exactly once.
     last_page = 90
     for page_number in range(2, last_page + 1):
         new_url = "http://www.meizitu.com" + "/a/more_" + str(
             page_number) + ".html"
         yield scrapy.Request(new_url, callback=self.parse)
示例#8
0
 def parse(self, response):
     """Yield image URLs found inside ``<p>`` tags and follow pagination.

     One ``JiandanItem`` carries every ``<img>`` src nested under a
     ``<p>``. The "previous-comment-page" href, when present, is resolved
     against the response URL and requested with this method as callback.
     """
     page_item = JiandanItem()
     paragraph_images = response.xpath('//p//img//@src').extract()
     page_item['image_urls'] = paragraph_images
     yield page_item

     previous_href = response.xpath(
         '//a[@class="previous-comment-page"]//@href').extract_first()
     if not previous_href:
         # No earlier page link: the crawl ends with this page.
         return
     absolute_url = response.urljoin(previous_href)
     yield scrapy.Request(absolute_url, callback=self.parse)
示例#9
0
 def parse(self, response):
     """Yield the page's image URLs, then request the previous comment page.

     One ``JiandanItem`` is yielded with ``image_urls`` set to every
     ``<img>`` src attribute. If a "previous-comment-page" link exists, a
     request for it is yielded with this method as callback.
     """
     item = JiandanItem()
     item['image_urls'] = response.xpath('//img//@src').extract()  # image links
     yield item
     # XPath candidates kept from the original author's notes:
     #   //*[@id="comments"]/div[3]/div/a[3]
     #   //*[@id="comments"]/div[3]/div/a[1]
     new_url = response.xpath(
         '//a[@class="previous-comment-page"]//@href').extract_first()
     print(new_url)
     if new_url:
         # scrapy.Request rejects URLs without a scheme; urljoin resolves
         # relative and protocol-relative hrefs against the response URL
         # and leaves absolute URLs unchanged.
         yield scrapy.Request(response.urljoin(new_url),
                              callback=self.parse)
示例#10
0
 def parse_detail(self, response):
     """Yield one picture item for this page, then follow the next-page link.

     The title is read from ``response.meta`` and passed on unchanged in
     the meta of the follow-up request. ``pic`` receives the src values
     matched under ``div.articleV4Body``.
     """
     gallery_title = response.meta.get('title')

     picture = JiandanItem()
     picture['pic'] = response.xpath(
         "//div[@class='articleV4Body']/p/a[1]/img/@src").extract()
     picture['title'] = gallery_title
     yield picture

     next_href = response.xpath(
         "//div[@class='page-tag oh']//li[@id='nl']/a/@href").extract_first()
     if not next_href:
         return
     # Resolve the href against the URL of the page it was found on.
     following_page = parse.urljoin(response.url, next_href)
     yield scrapy.Request(following_page,
                          meta={"title": gallery_title},
                          callback=self.parse_detail)
示例#11
0
    def parse(self, response):
        """Yield the page's image URLs and follow pagination with a Referer.

        One ``JiandanItem`` is yielded with ``image_urls`` set to every
        ``<img>`` src attribute. ``self.new_url`` keeps the URL requested
        by the previous call; it is copied into ``self.headers['Referer']``
        before being replaced by this page's "previous-comment-page" link.
        """
        item = JiandanItem()
        item['image_urls'] = response.xpath('//img//@src').extract()  # image links
        yield item
        if self.new_url is not None:
            self.headers['Referer'] = self.new_url
        next_href = response.xpath(
            '//a[@class="previous-comment-page"]//@href').extract_first()  # pagination
        # Store the absolute form so both the Request and the next Referer
        # get a URL with a scheme; scrapy.Request rejects scheme-less URLs.
        self.new_url = response.urljoin(next_href) if next_href else next_href
        # Single-argument print() of a formatted string emits the same text
        # on Python 2 and 3; the print statement is a SyntaxError on 3.
        print('----new_url--- %s' % (self.new_url,))
        if self.new_url:
            yield scrapy.Request(self.new_url,
                                 headers=self.headers,
                                 callback=self.parse)
示例#12
0
 def parse(self, response):
     """Yield the listing's image URLs, then request the ``downPage`` link.

     ``image_urls`` is filled from the ``data-original`` attribute of the
     images under ``div.swipeboxEx > div.list``. The ``downPage`` href is
     appended to the site root to build the next request.
     """
     item = JiandanItem()
     item["image_urls"] = response.xpath(
         '//div[@class="swipeboxEx"]/div[@class="list"]/a/img/@data-original'
     ).extract()
     url = "http://699pic.com"
     yield item
     next_href = response.xpath(
         '//*[@id="wrapper"]/div[3]/a[@class="downPage"]//@href'
     ).extract_first()
     # Test the raw href before formatting: the original formatted first,
     # so a missing link became the truthy "http://699pic.comNone" and was
     # requested anyway.
     if next_href:
         new_url = '%s%s' % (url, next_href)
         print(new_url)
         yield scrapy.Request(new_url, callback=self.parse)
示例#13
0
    def parse(self, response):
        """Yield one item per comment node, then request the next page.

        Every element whose id contains ``comment-`` produces a
        ``JiandanItem`` holding the image src values found beneath it. The
        "下一页" link, when present, is prefixed with ``http:`` and
        requested with ``dont_filter=True``.
        """
        # image urls
        img_list = response.xpath('//*[contains(@id,"comment-")]')
        # extract_first() returns None when nothing matches. The original
        # `[0].extract()` raised IndexError on a page without a next link,
        # before any of that page's items were yielded.
        next_page = response.xpath(
            '//*[@id="comments"]//div/a[contains(text(),"下一页")]//@href'
        ).extract_first()
        for img in img_list:
            item = JiandanItem()
            img_url = img.xpath('.//div//img//@src').extract()
            item['image_urls'] = img_url
            yield item

        # the next page
        if next_page:
            yield Request(url="http:" + next_page,
                          dont_filter=True,
                          callback=self.parse)
示例#14
0
 def parse(self, response):
     """Render the page in Chrome, yield each "[查看原图]" link, then page on.

     A Chrome WebDriver loads ``response.url``; the href of every link
     whose text is "[查看原图]" is yielded as ``img_url`` in its own
     ``JiandanItem``. While ``self.page`` is below 10 it is incremented
     and the next page (``self.url + page + "#comments"``) is requested.
     """
     # Raw string: the backslashes are literal path separators. The value
     # is unchanged, but the invalid-escape warnings go away.
     browser = webdriver.Chrome(
         r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
     )
     try:
         browser.maximize_window()
         browser.get(response.url)
         browser.implicitly_wait(15)
         # Read the hrefs while the browser is still alive.
         hrefs = [
             a.get_attribute('href')
             for a in browser.find_elements_by_link_text("[查看原图]")
         ]
     finally:
         # The original never closed the browser, leaking one Chrome
         # process per crawled page.
         browser.quit()
     for href in hrefs:
         item = JiandanItem()
         print(href)
         item['img_url'] = href
         yield item
     if self.page < 10:
         self.page += 1
         # Request only when the page number advanced; the original also
         # re-requested page 10 and left the duplicate to be dropped later.
         yield scrapy.Request(self.url + str(self.page) + "#comments",
                              self.parse)
示例#15
0
 def parse_detail(self, response):
     """Build one item with title, name, date and content from a post page.

     ``name`` and ``date`` are the text before and after the first ``@``
     in the last text node of ``div.postinfo``. ``content`` is the joined
     paragraph text of ``div.entry``; for titles containing "无聊图" it is
     replaced by the ``;``-joined ``data-original`` image attributes.
     Missing title or postinfo yields ``None`` / empty strings instead of
     raising.
     """
     item = JiandanItem()
     title = response.xpath(
         "//h1[@class='thetitle']/a/text()").extract_first()
     item["title"] = title
     # Evaluate the postinfo XPath once; the original ran it twice and
     # raised IndexError when the node or the '@' separator was absent.
     postinfo_texts = response.xpath(
         "//div[@class='postinfo']/text()").extract()
     postinfo_parts = postinfo_texts[-1].split("@") if postinfo_texts else []
     item["name"] = postinfo_parts[0].strip() if postinfo_parts else ''
     item["date"] = (postinfo_parts[1].strip()
                     if len(postinfo_parts) > 1 else '')
     item["content"] = ''.join(
         response.xpath(
             "//div[@class='entry']/p/text()").extract()).replace(
                 "\n", "").strip()
     # extract_first() may return None; `in` on None raises TypeError.
     if title and '无聊图' in title:
         item["content"] = ';'.join(
             response.xpath(
                 "//div[@class='entry']/p/img/@data-original").extract())
     print(item)
     yield item
示例#16
0
 def parse(self, response):
     """Yield one item holding every ``<img>`` src attribute on the page."""
     item = JiandanItem()
     item['image_urls'] = response.xpath(
         '//img//@src').extract()  # image hyperlinks
     # Single-argument print() of a formatted string emits the same text on
     # Python 2 and 3; the old print statement is a SyntaxError on Python 3.
     print('image_urls %s' % (item['image_urls'],))
     yield item
示例#17
0
    def parse_item(self, response):
        """Return a ``JiandanItem`` loaded with the ``view_img_link`` hrefs.

        An ``ItemLoader`` bound to the response collects the href of every
        ``a.view_img_link`` anchor into the ``image_urls`` field.
        """
        loader = ItemLoader(item=JiandanItem(), response=response)
        loader.add_xpath('image_urls', '//a[@class="view_img_link"]/@href')
        populated = loader.load_item()
        return populated
示例#18
0
 def __init__(self):
     # Create one JiandanItem up front and keep it on the instance.
     # NOTE(review): the enclosing class is not visible here; if it
     # subclasses a Scrapy base class, confirm whether super().__init__()
     # is needed.
     self.item = JiandanItem()