示例#1
0
 def parse_two(self, response):
     """Collect the entries of a listing page and follow the first one.

     Reads the item passed down in ``response.meta['item1']``, builds one
     ``XiaohuaItem`` per ``div`` tile whose class attribute is
     ``item masonry_brick masonry-brick``, then creates the output
     directory of the first collected entry and requests its page.

     Args:
         response: Listing-page response; ``meta['item1']['fileName']`` is
             used as the directory prefix for every entry.

     Yields:
         At most one ``Request`` for the first entry's link, carrying the
         new item in ``meta['item2']`` and handled by ``parse_three``.
         Nothing is yielded when the page has no usable entries.
     """
     # Item handed down from the previous callback.
     parent_item = response.meta['item1']
     # NOTE(review): no parser is named here, so bs4 uses whichever one is
     # installed; consider pinning one for reproducible parsing.
     page_soup = BeautifulSoup(response.text)
     entries = []
     for tile in page_soup.find_all('div', 'item masonry_brick masonry-brick'):
         title_div = tile.find('div', 'title')
         image_div = tile.find('div', 'img')
         link = image_div.find('a') if image_div is not None else None
         if title_div is None or link is None or link.get('href') is None:
             # Incomplete tile: skip it instead of failing the whole page.
             continue
         title = title_div.text
         entry = XiaohuaItem()
         entry['meizhiURL'] = link['href']
         entry['fileName'] = parent_item['fileName'] + '/' + title
         entry['title'] = title
         entries.append(entry)
     # Only the first entry is followed; the slice keeps that limit while
     # yielding nothing (rather than raising IndexError) on an empty page.
     for entry in entries[:1]:
         target_dir = entry['fileName']
         if not os.path.exists(target_dir):
             os.makedirs(target_dir)
         yield Request(url=entry['meizhiURL'],
                       meta={'item2': entry},
                       callback=self.parse_three)
示例#2
0
    def parse_one(self, response):
        """Collect the tag links of the index page and follow the first one.

        Builds one ``XiaohuaItem`` per ``<a>`` inside the first
        ``div.TagList`` element, then creates the output directory of the
        first collected item and requests its page.

        Args:
            response: Index-page response whose HTML is parsed.

        Yields:
            At most one ``Request`` for the first tag's URL, carrying the
            item in ``meta['item1']`` and handled by ``parse_two``.
            Nothing is yielded when the page has no usable tag links.
        """
        base_url = "http://www.mmonly.cc"
        # NOTE(review): no parser is named here, so bs4 uses whichever one
        # is installed; consider pinning one for reproducible parsing.
        websoup = BeautifulSoup(response.text)
        tag_lists = websoup.find_all('div', "TagList")
        if not tag_lists:
            # No tag container on this page: nothing to follow.
            return

        # One item per tag link found in the first container.
        items = []
        for anchor in tag_lists[0].find_all('a'):
            href = anchor.get('href')
            title = anchor.get('title')
            if href is None or title is None:
                # Anchors without a target or a title cannot be turned into
                # a URL and a directory name, so they are skipped.
                continue
            item = XiaohuaItem()
            item['siteURL'] = base_url + href
            item['title'] = title
            item['fileName'] = self.base + title
            items.append(item)

        # Only the first tag is followed; the slice keeps that limit while
        # yielding nothing (rather than raising IndexError) on an empty list.
        for item in items[:1]:
            # Create the output directory for this tag.
            target_dir = item['fileName']
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)
            # Pass the item to the next callback through meta.
            yield Request(url=item['siteURL'],
                          meta={'item1': item},
                          callback=self.parse_two)
示例#3
0
 def parse_two(self,response):
     """Expand one page into a request per numbered sub-page.

     Re-downloads ``response.url`` with ``requests``, reads the page count
     from the text between ``共`` and ``页``, and for every page number
     builds an item with its storage path and entry-page URL.

     Args:
         response: Response whose ``meta['item1']['fileName']`` names the
             output directory and whose URL is the base for the page URLs.

     Yields:
         One ``Request`` per page number, carrying the item in
         ``meta['item2']`` and handled by ``parse_three``. Nothing is
         yielded when no usable page count is found.
     """
     import logging

     # Item handed down from the previous callback.
     parent_item = response.meta['item1']
     # NOTE(review): this is a second, blocking download of a page the
     # response already holds; it appears to exist so the encoding can be
     # taken from requests' apparent_encoding -- confirm before removing.
     source = requests.get(response.url)
     source.encoding = source.apparent_encoding
     found = re.search('共(.*?)页', source.text, re.S)
     try:
         page_total = int(found.group(1)) if found else None
     except ValueError:
         page_total = None
     if page_total is None:
         # Missing or non-numeric counter: report it instead of raising.
         logging.getLogger(__name__).warning(
             'parse_two: no usable page count in %s', response.url)
         return

     # Every item shares one directory, so create it once up front.
     target_dir = parent_item['fileName']
     if page_total >= 1 and not os.path.exists(target_dir):
         os.makedirs(target_dir)

     # Drops the last five characters of the URL (presumably '.html').
     url_stem = response.url[:-5]
     for page_no in range(1, page_total + 1):
         item = XiaohuaItem()
         item['fileName'] = target_dir
         # Storage path of this page's image.
         item['path'] = target_dir + '/' + str(page_no) + '.jpg'
         # Entry page whose source holds the original image link.
         item['pageURL'] = url_stem + '_' + str(page_no) + '.html'
         yield Request(url=item['pageURL'],meta={'item2':item},callback=self.parse_three)
示例#4
0
 def parse_three(self,response):
     """Extract the download link from a page and emit the finished item.

     Args:
         response: Response whose ``meta['item2']['title']`` is copied into
             the emitted item and whose HTML is searched for the
             ``down-btn`` anchor.

     Yields:
         One ``XiaohuaItem`` with ``detailURL`` and ``title`` set, or
         nothing when the page has no matching download link.
     """
     import logging

     # Item handed down from the previous callback.
     parent_item = response.meta['item2']
     pattern=re.compile(r'<li class="pic-down h-pic-down"><a target="_blank" class="down-btn" href=\'(.*?)\'>.*?</a>',re.S)
     found = pattern.search(response.text)
     if found is None:
         # No download link on this page: report it instead of raising
         # AttributeError on a None match.
         logging.getLogger(__name__).warning(
             'parse_three: no download link in %s', response.url)
         return
     item = XiaohuaItem()
     item['detailURL'] = found.group(1)
     item['title'] = parent_item['title']
     yield item
示例#5
0
 def parse_three(self, response):
     """Read the page count and build one item per page number.

     Only the start of this method is visible here; the items built in the
     loop below are not used within the visible part.
     """
     # Item handed down from the previous callback.
     item3 = response.meta['item2']
     # NOTE(review): ``html`` is bytes after this encode, while the pattern
     # below is a str; on Python 3 re.search rejects that combination with
     # a TypeError -- confirm the target interpreter.
     html = response.text.encode('utf-8')
     # Extract the page count with a regex.
     pattern = re.compile(r'共(.*?)页', re.S)
     Num = re.search(pattern, html).group(1)
     for i in range(1, int(Num) + 1):
         item = XiaohuaItem()
         item['fileName'] = item3['fileName']
         # Storage path: <fileName>/<page number>.jpg
         item['path'] = item['fileName'] + '/' + str(i) + '.jpg'
     '''
示例#6
0
    def parse_one(self,response):
        """Find every title link on the page and request each target.

        Each regex match supplies a (URL, title) pair that is stored in a
        fresh ``XiaohuaItem``; once all items are built, one ``Request``
        per item is yielded with the item in ``meta['item1']`` and
        ``parse_two`` as the callback.
        """
        title_link = re.compile(r'<div class="title".*?<a.*?href="(.*?)">(.*?)</a></span></div>',re.S)

        # Build every item first, then issue the requests.
        pending = []
        for site_url, title in title_link.findall(response.text):
            entry = XiaohuaItem()
            entry['siteURL'] = site_url
            entry['title'] = title
            pending.append(entry)

        for entry in pending:
            # Pass the item to the next callback through meta.
            yield Request(
                url=entry['siteURL'],
                meta={'item1': entry},
                callback=self.parse_two,
            )
示例#7
0
 def parse_two(self,response):
     """Expand one page into a request per numbered sub-page.

     Re-downloads ``response.url`` with ``requests``, reads the page count
     from the text between ``共`` and ``页``, and for every page number
     builds an item with the parent's title and the entry-page URL.

     Args:
         response: Response whose ``meta['item1']['title']`` is copied into
             each item and whose URL is the base for the page URLs.

     Yields:
         One ``Request`` per page number, carrying the item in
         ``meta['item2']`` and handled by ``parse_three``. Nothing is
         yielded when no usable page count is found.
     """
     import logging

     # Item handed down from the previous callback.
     parent_item = response.meta['item1']
     # NOTE(review): this is a second, blocking download of a page the
     # response already holds, and requests picks the text encoding on its
     # own here -- confirm it decodes the page counter correctly.
     source = requests.get(response.url)
     # Search the decoded text directly; encoding it to UTF-8 and decoding
     # it again would hand re.search the same string.
     found = re.search(r'共(.*?)页', source.text, re.S)
     try:
         page_total = int(found.group(1)) if found else None
     except ValueError:
         page_total = None
     if page_total is None:
         # Missing or non-numeric counter: report it instead of raising.
         logging.getLogger(__name__).warning(
             'parse_two: no usable page count in %s', response.url)
         return

     # Drops the last five characters of the URL (presumably '.html').
     url_stem = response.url[:-5]
     for page_no in range(1, page_total + 1):
         item = XiaohuaItem()
         item['title'] = parent_item['title']
         # Entry page whose source holds the original image link.
         item['pageURL'] = url_stem + '_' + str(page_no) + '.html'
         yield Request(url=item['pageURL'],meta={'item2':item},callback=self.parse_three)