Exemplo n.º 1
0
    def parse_summary(self, summary, link):
        
        #summary = escape.utf8(summary)
        soup = BeautifulSoup(summary)
        
        for script in list(soup.findAll('script')):
            script.extract()
            
        for o in soup.findAll(onload=True):
            del o['onload']
            
        for script in list(soup.findAll('noscript')):
            script.extract()
            
        for attr in self.remove_attributes:
            for x in soup.findAll(attrs={attr:True}):
                del x[attr]
                
        for tag in self.remove_tags:
            for x in soup.findAll(tag['name']):
                x.extract()
                
        for base in list(soup.findAll(['base', 'iframe'])):
            base.extract()
            
        #for p in list(soup.findAll(['p', 'div'])):
        #    p['style'] = 'text-indent:2em'
        
        img_count = 1
        for img in list(soup.findAll('img')):
            
            if self.noimage or img_count >= self.max_images:
                img.extract()
            else:
                image_url = absolute_path(img['src'], link)
                image = self.down_image(image_url, link)

                if image:
                    img['src'] = image
                else:
                    img.extract()
                    
            img_count = img_count + 1
        
        return soup.renderContents('utf-8')