def parse_summary(self, summary, link):
    """Clean up an article's HTML summary and point its images at local copies.

    The markup is parsed with BeautifulSoup and then, in order:

    * ``script``, ``noscript``, ``base`` and ``iframe`` elements are removed;
    * ``onload`` attributes are deleted from every element carrying one;
    * every attribute named in ``self.remove_attributes`` is deleted;
    * every element whose tag name is the ``'name'`` entry of an item in
      ``self.remove_tags`` is removed;
    * each ``img`` is either removed or has its ``src`` replaced by the
      value returned from ``self.down_image``.

    An image is removed when ``self.noimage`` is true, when the image
    budget is used up, when it has no usable ``src`` attribute, or when
    ``self.down_image`` returns a false value.

    :param summary: HTML fragment to clean.
    :param link: URL of the article; passed to ``absolute_path`` together
        with each image ``src`` (presumably to resolve relative URLs --
        confirm against ``absolute_path``) and to ``self.down_image``.
    :returns: the cleaned markup as produced by
        ``soup.renderContents('utf-8')``.
    """
    soup = BeautifulSoup(summary)

    # findAll() returns a fully built list, so extracting nodes while
    # looping over the result is safe without copying it first.
    for node in soup.findAll(['script', 'noscript', 'base', 'iframe']):
        node.extract()

    for node in soup.findAll(onload=True):
        del node['onload']

    for attr in self.remove_attributes:
        for node in soup.findAll(attrs={attr: True}):
            del node[attr]

    for tag in self.remove_tags:
        for node in soup.findAll(tag['name']):
            node.extract()

    # NOTE(review): the counter starts at 1 and is compared with ``>=``, so
    # at most ``max_images - 1`` downloads are attempted.  Kept as found;
    # confirm whether that off-by-one is intended.
    img_count = 1
    for img in soup.findAll('img'):
        # ``img['src']`` raises KeyError for <img> tags without a src
        # attribute (e.g. lazy-loaded images); treat those, and empty
        # values, as images that cannot be fetched.
        src = img.get('src')
        if self.noimage or img_count >= self.max_images or not src:
            img.extract()
        else:
            image = self.down_image(absolute_path(src, link), link)
            if image:
                img['src'] = image
            else:
                img.extract()
        # Every <img> encountered counts against the budget, whether or
        # not its download succeeded.
        img_count += 1

    return soup.renderContents('utf-8')