Пример #1
0
 def process_pages(self):
     """Fetch, parse and register every page listed in ``self.urls``.

     ``self.urls`` is iterated as ``(num, url)`` pairs. For each pair with
     a truthy ``num`` and ``url`` the HTML is requested through
     ``helpers.get_html``; a ``None`` result is treated as a failure.
     Each page that is fetched successfully is parsed with BeautifulSoup,
     wrapped in a ``Page`` and recorded in:

     * ``self.urls_with_nums``     -- url -> num
     * ``self.indices_with_pages`` -- running index -> page
     * ``self.pages_with_ids``     -- page.ID -> page
     * ``self.pages``              -- list of pages, in processing order

     Every ``<a>`` tag with a non-empty ``href`` that is not a ``mailto:``
     link is appended to ``page.a``.

     Entries that could not be processed are collected and reported on
     stdout once the loop is done.

     Raises:
         RuntimeError: if two pages produce the same ``page.ID``. The
             colliding page is not registered in any of the containers.
     """
     skipped = []
     pbar = ProgressBar(widgets=['Processing pages: ', SimpleProgress()], maxval=len(self.urls)).start()
     # Index among successfully processed pages only (skipped entries do
     # not consume an index).
     index = 0

     for position, (num, url) in enumerate(self.urls, 1):
         # Progress is reported by position in self.urls: it is the only
         # value guaranteed to stay within maxval, and it does not need
         # int(num), which fails for the empty/None nums skipped below.
         pbar.update(position)

         if not (num and url):
             skipped.append(num)
             continue

         html = helpers.get_html(num, url)
         if html is None:
             skipped.append(num)
             continue

         soup = BeautifulSoup(html.encode('utf-8', 'ignore'), 'lxml')
         # Documents without <title> or <body> must not abort the run:
         # fall back to no title / the text of the whole document.
         title_tag = soup.title
         body_tag = soup.body
         page = Page(
             title=title_tag.string if title_tag is not None else None,
             num=num,
             html=soup.prettify(),
             url=url,
             text=(body_tag if body_tag is not None else soup).get_text(),
         )

         # Check for a collision before touching any container so that a
         # failure never leaves a half-registered page behind.
         if page.ID in self.pages_with_ids:
             raise RuntimeError('COLLISION: %s collides with %s with hash %s.' % (page.num, self.pages_with_ids[page.ID].num, page.ID))

         for link in soup.find_all('a'):
             href = link.get('href')
             if href and not href.strip().startswith('mailto:'):
                 page.a.append(link)

         page.index = index
         self.urls_with_nums[url] = num
         self.indices_with_pages[index] = page
         self.pages_with_ids[page.ID] = page
         self.pages.append(page)
         index += 1

     pbar.finish()

     if skipped:
         # Skipped nums may be None or non-string values, so format each
         # one explicitly instead of handing them straight to join().
         # The single-argument parenthesised form prints identically on
         # Python 2 and Python 3.
         print("Skipped page(s) %s because of an error." % ', '.join('%s' % (entry,) for entry in skipped))