def process_pages(self):
    """Fetch and parse every entry of ``self.urls`` into ``Page`` objects.

    For each ``(num, url)`` pair the HTML is retrieved with
    ``helpers.get_html`` and parsed with BeautifulSoup. Every page that is
    built is recorded in ``self.urls_with_nums`` (url -> num),
    ``self.indices_with_pages`` (running index -> page),
    ``self.pages_with_ids`` (page.ID -> page) and ``self.pages``; anchors
    that carry an ``href`` not starting with ``mailto:`` are appended to
    ``page.a``.

    Pairs with a falsy ``num`` or ``url``, and pairs for which
    ``helpers.get_html`` returns ``None``, are skipped. The skipped numbers
    are reported on stdout once processing finishes.

    Raises:
        RuntimeError: if two pages yield the same ``page.ID``.
    """
    skipped = []
    pbar = ProgressBar(widgets=['Processing pages: ', SimpleProgress()],
                       maxval=len(self.urls)).start()
    index = 0  # counts only pages that were actually built
    for position, (num, url) in enumerate(self.urls, 1):
        # Drive the bar by position rather than by page number: the position
        # never exceeds maxval, and a missing/empty num can no longer blow up
        # in int() before the guard below gets a chance to skip it.
        pbar.update(position)

        if not (num and url):
            skipped.append(num)
            continue

        html = helpers.get_html(num, url)
        if html is None:
            skipped.append(num)
            continue

        self.urls_with_nums[url] = num
        soup = BeautifulSoup(html.encode('utf-8', 'ignore'), 'lxml')
        # NOTE(review): soup.title / soup.body are None when the document
        # lacks those tags, which would raise AttributeError here - confirm
        # whether such pages should be skipped instead.
        page = Page(title=soup.title.string,
                    num=num,
                    html=soup.prettify(),
                    url=url,
                    text=soup.body.get_text())
        page.index = index
        self.indices_with_pages[index] = page

        # Direct dict membership avoids building a key list per page.
        if page.ID in self.pages_with_ids:
            raise RuntimeError('COLLISION: %s collides with %s with hash %s.' % (page.num, self.pages_with_ids[page.ID].num, page.ID))
        self.pages_with_ids[page.ID] = page

        for link in soup.find_all('a'):
            href = link.get('href')
            if href and not href.strip().startswith('mailto:'):
                page.a.append(link)

        self.pages.append(page)
        index += 1

    pbar.finish()
    if skipped:
        # Skipped nums may be None or non-string, so format each one before
        # joining. Single-argument print(...) works on Python 2 and 3 alike.
        print("Skipped page(s) %s because of an error."
              % ', '.join('%s' % (n,) for n in skipped))