def build_index(self):
    #========== added =========
    downloaded_list = DownloadedArticlesList(self.download_history_file)
    #==========================
    self.report_progress(0, _('Fetching feeds...'))
    try:
        feeds = feeds_from_index(self.parse_index(),
                                 oldest_article=self.oldest_article,
                                 max_articles_per_feed=self.max_articles_per_feed,
                                 log=self.log)
        self.report_progress(0, _('Got feeds from index page'))
    except NotImplementedError:
        feeds = self.parse_feeds()

    #========== reworked =========
    # Skip articles whose URLs are already in the download history
    for feed in feeds:
        feed.articles = [article for article in feed.articles
                         if article.url not in downloaded_list]

    if self.ignore_duplicate_articles is not None:
        feeds = self.remove_duplicate_articles(feeds)
    # Filter out empty feeds
    feeds = [feed for feed in feeds if len(feed.articles)]
    if not feeds:
        raise ValueError('No articles found, aborting')
    #=============================

    #feeds = FeedCollection(feeds)

    self.has_single_feed = len(feeds) == 1
    index = os.path.join(self.output_dir, 'index.html')
    html = self.feeds2index(feeds)
    with open(index, 'wb') as fi:
        fi.write(html)
    self.jobs = []

    if self.reverse_article_order:
        for feed in feeds:
            if hasattr(feed, 'reverse'):
                feed.reverse()

    self.feed_objects = feeds
    for f, feed in enumerate(feeds):
        feed_dir = os.path.join(self.output_dir, 'feed_%d' % f)
        if not os.path.isdir(feed_dir):
            os.makedirs(feed_dir)

        for a, article in enumerate(feed):
            #========== refactored =========
            art_dir = os.path.join(feed_dir, 'article_%d' % a)
            if not os.path.isdir(art_dir):
                os.makedirs(art_dir)
            # Record the URL and schedule the per-feed fetch callable
            downloaded_list.add(article.url)
            url = self.feed_settings[feed.title].print_version_url(article.url)
            req = WorkRequest(
                self.feed_settings[feed.title].fetch,
                (self, article, url, art_dir, f, a, len(feed)),
                {}, (f, a), self.article_downloaded,
                self.error_in_article_download)
            #===============================
            req.feed = feed
            req.article = article
            req.feed_dir = feed_dir
            self.jobs.append(req)

    self.jobs_done = 0
    tp = ThreadPool(self.simultaneous_downloads)
    for req in self.jobs:
        tp.putRequest(req, block=True, timeout=0)

    self.report_progress(0, _('Starting download [%d thread(s)]...') %
                         self.simultaneous_downloads)
    while True:
        try:
            tp.poll()
            time.sleep(0.1)
        except NoResultsPending:
            break

    for f, feed in enumerate(feeds):
        html = self.feed2index(f, feeds)
        feed_dir = os.path.join(self.output_dir, 'feed_%d' % f)
        with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
            fi.write(html)
    self.create_opf(feeds)
    self.report_progress(1, _('Feeds downloaded to %s') % index)
    #========== added =========
    downloaded_list.close()
    #==========================
    return index
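# The DownloadedArticlesList used above is referenced but not defined in this
# excerpt. The class below is a minimal sketch of what such a persistent URL
# store could look like, assuming self.download_history_file points at a
# plain-text file with one URL per line; the real implementation may differ.
import os


class DownloadedArticlesList(object):
    '''Persistent set of article URLs that have already been downloaded.'''

    def __init__(self, history_file):
        self.history_file = history_file
        self._urls = set()
        if os.path.exists(history_file):
            with open(history_file, 'r', encoding='utf-8') as f:
                # One URL per line; ignore blank lines
                self._urls = {line.strip() for line in f if line.strip()}
        # New URLs are appended as they are scheduled for download
        self._new = open(history_file, 'a', encoding='utf-8')

    def __contains__(self, url):
        # Supports the `article.url not in downloaded_list` check above
        return url in self._urls

    def add(self, url):
        if url not in self._urls:
            self._urls.add(url)
            self._new.write(url + '\n')
            self._new.flush()

    def close(self):
        self._new.close()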
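# self.feed_settings (used in the refactored block above) is assumed to map a
# feed title to a small per-feed configuration object exposing
# print_version_url() and fetch(). Neither the class nor how the mapping is
# populated appears in this excerpt, so the sketch below is illustrative only.


class FeedSettings(object):
    '''Hypothetical per-feed settings: how to derive the print-version URL
    and which callable the worker thread should run for each article.'''

    def __init__(self, print_version_url=None, fetch=None):
        # Default: leave the article URL unchanged
        self.print_version_url = print_version_url or (lambda url: url)
        # fetch(recipe, article, url, art_dir, f, a, feed_len) performs the
        # actual download, mirroring the WorkRequest arguments above
        self.fetch = fetch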
def build_index(self, data, browser):
    sections = data.get('index', None)
    if not sections:
        raise ValueError('No articles found, aborting')

    feeds = feeds_from_index(sections, oldest_article=self.oldest_article,
                             max_articles_per_feed=self.max_articles_per_feed,
                             log=self.log)
    if not feeds:
        raise ValueError('No articles found, aborting')

    if self.ignore_duplicate_articles is not None:
        feeds = self.remove_duplicate_articles(feeds)

    if self.test:
        feeds = feeds[:self.test[0]]

    self.has_single_feed = len(feeds) == 1
    index = os.path.join(self.output_dir, 'index.html')
    html = self.feeds2index(feeds)
    with open(index, 'wb') as fi:
        fi.write(html)

    if self.reverse_article_order:
        for feed in feeds:
            if hasattr(feed, 'reverse'):
                feed.reverse()

    self.report_progress(0, _('Got feeds from index page'))

    resource_cache = {}

    total = 0
    for feed in feeds:
        total += min(self.max_articles_per_feed, len(feed))
    num = 0

    for f, feed in enumerate(feeds):
        feed_dir = os.path.join(self.output_dir, 'feed_%d' % f)
        if not os.path.isdir(feed_dir):
            os.makedirs(feed_dir)

        for a, article in enumerate(feed):
            if a >= self.max_articles_per_feed:
                break
            num += 1
            art_dir = os.path.join(feed_dir, 'article_%d' % a)
            if not os.path.isdir(art_dir):
                os.makedirs(art_dir)
            try:
                url = self.print_version(article.url)
            except NotImplementedError:
                url = article.url
            except:
                self.log.exception('Failed to find print version for: ' + article.url)
                url = None
            if not url:
                continue

            self.log.debug('Downloading article:', article.title, 'from', url)
            try:
                pages = fetch_page(
                    url,
                    load_complete=self.load_complete,
                    links=self.select_links,
                    remove=self.remove_tags,
                    keep_only=self.keep_only_tags,
                    preprocess_browser=partial(self._preprocess_browser, article),
                    postprocess_html=partial(self._postprocess_html, article, f, a, len(feed)),
                    remove_before=self.remove_tags_before,
                    remove_after=self.remove_tags_after,
                    remove_javascript=self.remove_javascript,
                    delay=self.delay,
                    resource_cache=resource_cache,
                    output_dir=art_dir,
                    browser=browser)
            except AbortFetch:
                self.log.exception('Fetching of article: %r aborted' % article.title)
                continue
            except Exception:
                self.log.exception('Fetching of article: %r failed' % article.title)
                continue
            self.log.debug('Downloaded article:', article.title, 'from', article.url)
            article.orig_url = article.url
            article.url = 'article_%d/index.html' % a
            article.downloaded = True
            article.sub_pages = pages[1:]
            self.report_progress(
                float(num) / total,
                _(u'Article downloaded: %s') % force_unicode(article.title))

    for f, feed in enumerate(feeds):
        html = self.feed2index(f, feeds)
        feed_dir = os.path.join(self.output_dir, 'feed_%d' % f)
        with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
            fi.write(html)

    if self.no_stylesheets:
        for f in walk(self.output_dir):
            if f.endswith('.css'):
                os.remove(f)

    self.create_opf(feeds)
    self.report_progress(1, _('Download finished'))
    return index