def fetch(self, url):
    """
    Fetch url, caching the parsed document on disk.

    The cache file name is built by joining the cache directory with the
    result of the configured filename formatter applied to `url`. If that
    file already exists, no request is made.

    Args:
      url: A string URL to download.

    Returns:
      The path (a string) of the cached file holding the
      BeautifulSoup-parsed document, encoded as UTF-8.
    """
    fname = os.path.join(self._cachedir, self._formatter(url))
    if os.path.exists(fname):
        return fname

    # Throttle only real network requests; cache hits return immediately.
    time.sleep(self._sleep)
    response = urllib.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()

    # Serialize the document *before* creating the cache file: a parse
    # failure must not leave behind an empty file that later calls would
    # mistake for a valid cache entry.
    text = unicode(BeautifulSoup(html))
    try:
        with codecs.open(fname, 'w', 'utf-8') as f:
            f.write(text)
    except Exception:
        # A partially written file would poison the cache; drop it.
        try:
            os.remove(fname)
        except OSError:
            # Best effort only: the original error is the one to report.
            pass
        raise
    return fname
def run(sitemapurl, patt, cachedir, cachejournal, sleep=5):
    """
    Download every sitemap URL matching patt into a local cache.

    Each fetched URL is recorded in the journal file as one
    "<cached filename>,<url>" line.

    Args:
      sitemapurl: A string URL to an XML sitemap.
      patt: A string used for substring matching of the urls in the sitemap.
      cachedir: Directory used to cache downloaded HTML files.
      cachejournal: A string filename to store records about the cache
        directory. Should be considered a tmp file; it is truncated on
        every run.
      sleep: Integer amount of time to sleep between HTTP requests, in
        seconds.
    """
    fetcher = CacheFetcher(cachedir, filename_formatter, sleep)
    sitemap = urllib.urlopen(sitemapurl)
    try:
        with open(cachejournal, 'w') as journal:
            for url in extract_sitemap(sitemap, patt):
                fname = fetcher.fetch(url)
                journal.write('{0},{1}\n'.format(fname, url))
    finally:
        # Close only after the loop: extract_sitemap may consume the
        # response lazily while URLs are being fetched.
        sitemap.close()