def getValidUrls():
    """Collect URLs from urlCollector until an index adds nothing new.

    Calls ``urlCollector.collectUrls(index)`` for index 0, 1, 2, ... and
    merges each result into one set. Collection stops at the first index
    whose result does not grow the set (an empty result, or one made up
    entirely of URLs that were already seen).

    Returns:
        The set of all URLs gathered. A set is always returned, including
        when the index limit of 10**6 is reached without the stop
        condition ever triggering.
    """
    allUrls = set()
    for index in xrange(10**6):
        sizeBefore = len(allUrls)
        allUrls.update(urlCollector.collectUrls(index))
        if len(allUrls) == sizeBefore:
            # This index contributed no unseen URL: treat it as the end.
            break
    # Single exit point: previously, exhausting the loop fell off the end
    # of the function and implicitly returned None instead of the set.
    return allUrls
def main():
    """Discover URLs index by index and queue the unseen ones.

    First calls ``disk.getFileLockOrDie("locks/backend.pid")`` (presumably
    a single-instance guard -- confirm against the ``disk`` module). Then,
    for each index starting at 0 (at most 10**6 of them), fetches
    ``urlCollector.collectUrls(index)``; every URL for which
    ``storage.isDiscovered`` is false is logged, passed to
    ``storage.storeUrl`` and handed to ``tobe.toDownload``. The scan stops
    at the first index that yields no such URL. An error is logged if the
    collector returned no URLs at all over the whole run.
    """
    disk.getFileLockOrDie("locks/backend.pid")

    totalSeen = 0
    for page in xrange(10**6):
        sawFresh = False
        candidates = urlCollector.collectUrls(page)
        totalSeen += len(candidates)

        for candidate in candidates:
            if storage.isDiscovered(candidate):
                # Already known: nothing to record or queue.
                continue
            sawFresh = True
            logging.info("Discovered new url: %s", candidate)
            storage.storeUrl(candidate)
            tobe.toDownload(candidate)

        if not sawFresh:
            # Nothing new at this index, so stop scanning further ones.
            break

    if totalSeen == 0:
        logging.error("No valid URL discovered")