def test_insert():
    """Inserting the same host repeatedly must leave exactly one entry.

    Every pair below is ``(number, host)``; the store is built with
    ``itemgetter(1)`` and ``itemgetter(0)`` as its two accessors, so the
    host string is what identifies an entry here.
    """
    host = 'en.wikipedia.org'
    store = SortedKeyValue(itemgetter(1), itemgetter(0))

    store.insert((0, host))
    found = store.find_le(10)
    assert found[1] == host

    # Same host again with different numbers: the sizes asserted at the
    # end must still be 1.
    for number in (10, 20):
        store.insert((number, host))

    found = store.find_le(20)
    assert found[1] == host

    assert len(store) == 1
    assert len(store.keys) == 1
    assert len(store.values) == 1
def test_remove():
    """Removing an entry must empty the store, its keys and its values."""
    host = 'en.wikipedia.org'
    store = SortedKeyValue(itemgetter(1), itemgetter(0))

    def assert_empty():
        # All three views of the store are expected to report zero entries.
        assert len(store) == 0
        assert len(store.keys) == 0
        assert len(store.values) == 0

    # Remove with exactly the pair that was inserted.
    store.insert((0, host))
    store.remove((0, host))
    assert_empty()

    # Remove with a pair whose first element differs from the inserted
    # one; the store is still expected to end up empty.
    store.insert((20, host))
    store.remove(('whatever', host))
    assert_empty()
class URLFrontier(object):
    """Set of known URLs, bucketed by hostname, with a per-host schedule.

    ``urls`` holds every URL ever passed to ``add`` (including ignored
    ones), ``buckets`` maps a hostname to the URLs still waiting to be
    handed out, and ``hosts`` stores ``(time, hostname)`` pairs in a
    ``SortedKeyValue`` keyed by hostname.
    """

    # Seconds added to the current time by ``get_waittime``.
    wait_seconds = 10

    def __init__(self, stats, ignore_url=ignore_url):
        """Create an empty frontier.

        Args:
            stats: Object supporting item assignment; ``add`` writes the
                ``'URLs frontier'`` and ``'hostnames'`` counters into it.
            ignore_url: Callable taking a URL and returning a truthy value
                when that URL must be skipped. ``None`` disables filtering.
        """
        self.stats = stats
        if ignore_url is None:
            def ignore_url(url):
                return False
        self.ignore_url = ignore_url
        # Every URL seen so far, used to de-duplicate later ``add`` calls.
        self.urls = set()
        # hostname -> set of URLs not yet returned by ``pop``.
        self.buckets = defaultdict(set)
        # (time, hostname) pairs; hostname is the key, time the value.
        self.hosts = SortedKeyValue(key=itemgetter(1), value=itemgetter(0))

    def get_waittime(self, domain):
        """Return the current Unix time in whole seconds plus ``wait_seconds``.

        ``domain`` is accepted but not used: every host gets the same delay.
        """
        return int(time.time()) + self.wait_seconds

    def add(self, origin, urls=None):
        """Register ``urls`` found on ``origin`` and reschedule its host.

        Args:
            origin: URL of the page the links came from.
            urls: A single URL, or a set/frozenset/list/tuple of URLs.
                When empty or ``None``, ``origin`` itself is added.

        Returns:
            The time computed by ``get_waittime`` for the origin's host.
        """
        # Normalise a missing hostname to '' exactly as done for each URL
        # below, so ``hosts`` and ``buckets`` share one key space.
        origin_host = urlparse(origin).hostname or ''
        waittime = self.get_waittime(origin_host)

        if not urls:
            urls = {origin}
        elif isinstance(urls, (list, tuple, frozenset)):
            # Previously these were wrapped as one element, which raised
            # TypeError for lists and stored the container itself otherwise.
            urls = set(urls)
        elif not isinstance(urls, set):
            # Anything else (e.g. a single URL string) is one URL.
            urls = {urls}

        new_urls = urls - self.urls

        if not self.ignore_url(origin):
            self.hosts.insert((waittime, origin_host))

        for url in new_urls:
            hostname = urlparse(url).hostname or ''
            if self.ignore_url(url):
                continue
            self.buckets[hostname].add(url)
            if hostname not in self.hosts:
                # Time 0 is in the past, so an unseen host is not delayed.
                self.hosts.insert((0, hostname))

        # Ignored URLs are recorded too, so they are not re-examined.
        self.urls.update(new_urls)
        self.stats['URLs frontier'] = len(self.urls)
        self.stats['hostnames'] = len(self.hosts)
        return waittime

    def pop(self):
        """Return one queued URL, or ``None`` when ``find_le`` finds no host.

        The chosen host is rescheduled via ``get_waittime`` before a URL is
        taken from its bucket. A host whose bucket is empty is dropped from
        both ``buckets`` and ``hosts`` and the search continues.
        """
        url = None
        while url is None:
            try:
                # NOTE(review): presumably selects a host whose stored time
                # is <= now and raises ValueError if none is; confirm
                # against SortedKeyValue.
                _, hostname = self.hosts.find_le(time.time())
            except ValueError:
                return None
            self.hosts.insert((self.get_waittime(hostname), hostname))
            try:
                url = self.buckets[hostname].pop()
            except KeyError:
                # Empty bucket (the defaultdict created it on access if it
                # was missing): forget the host entirely.
                del self.buckets[hostname]
                # NOTE(review): the 0 looks like a placeholder; removal
                # appears to match on hostname only — confirm.
                self.hosts.remove((0, hostname))
        return url