def __init__(self, manager):
    """Set up the parent backend, then buffer its feed with limits taken from settings."""
    super(MemoryDFSOverusedBackend, self).__init__(manager)
    opts = manager.settings
    parent_fetch = super(MemoryDFSOverusedBackend, self).get_next_requests
    self.overused_buffer = OverusedBuffer(parent_fetch,
                                          opts.get("OVERUSED_MAX_QUEUE_SIZE"),
                                          opts.get("OVERUSED_MAX_KEYS"))
def __init__(self, manager):
    """Connect this backend to the message bus.

    Loads the bus and codec classes named in settings, creates a spider-log
    producer and a spider-feed consumer bound to this spider's partition, and
    wraps ``_get_next_requests`` in an OverusedBuffer with per-key/keys limits.

    :param manager: frontier manager providing settings and request/response models.
    :raises ValueError: if SPIDER_PARTITION_ID is outside [0, SPIDER_FEED_PARTITIONS).
    """
    settings = manager.settings
    messagebus = load_object(settings.get('MESSAGE_BUS'))
    self.mb = messagebus(settings)
    # Encoder/Decoder classes are resolved from the configured codec module path.
    codec_path = settings.get('MESSAGE_BUS_CODEC')
    encoder_cls = load_object(codec_path + ".Encoder")
    decoder_cls = load_object(codec_path + ".Decoder")
    store_content = settings.get('STORE_CONTENT')
    # send_body decides whether response bodies are shipped over the bus.
    self._encoder = encoder_cls(manager.request_model, send_body=store_content)
    self._decoder = decoder_cls(manager.request_model, manager.response_model)
    self.spider_log_producer = self.mb.spider_log().producer()
    spider_feed = self.mb.spider_feed()
    self.partition_id = int(settings.get('SPIDER_PARTITION_ID'))
    if self.partition_id < 0 or self.partition_id >= settings.get('SPIDER_FEED_PARTITIONS'):
        raise ValueError("Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS.")
    self.consumer = spider_feed.consumer(partition_id=self.partition_id)
    self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
    self._logger = logging.getLogger("messagebus-backend")
    # Buffer holds back requests whose key is currently overused.
    self._buffer = OverusedBuffer(self._get_next_requests,
                                  max_per_key=settings.get('OVERUSED_MAX_PER_KEY'),
                                  keep_per_key=settings.get("OVERUSED_KEEP_PER_KEY"),
                                  max_keys=settings.get('OVERUSED_MAX_KEYS'),
                                  keep_keys=settings.get('OVERUSED_KEEP_KEYS'))
    self._logger.info("Consuming from partition id %d", self.partition_id)
def test_purging_keys_set(self):
    """Key count grows by 10 per fetch until the third fetch purges the keys set."""
    self.generate_requests()
    self.req_it = cycle(self.requests)
    buf = OverusedBuffer(self.get_once, 1000, 100, 10, 1)
    for expected_keys in (10, 20):
        buf.get_next_requests(10, overused_keys=self.hosts, key_type="domain")
        assert buf._get_key_count() == expected_keys
    buf.get_next_requests(10, overused_keys=self.hosts, key_type="domain")
    # third call exceeds the limit and triggers purging of the keys set
    assert buf._get_key_count() < 20
def __init__(self, manager):
    """Initialize the message-bus backend from the manager's settings.

    Builds an encoder/decoder pair for the request/response models, a
    spider-log producer, and a spider-feed consumer for this partition.
    """
    self._manager = manager
    # Copies attributes into a fresh Settings instead of sharing the manager's object.
    settings = Settings(attributes=manager.settings.attributes)
    messagebus = load_object(settings.get('MESSAGE_BUS'))
    self.mb = messagebus(settings)
    store_content = settings.get('STORE_CONTENT')
    self._encoder = Encoder(manager.request_model, send_body=store_content)
    self._decoder = Decoder(manager.request_model, manager.response_model)
    self.spider_log_producer = self.mb.spider_log().producer()
    spider_feed = self.mb.spider_feed()
    # NOTE(review): no int() coercion or range validation on the partition id here — confirm intended.
    self.partition_id = settings.get('SPIDER_PARTITION_ID')
    self.consumer = spider_feed.consumer(partition_id=self.partition_id)
    self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT', 5.0))
    # NOTE(review): manager.logger.manager.debug looks unusual — verify this is the intended debug callable.
    self._buffer = OverusedBuffer(self._get_next_requests, manager.logger.manager.debug)
class MemoryDFSOverusedBackend(MemoryDFSBackend):
    """Memory DFS backend that defers requests for overused slots via an OverusedBuffer."""

    def __init__(self, manager):
        super(MemoryDFSOverusedBackend, self).__init__(manager)
        parent_fetch = super(MemoryDFSOverusedBackend, self).get_next_requests
        self.overused_buffer = OverusedBuffer(parent_fetch)

    def get_next_requests(self, max_next_requests, **kwargs):
        """Serve requests through the buffer so overused keys are held back."""
        return self.overused_buffer.get_next_requests(max_next_requests, **kwargs)
def test(self):
    """Exercise OverusedBuffer request filtering and its debug-log output."""
    ob = OverusedBuffer(self.get_func, self.log_func)
    self.requests = [r1, r2, r3, r4, r5, r6]
    # Only requests whose key is not overused come back; the rest become pending.
    assert set(ob.get_next_requests(10, overused_keys=['www.example.com', 'example1.com'], key_type='domain')) == set([r4, r5])
    assert set(self.logs) == set(["Overused keys: ['www.example.com', 'example1.com']", "Pending: 0"])
    self.logs = []
    assert ob.get_next_requests(10, overused_keys=['www.example.com'], key_type='domain') == [r6]
    assert set(self.logs) == set(["Overused keys: ['www.example.com']", "Pending: 4"])
    self.logs = []
    # Everything left pending belongs to the still-overused key, so nothing is returned.
    assert ob.get_next_requests(10, overused_keys=['www.example.com'], key_type='domain') == []
    assert set(self.logs) == set(["Overused keys: ['www.example.com']", "Pending: 3"])
    self.logs = []
    #the max_next_requests is 3 here to cover the "len(requests) == max_next_requests" case.
    assert set(ob.get_next_requests(3, overused_keys=['example.com'], key_type='domain')) == set([r1, r2, r3])
    assert set(self.logs) == set(["Overused keys: ['example.com']", "Pending: 3"])
    self.logs = []
    # No overused keys and nothing pending: empty result.
    assert ob.get_next_requests(10, overused_keys=[], key_type='domain') == []
    assert set(self.logs) == set(["Overused keys: []", "Pending: 0"])
def test_purging_keys(self):
    """Pending pool shrinks when a per-key queue is purged on the second fetch."""
    self.req_it = cycle(self.requests)
    buf = OverusedBuffer(self.get_once, 10, 100)
    blocked = ["example.com", "www.example.com"]
    buf.get_next_requests(10, overused_keys=blocked, key_type="domain")
    assert buf._get_pending_count() == 9
    buf.get_next_requests(10, overused_keys=blocked, key_type="domain")
    # the second call purges the www.example.com queue
    assert buf._get_pending_count() == 7
class KafkaOverusedBackend(KafkaBackend):
    """Kafka backend variant that postpones requests targeting overused slots."""

    component_name = 'Kafka Backend taking into account overused slots'

    def __init__(self, manager):
        super(KafkaOverusedBackend, self).__init__(manager)
        parent_fetch = super(KafkaOverusedBackend, self).get_next_requests
        self._buffer = OverusedBuffer(parent_fetch, manager.logger.manager.debug)

    def get_next_requests(self, max_n_requests, **kwargs):
        """Delegate to the buffer, which filters out requests for overused keys."""
        return self._buffer.get_next_requests(max_n_requests, **kwargs)
def test_purging_keys(self):
    """With tight per-key limits, repeated fetches purge one key's queue."""
    self.req_it = cycle(self.requests)
    ob = OverusedBuffer(self.get_once, 10, 1, 100, 10)
    overused = ["example.com", "www.example.com"]
    # first call leaves 9 pending; the second purges www.example.com down to 7
    for expected_pending in (9, 7):
        ob.get_next_requests(10, overused_keys=overused, key_type="domain")
        assert ob._get_pending_count() == expected_pending
class MemoryDFSOverusedBackend(MemoryDFSBackend):
    """Memory DFS backend with overuse-aware buffering of outgoing requests."""

    def __init__(self, manager):
        super(MemoryDFSOverusedBackend, self).__init__(manager)
        opts = manager.settings
        upstream = super(MemoryDFSOverusedBackend, self).get_next_requests
        # buffer limits come straight from the frontier settings
        self.overused_buffer = OverusedBuffer(upstream,
                                              opts.get("OVERUSED_MAX_QUEUE_SIZE"),
                                              opts.get("OVERUSED_MAX_KEYS"))

    def get_next_requests(self, max_next_requests, **kwargs):
        """Fetch via the buffer so requests for overused keys are deferred."""
        return self.overused_buffer.get_next_requests(max_next_requests, **kwargs)
def __init__(self, manager):
    """Wire the backend to the message bus: codecs, log producer, feed consumer.

    :param manager: frontier manager providing settings and request/response models.
    :raises ValueError: if SPIDER_PARTITION_ID is outside [0, SPIDER_FEED_PARTITIONS).
    """
    settings = manager.settings
    messagebus = load_object(settings.get('MESSAGE_BUS'))
    self.mb = messagebus(settings)
    store_content = settings.get('STORE_CONTENT')
    # send_body controls whether response bodies travel over the bus.
    self._encoder = Encoder(manager.request_model, send_body=store_content)
    self._decoder = Decoder(manager.request_model, manager.response_model)
    self.spider_log_producer = self.mb.spider_log().producer()
    spider_feed = self.mb.spider_feed()
    self.partition_id = int(settings.get('SPIDER_PARTITION_ID'))
    if self.partition_id < 0 or self.partition_id >= settings.get('SPIDER_FEED_PARTITIONS'):
        raise ValueError("Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS.")
    self.consumer = spider_feed.consumer(partition_id=self.partition_id)
    self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
    self._logger = logging.getLogger("messagebus-backend")
    # OverusedBuffer reports its internals through the backend's debug log.
    self._buffer = OverusedBuffer(self._get_next_requests, self._logger.debug)
    self._logger.info("Consuming from partition id %d", self.partition_id)
def __init__(self, manager):
    """Connect to the message bus and prepare the producer, consumer and overuse buffer."""
    conf = manager.settings
    bus_cls = load_object(conf.get('MESSAGE_BUS'))
    self.mb = bus_cls(conf)
    self._encoder = Encoder(manager.request_model,
                            send_body=conf.get('STORE_CONTENT'))
    self._decoder = Decoder(manager.request_model, manager.response_model)
    self.spider_log_producer = self.mb.spider_log().producer()
    feed = self.mb.spider_feed()
    self.partition_id = int(conf.get('SPIDER_PARTITION_ID'))
    # reject partition ids outside the configured partition range
    if not 0 <= self.partition_id < conf.get('SPIDER_FEED_PARTITIONS'):
        raise ValueError("Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS.")
    self.consumer = feed.consumer(partition_id=self.partition_id)
    self._get_timeout = float(conf.get('KAFKA_GET_TIMEOUT'))
    self._logger = logging.getLogger("messagebus-backend")
    self._buffer = OverusedBuffer(self._get_next_requests, self._logger.debug)
    self._logger.info("Consuming from partition id %d", self.partition_id)
def __init__(self, manager):
    """Connect to the message bus using codec classes loaded from settings.

    :param manager: frontier manager providing settings and request/response models.
    :raises ValueError: if SPIDER_PARTITION_ID is outside [0, SPIDER_FEED_PARTITIONS).
    """
    settings = manager.settings
    messagebus = load_object(settings.get('MESSAGE_BUS'))
    self.mb = messagebus(settings)
    # Encoder/Decoder are resolved from the configured codec module path.
    codec_path = settings.get('MESSAGE_BUS_CODEC')
    encoder_cls = load_object(codec_path+".Encoder")
    decoder_cls = load_object(codec_path+".Decoder")
    store_content = settings.get('STORE_CONTENT')
    self._encoder = encoder_cls(manager.request_model, send_body=store_content)
    self._decoder = decoder_cls(manager.request_model, manager.response_model)
    self.spider_log_producer = self.mb.spider_log().producer()
    spider_feed = self.mb.spider_feed()
    self.partition_id = int(settings.get('SPIDER_PARTITION_ID'))
    if self.partition_id < 0 or self.partition_id >= settings.get('SPIDER_FEED_PARTITIONS'):
        raise ValueError("Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS.")
    self.consumer = spider_feed.consumer(partition_id=self.partition_id)
    self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
    self._logger = logging.getLogger("messagebus-backend")
    # Buffer holds back requests whose key is currently overused; limits from settings.
    self._buffer = OverusedBuffer(self._get_next_requests,
                                  max_per_key=settings.get('OVERUSED_MAX_PER_KEY'),
                                  keep_per_key=settings.get("OVERUSED_KEEP_PER_KEY"),
                                  max_keys=settings.get('OVERUSED_MAX_KEYS'),
                                  keep_keys=settings.get('OVERUSED_KEEP_KEYS'))
    self._logger.info("Consuming from partition id %d", self.partition_id)
class MessageBusBackend(Backend):
    """Spider-side backend that exchanges frontier data over a message bus.

    Requests are consumed from the spider feed (one partition per spider) and
    crawl events are published to the spider log. Outgoing requests pass
    through an OverusedBuffer so overused hosts are held back.
    """

    def __init__(self, manager):
        settings = manager.settings
        messagebus = load_object(settings.get('MESSAGE_BUS'))
        self.mb = messagebus(settings)
        # Codec classes are loaded from the configured module path.
        codec_path = settings.get('MESSAGE_BUS_CODEC')
        encoder_cls = load_object(codec_path + ".Encoder")
        decoder_cls = load_object(codec_path + ".Decoder")
        store_content = settings.get('STORE_CONTENT')
        self._encoder = encoder_cls(manager.request_model, send_body=store_content)
        self._decoder = decoder_cls(manager.request_model, manager.response_model)
        self.spider_log_producer = self.mb.spider_log().producer()
        spider_feed = self.mb.spider_feed()
        self.partition_id = int(settings.get('SPIDER_PARTITION_ID'))
        if self.partition_id < 0 or self.partition_id >= settings.get('SPIDER_FEED_PARTITIONS'):
            raise ValueError("Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS.")
        self.consumer = spider_feed.consumer(partition_id=self.partition_id)
        self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
        self._logger = logging.getLogger("messagebus-backend")
        self._buffer = OverusedBuffer(self._get_next_requests,
                                      max_per_key=settings.get('OVERUSED_MAX_PER_KEY'),
                                      keep_per_key=settings.get("OVERUSED_KEEP_PER_KEY"),
                                      max_keys=settings.get('OVERUSED_MAX_KEYS'),
                                      keep_keys=settings.get('OVERUSED_KEEP_KEYS'))
        self._logger.info("Consuming from partition id %d", self.partition_id)

    @classmethod
    def from_manager(cls, manager):
        """Standard frontier factory hook."""
        return cls(manager)

    def frontier_start(self):
        pass

    def frontier_stop(self):
        # Flush pending spider-log messages and release the feed consumer.
        self.spider_log_producer.flush()
        self.consumer.close()

    def add_seeds(self, seeds):
        raise NotImplementedError("The seeds addition using spider log isn't allowed")

    def page_crawled(self, response):
        """Publish a page-crawled event keyed by the response's host fingerprint."""
        host_fprint = get_host_fprint(response)
        self.spider_log_producer.send(host_fprint, self._encoder.encode_page_crawled(response))

    def links_extracted(self, request, links):
        """Publish extracted links grouped per host, one message per host."""
        per_host = aggregate_per_host(links)
        for host_fprint, host_links in six.iteritems(per_host):
            self.spider_log_producer.send(host_fprint,
                                          self._encoder.encode_links_extracted(request, host_links))

    def request_error(self, page, error):
        host_fprint = get_host_fprint(page)
        self.spider_log_producer.send(host_fprint, self._encoder.encode_request_error(page, error))

    def _get_next_requests(self, max_n_requests, **kwargs):
        """Pull up to ``max_n_requests`` from the feed; undecodable messages are logged and skipped.

        Also publishes the consumer's current offset for this partition to the
        spider log under a fixed key.
        """
        requests = []
        for encoded in self.consumer.get_messages(count=max_n_requests, timeout=self._get_timeout):
            try:
                request = self._decoder.decode_request(encoded)
            except Exception as exc:
                self._logger.warning("Could not decode message: {0}, error {1}".format(encoded, str(exc)))
            else:
                requests.append(request)
        # NOTE(review): fixed 41-char key — presumably routes all offset messages together; confirm.
        self.spider_log_producer.send(b'0123456789abcdef0123456789abcdef012345678',
                                      self._encoder.encode_offset(self.partition_id,
                                                                  self.consumer.get_offset(self.partition_id)))
        return requests

    def get_next_requests(self, max_n_requests, **kwargs):
        """Serve requests through the overuse buffer."""
        return self._buffer.get_next_requests(max_n_requests, **kwargs)

    def finished(self):
        # The spider-side backend never decides termination.
        return False

    @property
    def metadata(self):
        return None

    @property
    def queue(self):
        return None

    @property
    def states(self):
        return None
class MessageBusBackend(Backend):
    """Spider-side backend that talks to the frontier over a message bus.

    Consumes requests from the spider-feed partition assigned to this spider
    and publishes crawl events to the spider log; outgoing requests are
    filtered through an OverusedBuffer.
    """

    def __init__(self, manager):
        self._manager = manager
        # Copy attributes into a fresh Settings instead of sharing the manager's object.
        settings = Settings(attributes=manager.settings.attributes)
        messagebus = load_object(settings.get('MESSAGE_BUS'))
        self.mb = messagebus(settings)
        store_content = settings.get('STORE_CONTENT')
        self._encoder = Encoder(manager.request_model, send_body=store_content)
        self._decoder = Decoder(manager.request_model, manager.response_model)
        self.spider_log_producer = self.mb.spider_log().producer()
        spider_feed = self.mb.spider_feed()
        self.partition_id = settings.get('SPIDER_PARTITION_ID')
        self.consumer = spider_feed.consumer(partition_id=self.partition_id)
        self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT', 5.0))
        self._buffer = OverusedBuffer(self._get_next_requests, manager.logger.manager.debug)

    @classmethod
    def from_manager(cls, manager):
        """Standard frontier factory hook (first parameter renamed to idiomatic ``cls``)."""
        return cls(manager)

    def frontier_start(self):
        pass

    def frontier_stop(self):
        # Deliver any buffered spider-log messages before shutdown.
        self.spider_log_producer.flush()

    def add_seeds(self, seeds):
        """Publish seeds to the spider log, keyed by the first seed's fingerprint."""
        self.spider_log_producer.send(seeds[0].meta['fingerprint'],
                                      self._encoder.encode_add_seeds(seeds))

    def page_crawled(self, response, links):
        self.spider_log_producer.send(response.meta['fingerprint'],
                                      self._encoder.encode_page_crawled(response, links))

    def request_error(self, page, error):
        self.spider_log_producer.send(page.meta['fingerprint'],
                                      self._encoder.encode_request_error(page, error))

    def _get_next_requests(self, max_n_requests, **kwargs):
        """Fetch up to ``max_n_requests`` from the feed, skipping undecodable messages."""
        requests = []
        for encoded in self.consumer.get_messages(count=max_n_requests, timeout=self._get_timeout):
            # Keep the try body minimal: only the decode can raise here.
            try:
                request = self._decoder.decode_request(encoded)
            except ValueError:
                self._manager.logger.backend.warning("Could not decode message: {0}".format(encoded))
            else:
                requests.append(request)
        self.spider_log_producer.send('0123456789abcdef0123456789abcdef012345678',
                                      self._encoder.encode_offset(self.partition_id,
                                                                  self.consumer.get_offset()))
        return requests

    def get_next_requests(self, max_n_requests, **kwargs):
        """Serve requests through the overuse buffer."""
        return self._buffer.get_next_requests(max_n_requests, **kwargs)

    def finished(self):
        # The spider-side backend never decides termination.
        return False

    @property
    def metadata(self):
        return None

    @property
    def queue(self):
        return None

    @property
    def states(self):
        return None
def test_purging_keys_set(self):
    """The tracked key set grows by 10 per call until purging kicks in on call three."""
    self.generate_requests()
    self.req_it = cycle(self.requests)
    buf = OverusedBuffer(self.get_once, 1000, 10)
    buf.get_next_requests(10, overused_keys=self.hosts, key_type="domain")
    assert buf._get_key_count() == 10
    buf.get_next_requests(10, overused_keys=self.hosts, key_type="domain")
    assert buf._get_key_count() == 20
    buf.get_next_requests(10, overused_keys=self.hosts, key_type="domain")
    # third call exceeds the limit and purges part of the keys set
    assert buf._get_key_count() < 20
def test_base(self):
    """Requests for overused keys stay pending until their keys stop being overused."""
    self.req_it = iter(self.requests)
    buf = OverusedBuffer(self.get_once, 100, 10000)
    assert buf._get_pending_count() == 0
    first = buf.get_next_requests(10, overused_keys=['www.example.com', 'example1.com'],
                                  key_type='domain')
    assert set(first) == {r4, r5}
    assert buf._get_pending_count() == 4
    assert buf.get_next_requests(10, overused_keys=['www.example.com'], key_type='domain') == [r6]
    assert buf._get_pending_count() == 3
    assert buf.get_next_requests(10, overused_keys=['www.example.com'], key_type='domain') == []
    assert buf._get_pending_count() == 3
    # max_next_requests is 3 here to cover the "len(requests) == max_next_requests" case
    assert set(buf.get_next_requests(3, overused_keys=['example.com'], key_type='domain')) == {r1, r2, r3}
    assert buf._get_pending_count() == 0
    assert buf.get_next_requests(10, overused_keys=[], key_type='domain') == []
    assert buf._get_pending_count() == 0
def __init__(self, manager):
    """Set up the parent Kafka backend and wrap its fetch in an OverusedBuffer."""
    super(KafkaOverusedBackend, self).__init__(manager)
    fetch = super(KafkaOverusedBackend, self).get_next_requests
    self._buffer = OverusedBuffer(fetch, manager.logger.manager.debug)
class MessageBusBackend(Backend):
    """Spider-side backend exchanging frontier data over a message bus.

    Consumes requests from this spider's feed partition and publishes crawl
    events (seeds, crawled pages, errors) to the spider log. Outgoing
    requests are filtered through an OverusedBuffer.
    """

    def __init__(self, manager):
        settings = manager.settings
        messagebus = load_object(settings.get('MESSAGE_BUS'))
        self.mb = messagebus(settings)
        store_content = settings.get('STORE_CONTENT')
        # send_body decides whether response bodies travel over the bus.
        self._encoder = Encoder(manager.request_model, send_body=store_content)
        self._decoder = Decoder(manager.request_model, manager.response_model)
        self.spider_log_producer = self.mb.spider_log().producer()
        spider_feed = self.mb.spider_feed()
        self.partition_id = int(settings.get('SPIDER_PARTITION_ID'))
        if self.partition_id < 0 or self.partition_id >= settings.get('SPIDER_FEED_PARTITIONS'):
            raise ValueError("Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS.")
        self.consumer = spider_feed.consumer(partition_id=self.partition_id)
        self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
        self._logger = logging.getLogger("messagebus-backend")
        self._buffer = OverusedBuffer(self._get_next_requests, self._logger.debug)
        self._logger.info("Consuming from partition id %d", self.partition_id)

    @classmethod
    def from_manager(cls, manager):
        """Standard frontier factory hook (first parameter renamed to idiomatic ``cls``)."""
        return cls(manager)

    def frontier_start(self):
        pass

    def frontier_stop(self):
        # Deliver any buffered spider-log messages before shutdown.
        self.spider_log_producer.flush()

    def add_seeds(self, seeds):
        """Publish seeds to the spider log, keyed by the first seed's fingerprint."""
        self.spider_log_producer.send(seeds[0].meta['fingerprint'],
                                      self._encoder.encode_add_seeds(seeds))

    def page_crawled(self, response, links):
        self.spider_log_producer.send(response.meta['fingerprint'],
                                      self._encoder.encode_page_crawled(response, links))

    def request_error(self, page, error):
        self.spider_log_producer.send(page.meta['fingerprint'],
                                      self._encoder.encode_request_error(page, error))

    def _get_next_requests(self, max_n_requests, **kwargs):
        """Pull up to ``max_n_requests`` from the feed; log and skip undecodable messages."""
        requests = []
        for encoded in self.consumer.get_messages(count=max_n_requests, timeout=self._get_timeout):
            try:
                request = self._decoder.decode_request(encoded)
            except Exception as exc:
                self._logger.warning("Could not decode message: {0}, error {1}".format(encoded, str(exc)))
            else:
                requests.append(request)
        self.spider_log_producer.send('0123456789abcdef0123456789abcdef012345678',
                                      self._encoder.encode_offset(self.partition_id,
                                                                  self.consumer.get_offset()))
        return requests

    def get_next_requests(self, max_n_requests, **kwargs):
        """Serve requests through the overuse buffer."""
        return self._buffer.get_next_requests(max_n_requests, **kwargs)

    def finished(self):
        # The spider-side backend never decides termination.
        return False

    @property
    def metadata(self):
        return None

    @property
    def queue(self):
        return None

    @property
    def states(self):
        return None
class MessageBusBackend(Backend):
    """Backend that delegates frontier storage to remote workers via a message bus.

    Pulls requests from the spider-feed partition assigned to this spider and
    pushes crawl events to the spider log; an OverusedBuffer defers requests
    for overused hosts.
    """

    def __init__(self, manager):
        self._manager = manager
        # Copy attributes into a fresh Settings instead of sharing the manager's object.
        settings = Settings(attributes=manager.settings.attributes)
        messagebus = load_object(settings.get('MESSAGE_BUS'))
        self.mb = messagebus(settings)
        store_content = settings.get('STORE_CONTENT')
        self._encoder = Encoder(manager.request_model, send_body=store_content)
        self._decoder = Decoder(manager.request_model, manager.response_model)
        self.spider_log_producer = self.mb.spider_log().producer()
        spider_feed = self.mb.spider_feed()
        self.partition_id = settings.get('SPIDER_PARTITION_ID')
        self.consumer = spider_feed.consumer(partition_id=self.partition_id)
        self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT', 5.0))
        self._buffer = OverusedBuffer(self._get_next_requests, manager.logger.manager.debug)

    @classmethod
    def from_manager(cls, manager):
        """Standard frontier factory hook (first parameter renamed to idiomatic ``cls``)."""
        return cls(manager)

    def frontier_start(self):
        pass

    def frontier_stop(self):
        # Deliver any buffered spider-log messages before shutdown.
        self.spider_log_producer.flush()

    def add_seeds(self, seeds):
        """Publish seeds to the spider log, keyed by the first seed's fingerprint."""
        self.spider_log_producer.send(seeds[0].meta['fingerprint'],
                                      self._encoder.encode_add_seeds(seeds))

    def page_crawled(self, response, links):
        self.spider_log_producer.send(response.meta['fingerprint'],
                                      self._encoder.encode_page_crawled(response, links))

    def request_error(self, page, error):
        self.spider_log_producer.send(page.meta['fingerprint'],
                                      self._encoder.encode_request_error(page, error))

    def _get_next_requests(self, max_n_requests, **kwargs):
        """Fetch up to ``max_n_requests`` from the feed, skipping undecodable messages."""
        requests = []
        for encoded in self.consumer.get_messages(count=max_n_requests, timeout=self._get_timeout):
            # Keep the try body minimal: only the decode can raise here.
            try:
                request = self._decoder.decode_request(encoded)
            except ValueError:
                self._manager.logger.backend.warning("Could not decode message: {0}".format(encoded))
            else:
                requests.append(request)
        self.spider_log_producer.send('0123456789abcdef0123456789abcdef012345678',
                                      self._encoder.encode_offset(self.partition_id,
                                                                  self.consumer.get_offset()))
        return requests

    def get_next_requests(self, max_n_requests, **kwargs):
        """Serve requests through the overuse buffer."""
        return self._buffer.get_next_requests(max_n_requests, **kwargs)

    def finished(self):
        # The spider-side backend never decides termination.
        return False

    @property
    def metadata(self):
        return None

    @property
    def queue(self):
        return None

    @property
    def states(self):
        return None
def __init__(self, manager):
    """Initialize the Kafka backend and the buffer that holds back overused-slot requests."""
    super(KafkaOverusedBackend, self).__init__(manager)
    upstream = super(KafkaOverusedBackend, self).get_next_requests
    debug_log = manager.logger.manager.debug
    self._buffer = OverusedBuffer(upstream, debug_log)
def __init__(self, manager):
    """Initialize the parent backend, then buffer its request feed for overuse handling."""
    super(MemoryDFSOverusedBackend, self).__init__(manager)
    parent_fetch = super(MemoryDFSOverusedBackend, self).get_next_requests
    self.overused_buffer = OverusedBuffer(parent_fetch)
class MessageBusBackend(Backend):
    """Spider-side backend that exchanges frontier data over a message bus.

    Requests are consumed from the spider feed (one partition per spider) and
    crawl events are published to the spider log; outgoing requests pass
    through an OverusedBuffer so overused hosts are held back.
    """

    def __init__(self, manager):
        settings = manager.settings
        messagebus = load_object(settings.get('MESSAGE_BUS'))
        self.mb = messagebus(settings)
        # Codec classes are loaded from the configured module path.
        codec_path = settings.get('MESSAGE_BUS_CODEC')
        encoder_cls = load_object(codec_path + ".Encoder")
        decoder_cls = load_object(codec_path + ".Decoder")
        store_content = settings.get('STORE_CONTENT')
        self._encoder = encoder_cls(manager.request_model, send_body=store_content)
        self._decoder = decoder_cls(manager.request_model, manager.response_model)
        self.spider_log_producer = self.mb.spider_log().producer()
        spider_feed = self.mb.spider_feed()
        self.partition_id = int(settings.get('SPIDER_PARTITION_ID'))
        if self.partition_id < 0 or self.partition_id >= settings.get('SPIDER_FEED_PARTITIONS'):
            raise ValueError("Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS.")
        self.consumer = spider_feed.consumer(partition_id=self.partition_id)
        self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
        self._logger = logging.getLogger("messagebus-backend")
        self._buffer = OverusedBuffer(self._get_next_requests,
                                      max_per_key=settings.get('OVERUSED_MAX_PER_KEY'),
                                      keep_per_key=settings.get("OVERUSED_KEEP_PER_KEY"),
                                      max_keys=settings.get('OVERUSED_MAX_KEYS'),
                                      keep_keys=settings.get('OVERUSED_KEEP_KEYS'))
        self._logger.info("Consuming from partition id %d", self.partition_id)

    @classmethod
    def from_manager(cls, manager):
        """Standard frontier factory hook."""
        return cls(manager)

    def frontier_start(self):
        pass

    def frontier_stop(self):
        # Flush pending spider-log messages and release the feed consumer.
        self.spider_log_producer.flush()
        self.consumer.close()

    def add_seeds(self, seeds):
        # BUG FIX: the original raised `NotImplemented(...)` — that is the comparison
        # sentinel, not an exception; calling it produces a TypeError instead of the
        # intended NotImplementedError.
        raise NotImplementedError("The seeds addition using spider log isn't allowed")

    def page_crawled(self, response):
        """Publish a page-crawled event keyed by the response's host fingerprint."""
        host_fprint = get_host_fprint(response)
        self.spider_log_producer.send(host_fprint,
                                      self._encoder.encode_page_crawled(response))

    def links_extracted(self, request, links):
        """Publish extracted links grouped per host, one message per host."""
        per_host = aggregate_per_host(links)
        for host_fprint, host_links in six.iteritems(per_host):
            self.spider_log_producer.send(host_fprint,
                                          self._encoder.encode_links_extracted(request, host_links))

    def request_error(self, page, error):
        host_fprint = get_host_fprint(page)
        self.spider_log_producer.send(host_fprint,
                                      self._encoder.encode_request_error(page, error))

    def _get_next_requests(self, max_n_requests, **kwargs):
        """Pull up to ``max_n_requests`` from the feed; log and skip undecodable messages.

        Also publishes the consumer's current offset for this partition to the
        spider log under a fixed key.
        """
        requests = []
        for encoded in self.consumer.get_messages(count=max_n_requests, timeout=self._get_timeout):
            try:
                request = self._decoder.decode_request(encoded)
            except Exception as exc:
                self._logger.warning("Could not decode message: {0}, error {1}".format(encoded, str(exc)))
            else:
                requests.append(request)
        self.spider_log_producer.send(b'0123456789abcdef0123456789abcdef012345678',
                                      self._encoder.encode_offset(self.partition_id,
                                                                  self.consumer.get_offset(self.partition_id)))
        return requests

    def get_next_requests(self, max_n_requests, **kwargs):
        """Serve requests through the overuse buffer."""
        return self._buffer.get_next_requests(max_n_requests, **kwargs)

    def finished(self):
        # The spider-side backend never decides termination.
        return False

    @property
    def metadata(self):
        return None

    @property
    def queue(self):
        return None

    @property
    def states(self):
        return None
def __init__(self, manager):
    """Build the parent backend and route its feed through an OverusedBuffer."""
    super(MemoryDFSOverusedBackend, self).__init__(manager)
    upstream_fetch = super(MemoryDFSOverusedBackend, self).get_next_requests
    self.overused_buffer = OverusedBuffer(upstream_fetch)
def test_base(self):
    """End-to-end behavior of get_next_requests across shifting overused-key sets."""
    self.req_it = iter(self.requests)
    queue_buf = OverusedBuffer(self.get_once, None, 100, None, 100)
    assert queue_buf._get_pending_count() == 0
    served = queue_buf.get_next_requests(10, overused_keys=['www.example.com', 'example1.com'],
                                         key_type='domain')
    assert set(served) == {r4, r5}
    assert queue_buf._get_pending_count() == 4
    assert queue_buf.get_next_requests(10, overused_keys=['www.example.com'], key_type='domain') == [r6]
    assert queue_buf._get_pending_count() == 3
    assert queue_buf.get_next_requests(10, overused_keys=['www.example.com'], key_type='domain') == []
    assert queue_buf._get_pending_count() == 3
    # max_next_requests is 3 here to cover the "len(requests) == max_next_requests" case
    assert set(queue_buf.get_next_requests(3, overused_keys=['example.com'], key_type='domain')) == {r1, r2, r3}
    assert queue_buf._get_pending_count() == 0
    assert queue_buf.get_next_requests(10, overused_keys=[], key_type='domain') == []
    assert queue_buf._get_pending_count() == 0