def test_update_domain_queues(self):
        """Check that update_domain_queues() applies domain config to queues.

        Each scenario installs a single-domain config (window 50, hits 10)
        and one queue created with window 100 / limit 100, runs the update,
        and then asserts the queue's resulting ``window`` and ``limit``.
        """
        scenarios = (
            # (domain, queue key, scale, expected limit)
            # test without scale factor
            ("ex1.com", 'link:ex1.com:queue', 1, 10),
            # test with scale factor; it affects the limit only
            ("ex2.com", 'link:ex2.com:queue', 0.5, 5),
        )

        for domain, queue_key, scale, expected_limit in scenarios:
            self.scheduler.domain_config = {
                domain: {"window": 50, "hits": 10, "scale": scale},
            }
            throttled = RedisThrottledQueue(MagicMock(), MagicMock(),
                                            100, 100)
            self.scheduler.queue_dict = {queue_key: [throttled, 0]}

            self.scheduler.update_domain_queues()

            updated = self.scheduler.queue_dict[queue_key][0]
            self.assertEqual(updated.window, 50)
            self.assertEqual(updated.limit, expected_limit)
    def create_queues(self):
        '''
        Refresh the in-memory map of the spider's Redis queues.

        Every Redis key matching ``<spider name>:*:queue`` gets a throttled
        queue entry in ``self.queue_dict`` if it does not have one yet; when
        check_config() returns a truthy value every entry is rebuilt.
        '''
        # new config could have loaded between scrapes
        config_changed = self.check_config()

        self.queue_keys = self.redis_conn.keys(self.spider.name + ":*:queue")

        for key in self.queue_keys:
            # the throttle key is `[type:][ip:]domain`, depending on the
            # type and ip bools
            prefix_parts = []
            if self.add_type:
                prefix_parts.append(self.spider.name + ":")
            if self.add_ip:
                prefix_parts.append(self.my_ip + ":")

            # the tld is the middle field of the key `type:tld:queue`
            the_domain = re.split(':', key)[1]
            throttle_key = "".join(prefix_parts) + the_domain

            # nothing to do for a known queue while the config is unchanged
            if key in self.queue_dict and not config_changed:
                continue

            self.logger.debug(
                "Added new Throttled Queue {q}".format(q=key))
            priority_queue = RedisPriorityQueue(self.redis_conn, key,
                                                encoding=ujson)

            if the_domain in self.domain_config:
                # use custom window and hits
                settings = self.domain_config[the_domain]
                window = settings['window']
                hits = settings['hits']

                # adjust the crawl rate based on the scale if exists
                if 'scale' in settings:
                    hits = int(hits * self.fit_scale(settings['scale']))
            else:
                # use default window and hits
                window = self.window
                hits = self.hits

            # each entry is a two item list: use [0] to get the queue
            # object, use [1] to get the time it was created
            self.queue_dict[key] = [
                RedisThrottledQueue(self.redis_conn, priority_queue, window,
                                    hits, self.moderated, throttle_key,
                                    throttle_key, True),
                time.time(),
            ]
class TestModeratedElasticRedisThrottledQueue(TestCase):
    """Checks allowed() on a moderated queue created with elastic=True."""

    def setUp(self):
        # both collaborators are mocks; 4, 2 and True are passed
        # positionally and elastic mode is switched on by keyword
        redis_conn, backing_queue = MagicMock(), MagicMock()
        self.queue = RedisThrottledQueue(redis_conn, backing_queue, 4, 2,
                                         True, elastic=True)

    def test_moderated(self):
        throttled = self.queue

        # test elastic kick in hasn't happened yet
        throttled.is_moderated = MagicMock(return_value=True)
        throttled.elastic_kick_in = 0
        self.assertFalse(throttled.allowed())

        # kick in overrides, even though we were moderated
        throttled.elastic_kick_in = throttled.limit
        throttled.check_elastic = MagicMock(return_value=True)
        throttled.test_hits = MagicMock(return_value=True)
        self.assertTrue(throttled.allowed())
Пример #4
0
class TestModeratedRedisThrottledQueue(TestCase):
    """Exercises allowed() on a throttled queue created with moderation."""

    def setUp(self):
        # both collaborators are mocks; 4, 2 and True are passed positionally
        redis_conn, backing_queue = MagicMock(), MagicMock()
        self.queue = RedisThrottledQueue(redis_conn, backing_queue, 4, 2,
                                         True)

    def test_moderated(self):
        throttled = self.queue

        # a moderated queue should pop ~ every x seconds
        # (the window limit was already tested in the unmoderated test)
        throttled.is_moderated = MagicMock(return_value=True)
        self.assertFalse(throttled.allowed())

        # not moderated and the hit test reports success
        throttled.is_moderated = MagicMock(return_value=False)
        throttled.test_hits = MagicMock(return_value=True)
        self.assertTrue(throttled.allowed())

        # mock exception raised even with good moderation
        throttled.test_hits = MagicMock(side_effect=WatchError)
        self.assertFalse(throttled.allowed())
Пример #5
0
class TestModeratedRedisThrottledQueue(TestCase):
    """Tests allowed() on a moderated throttled queue using stubbed methods."""

    def setUp(self):
        ctor_args = (MagicMock(), MagicMock(), 4, 2, True)
        self.queue = RedisThrottledQueue(*ctor_args)

    def _allowed_with(self, **stubs):
        """Install the given stubs on the queue, then return allowed()."""
        for attr_name, stub in stubs.items():
            setattr(self.queue, attr_name, stub)
        return self.queue.allowed()

    def test_moderated(self):
        # a moderated queue should pop ~ every x seconds
        # we already tested the window limit in the unmoderated test
        self.assertFalse(self._allowed_with(
            is_moderated=MagicMock(return_value=True)))

        self.assertTrue(self._allowed_with(
            is_moderated=MagicMock(return_value=False),
            test_hits=MagicMock(return_value=True)))

        # mock exception raised even with good moderation
        self.assertFalse(self._allowed_with(
            test_hits=MagicMock(side_effect=WatchError)))
Пример #6
0
    def create_throttle_queues(self):
        """
        Create the throttled (rate-limited) queues.

        Looks up the Redis keys matching ``<spider name>:<job id>:*:queue``
        and stores a ``[RedisThrottledQueue, creation time]`` entry in
        ``self.queue_dict`` for each key that has none yet, or for every key
        when check_config() returns a truthy value.
        :return:
        """
        config_changed = self.check_config()
        key_pattern = '{spider_type}:{job_id}:*:queue'.format(
            spider_type=self.spider.name, job_id=self.job_id)
        self.queue_keys = self.redis_conn.keys(key_pattern)

        for key in self.queue_keys:
            # throttle key is `[type:][ip:]domain`
            throttle_key = self.spider.name + ":" if self.add_type else ""
            if self.add_ip:
                throttle_key = throttle_key + self.ip + ":"

            # the domain is the third field of the key
            the_domain = re.split(':', key)[2]
            throttle_key += the_domain

            # keep the existing entry while the config is unchanged
            if key in self.queue_dict and not config_changed:
                continue

            self.logger.debug(
                "Added new Throttled Queue {q}".format(q=key))
            priority_queue = RedisPriorityQueue(self.redis_conn, key)

            if the_domain in self.domain_config:
                # per-domain settings, optionally scaled
                domain_settings = self.domain_config[the_domain]
                window = domain_settings['window']
                hits = domain_settings['hits']
                if 'scale' in domain_settings:
                    hits = int(
                        hits * self.fit_scale(domain_settings['scale']))
            else:
                # fall back to the instance-wide window and hits
                window = self.window
                hits = self.hits

            self.queue_dict[key] = [
                RedisThrottledQueue(self.redis_conn, priority_queue, window,
                                    hits, self.moderated, throttle_key,
                                    throttle_key, True),
                time.time(),
            ]
Пример #7
0
class TestUnmoderatedRedisThrottledQueue(TestCase):
    """Checks allowed() against the hit count reported by redis zcard."""

    def setUp(self):
        # limit is 2 hits in the window
        redis_conn, backing_queue = MagicMock(), MagicMock()
        self.queue = RedisThrottledQueue(redis_conn, backing_queue, 1, 2)

    def test_unmoderated(self):
        # an unmoderated queue is really just testing the number
        # of hits in a given window
        expectations = (
            (0, self.assertTrue),
            (1, self.assertTrue),
            (2, self.assertFalse),
        )
        for hit_count, check in expectations:
            self.queue.redis_conn.zcard = MagicMock(return_value=hit_count)
            check(self.queue.allowed())

        # mock exception raised even with good hits
        failing_zcard = MagicMock(return_value=0, side_effect=WatchError)
        self.queue.redis_conn.zcard = failing_zcard
        self.assertFalse(self.queue.allowed())
Пример #8
0
class TestUnmoderatedRedisThrottledQueue(TestCase):
    """Tests allowed() on an unmoderated queue with a stubbed zcard."""

    def setUp(self):
        # limit is 2 hits in the window
        ctor_args = (MagicMock(), MagicMock(), 1, 2)
        self.queue = RedisThrottledQueue(*ctor_args)

    def _allowed_after(self, zcard_stub):
        """Swap in the zcard stub, then return the result of allowed()."""
        self.queue.redis_conn.zcard = zcard_stub
        return self.queue.allowed()

    def test_unmoderated(self):
        # an unmoderated queue is really just testing the number
        # of hits in a given window
        self.assertTrue(self._allowed_after(MagicMock(return_value=0)))
        self.assertTrue(self._allowed_after(MagicMock(return_value=1)))
        self.assertFalse(self._allowed_after(MagicMock(return_value=2)))

        # mock exception raised even with good hits
        self.assertFalse(self._allowed_after(
            MagicMock(return_value=0, side_effect=WatchError)))
    def test_error_config(self):
        """Check that error_config() resets queues to scheduler defaults.

        Starts from a custom config for ex1.com and a queue created with
        window 100 / limit 100, then asserts the queue ends up with the
        scheduler's own window (7) and hits (5) and that the domain config
        is emptied.
        """
        queue_key = 'link:ex1.com:queue'
        scheduler = self.scheduler

        scheduler.domain_config = {"ex1.com": {"window": 50, "hits": 10}}
        scheduler.window = 7
        scheduler.hits = 5
        throttled = RedisThrottledQueue(MagicMock(), MagicMock(), 100, 100)
        scheduler.queue_dict = {queue_key: [throttled, 0]}

        scheduler.error_config('stuff')

        reset_queue = scheduler.queue_dict[queue_key][0]
        self.assertEqual(reset_queue.window, 7)
        self.assertEqual(reset_queue.limit, 5)
        self.assertEqual(scheduler.domain_config, {})
Пример #10
0
def main():
    """Run an interactive demo of the throttled Redis queue.

    Parses the command line, pushes twice ``--num-hits`` items onto a
    RedisThrottledQueue layered on a RedisPriorityQueue, then pops in an
    endless loop, printing each popped item with the seconds elapsed since
    reading began. Stop it with Ctrl-C; the queue is cleared before exit.

    Output is written with single-argument ``print(...)`` calls so the
    script runs, and prints the same text, on both Python 2 and Python 3.
    """
    import argparse
    import redis
    import time

    import sys
    from os import path
    # make the parent directory importable so `scutils` resolves when this
    # script is run from inside the source tree
    sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))

    from scutils.redis_queue import RedisPriorityQueue
    from scutils.redis_throttled_queue import RedisThrottledQueue

    parser = argparse.ArgumentParser(description="Throttled Queue Test Script."
                    " Start either a single or multiple processes to see the "
                " throttled queue mechanism in action.")
    parser.add_argument('-r', '--redis-host', action='store', required=True,
                        help="The Redis host ip")
    parser.add_argument('-p', '--redis-port', action='store', default='6379',
                        help="The Redis port")
    parser.add_argument('-m', '--moderate', action='store_const', const=True,
                        default=False, help="Moderate the outbound Queue")
    # type=int lets argparse reject non-integer values with a usage error
    parser.add_argument('-w', '--window', action='store', default=60,
                        type=int, help="The window time to test")
    parser.add_argument('-n', '--num-hits', action='store', default=10,
                        type=int,
                        help="The number of pops allowed in the given window")
    parser.add_argument('-q', '--queue', action='store', default='testqueue',
                        help="The Redis queue name")

    args = vars(parser.parse_args())

    window = args['window']
    num = args['num_hits']
    host = args['redis_host']
    port = args['redis_port']
    mod = args['moderate']
    queue = args['queue']

    conn = redis.Redis(host=host, port=port)

    q = RedisPriorityQueue(conn, queue)
    t = RedisThrottledQueue(conn, q, window, num, mod)

    def push_items(amount):
        # items are named item-<i> and pushed with i as the second argument
        for i in range(amount):
            t.push('item-' + str(i), i)

    print("Adding {0} items for testing".format(num * 2))
    push_items(num * 2)

    def read_items():
        print("Kill when satisfied ^C")
        ti = time.time()
        while True:
            item = t.pop()
            if item:
                print("My item {0} My time: {1}".format(
                    item, time.time() - ti))

    try:
        read_items()
    except KeyboardInterrupt:
        # Ctrl-C is the documented way to stop the endless read loop
        pass
    t.clear()
    print("Finished")
Пример #11
0
def main():
    """Command-line demo of the throttled Redis queue.

    Builds a RedisThrottledQueue on top of a RedisPriorityQueue from the
    parsed options, pushes ``2 * num-hits`` items, then pops forever while
    printing each item with the time elapsed since reading started. Ctrl-C
    ends the loop, after which the queue is cleared.
    """
    import argparse
    import redis
    import time

    import sys
    from os import path
    # the parent directory must be on sys.path before scutils is imported
    sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))

    from scutils.redis_queue import RedisPriorityQueue
    from scutils.redis_throttled_queue import RedisThrottledQueue

    parser = argparse.ArgumentParser(
        description="Throttled Queue Test Script."
        " Start either a single or multiple processes to see the "
        " throttled queue mechanism in action.")

    # (flags, add_argument keyword settings), registered in this order
    option_specs = (
        (('-r', '--redis-host'),
         dict(action='store', required=True, help="The Redis host ip")),
        (('-p', '--redis-port'),
         dict(action='store', default='6379', help="The Redis port")),
        (('-m', '--moderate'),
         dict(action='store_const', const=True, default=False,
              help="Moderate the outbound Queue")),
        (('-w', '--window'),
         dict(action='store', default=60, help="The window time to test")),
        (('-n', '--num-hits'),
         dict(action='store', default=10,
              help="The number of pops allowed in the given window")),
        (('-q', '--queue'),
         dict(action='store', default='testqueue',
              help="The Redis queue name")),
    )
    for flags, settings in option_specs:
        parser.add_argument(*flags, **settings)

    options = parser.parse_args()

    window = int(options.window)
    num = int(options.num_hits)

    conn = redis.Redis(host=options.redis_host, port=options.redis_port)

    q = RedisPriorityQueue(conn, options.queue)
    t = RedisThrottledQueue(conn, q, window, num, options.moderate)

    print("Adding", num * 2, "items for testing")
    for index in range(num * 2):
        t.push('item-' + str(index), index)

    try:
        print("Kill when satisfied ^C")
        started = time.time()
        popped = 0  # running tally of popped items; never reported
        while True:
            item = t.pop()
            if not item:
                continue
            print("My item", item, "My time:", time.time() - started)
            popped += 1
    except KeyboardInterrupt:
        pass
    t.clear()
    print("Finished")
Пример #12
0
 def setUp(self):
     """Create the queue under test from two mocks and 4, 2, True."""
     redis_conn, backing_queue = MagicMock(), MagicMock()
     self.queue = RedisThrottledQueue(redis_conn, backing_queue, 4, 2, True)
Пример #13
0
 def setUp(self):
     """Create the queue under test from two mocks, window 1 and limit 2."""
     # limit is 2 hits in the window
     ctor_args = (MagicMock(), MagicMock(), 1, 2)
     self.queue = RedisThrottledQueue(*ctor_args)
Пример #14
0
 def setUp(self):
     """Create the queue under test with elastic mode enabled."""
     redis_conn = MagicMock()
     backing_queue = MagicMock()
     self.queue = RedisThrottledQueue(redis_conn, backing_queue, 4, 2, True,
                                      elastic=True)
Пример #15
0
 def setUp(self):
     """Create the queue under test from two mock collaborators."""
     redis_conn, backing_queue = MagicMock(), MagicMock()
     # limit is 2 hits in the window
     self.queue = RedisThrottledQueue(redis_conn, backing_queue, 1, 2)