Пример #1
0
        def wrapper(func):
            # For every subscribed table, spread cache updates across a
            # ketama hash ring of worker queues and funnel cache deletes
            # through one dedicated worker.
            for table in tables:
                # Hash ring distributes update messages over the workers.
                update_ring = ketama.Continuum()
                worker_queues = [Queue() for _ in range(workers)]
                for queue in worker_queues:
                    update_ring[str(hash(queue))] = queue
                self.update_queues[table] = update_ring

                update_handler = self._cache_update_gen(table, func, multi=multi)
                self.workers[table] = [
                    Worker("%s_cache_update" % table,
                           queue,
                           update_handler,
                           multi=multi,
                           logger_name="%s.%s" % (self.name, table))
                    for queue in worker_queues
                ]

                # Deletes are serialized through a single queue/worker.
                delete_queue = Queue()
                self.delete_queues[table] = delete_queue
                delete_handler = self._cache_delete_gen(table)
                self.workers[table].append(
                    Worker("%s_cache_delete" % table,
                           delete_queue,
                           delete_handler,
                           multi=True,
                           logger_name="%s.%s" % (self.name, table)))

                self.socket.setsockopt(zmq.SUBSCRIBE, asbytes(table))
            return func
Пример #2
0
    def __init__(self, ketama_server_file):
        """Parse the server list file and build the ketama continuum.

        Connections are opened lazily: every server initially gets an
        empty placeholder entry in SERVERS.
        """
        self.server_list = self.parse_server_file(ketama_server_file)
        self.continuum = ketama.Continuum(ketama_server_file)

        for hostname, port in self.server_list:
            # Empty record; the real connection is created on first use.
            self.SERVERS["{0}:{1}".format(hostname, port)] = None
Пример #3
0
    def center_node_dispather(self):
        """Master-node task dispatcher (original docstring: 主节点任务调度).

        Endless loop that (1) pulls newly submitted seed URLs from the
        redis 'seeds' list, (2) when the set of live spider nodes changes,
        pauses all spiders, rebuilds the ketama consistent-hash ring and
        drains every per-spider queue back into the pending task list,
        then resumes the spiders, and (3) routes each pending task onto
        the queue of the spider node the ring selects for its URL.
        """
        while True:
            self.logger.debug('获取新加入的URLs.........')
            tasks = []
            if self.server.llen('seeds'):
                tasks.append(self.server.lpop('seeds'))
            self.tasks.extend(tasks)

            # Truthy when the spider-node topology changed and the hash
            # ring must be rebuilt (see spider_state_watcher).
            state = self.spider_state_watcher()
            if state:
                self.logger.debug('遍历爬虫节点并依次暂停当前运行的爬虫..........')
                spider_ids = []
                spider_ip_ids = []
                # Spider keys look like 'stats:spider:<ip>:<job_id>' —
                # index 2 is the ip, index 3 the job id.
                for spider_key in self.spiders:
                    spider_ids.append(spider_key.split(':')[3])
                    spider_ip_ids.append((spider_key.split(':')[2], spider_key.split(':')[3]))
                for spider_ip_id in spider_ip_ids:
                    key = '{job}:status'.format(job=spider_ip_id[1])
                    self.server.set(key, 'pause')

                # Give spiders time to observe the 'pause' flag.
                time.sleep(4)

                self.logger.debug('由于爬虫节点状态改变,调整哈希分布...........')
                self.chose = ketama.Continuum(spider_ids)

                self.logger.debug('调整爬虫节点所负责的站点数据抓取任务, 请勿在此段时间启动额外的爬虫..........')
                queue_keys = self.server.keys('*:queue')
                for queue_key in queue_keys:
                    tasks.extend(self.server.zrange(queue_key, 0, -1))  # collect all queued urls from every spider queue
                    self.server.zremrangebyrank(queue_key, 0, -1)  # empty the spider queue

                self.logger.debug('恢复先前暂停的爬虫节点.......')
                for spider_ip_id in spider_ip_ids:
                    key = '{job}:status'.format(job=spider_ip_id[1])
                    self.server.set(key, 'running')

            self.logger.debug('等待!, 重新分配URLs..............')
            for task_json in tasks:
                # NOTE(review): pickle.loads on data read from redis is
                # unsafe if untrusted clients can write these keys —
                # confirm only trusted producers push tasks.
                task = pickle.loads(task_json)
                if 'url' in task and 'spider_type' in task:
                    extract = tldextract.TLDExtract()
                    url = task['url']
                    spider_type = task['spider_type']
                    domain = extract(url).domain
                    # Ketama ring maps the url onto a spider job id.
                    job_id = self.chose[url.encode('utf-8')]
                    queue_key = '{spider_type}:{job_id}:{domain}:queue'.format(spider_type=spider_type,
                                                                               job_id=job_id,
                                                                               domain=domain)
                    priority = task['priority']
                    self.server.zadd(queue_key, pickle.dumps(task), priority)
                else:
                    self.logger.error("please input url and spider_type that you want to crawl!")
Пример #4
0
    def status_from_redis(self):
        """Refresh throttle queues and sync the pause/running flag from redis.

        When the job status becomes 'pause', the consistent-hash ring is
        rebuilt from the spider ids currently registered in redis.
        """
        self.create_throttle_queues()
        self.expire_queues()

        status = self.redis_conn.get('{job}:status'.format(job=self.job_id))
        if status == 'pause':
            # Pause this spider and rebuild the consistent-hash layout
            # from every spider registered under 'stats:spider:*:*'
            # (key layout: stats:spider:<ip>:<job_id>).
            self.paused = True
            spider_keys = self.redis_conn.keys('stats:spider:*:*')
            ids = [key.split(':')[3] for key in spider_keys]
            self.chose = ketama.Continuum(ids)
        elif status == 'running':
            self.paused = False
Пример #5
0
 def wrapper(func):
     """Attach a ketama hash ring of queues and a WorkerPool per topic."""
     for topic in topics:
         # Hash ring spreads messages for this topic across the queues.
         ring = ketama.Continuum()
         topic_queues = [Queue() for _ in range(workers)]
         for queue in topic_queues:
             ring[str(hash(queue))] = queue
         self.worker_queues[topic] = ring
         self.workers[topic] = WorkerPool(topic_queues,
                                          topic,
                                          func,
                                          multi=multi,
                                          queue_limit=queue_limit,
                                          logger_name="%s.%s" %
                                          (self.name, topic))
         self.socket.setsockopt(zmq.SUBSCRIBE, asbytes(topic))
     return func
Пример #6
0
    def initial_seeds(self):
        """Initialise the scheduler (original docstring: 初始化调度器).

        Blocks until seed URLs appear in the redis 'seeds' list, drains
        them into self.tasks, discovers the live spider processes,
        builds the ketama consistent-hash ring over their job ids and
        distributes each seed task onto the queue the ring selects.
        """

        # Poll every 3 minutes until at least one seed URL is available.
        while True:
            initial_len = self.server.llen('seeds')
            if initial_len:
                break
            time.sleep(180)
            continue

        self.logger.debug('获取初始种子列表.........')
        # Read the whole list, then clear it (ltrim with start -1 and
        # end 0 keeps no elements).
        while True:
            tasks = self.server.lrange('seeds', 0, -1)
            self.server.ltrim('seeds', -1, 0)
            self.tasks.extend(tasks)
            if self.tasks:
                break

        self.logger.debug('获取初始爬虫进程个数.........')
        self.spiders = self.server.keys('stats:spider:*:*')  # list of spider stat keys
        self.spider_count = len(self.spiders)

        if self.spider_count:
            self.logger.debug('调用一致性哈希算法布局爬虫节点位置.......')
            job_ids = []
            # Keys look like 'stats:spider:<ip>:<job_id>'.
            for spider in self.spiders:
                job_ids.append(spider.split(':')[3])
            self.chose = ketama.Continuum(job_ids)

            self.logger.debug('分配初始种子URLs队列........')
            for task_json in self.tasks:
                # NOTE(review): pickle.loads on redis data is unsafe if
                # untrusted clients can push seeds — confirm producers.
                task = pickle.loads(task_json)
                if 'url' in task and 'spider_type' in task:
                    extract = tldextract.TLDExtract()
                    url = task['url']
                    spider_type = task['spider_type']
                    domain = extract(url).domain
                    # Ketama ring maps the url onto a spider job id.
                    job_id = self.chose[url.encode('utf-8')]
                    queue_key = '{spider_type}:{job_id}:{domain}:queue'.format(spider_type=spider_type,
                                                                               job_id=job_id,
                                                                               domain=domain)
                    priority = task['priority']
                    self.server.zadd(queue_key, pickle.dumps(task), priority)
                else:
                    self.logger.error("please input url and spider_type that you want to crawl!")
Пример #7
0
def test_ketama_compatibility(ketama_config_file):
    """Check that HashRing in compat mode matches the ketama C binding."""
    if not ketama:
        return

    ring = HashRing(
        nodes={"127.0.0.1:11211": 600, "127.0.0.1:11212": 400},
        replicas=4,
        vnodes=40,
        compat=True,
    )
    continuum = ketama.Continuum(ketama_config_file)

    # Identical continuum points imply identical key placement overall.
    assert ring.get_points() == continuum.get_points()

    # Spot-check placement on a random sample of keys as well.
    sample_hits = 1000
    key_space = 10000
    for _ in range(sample_hits):
        key = str(randint(1, key_space))
        assert ring.get_server(key) == continuum.get_server(key)
Пример #8
0
    def from_settings(cls, settings):
        """Alternate constructor: build a scheduler from a settings object.

        Pulls redis connection info, queue/throttle tuning, retry limits
        and logging configuration out of ``settings`` (defaults shown
        inline), builds the LogFactory logger and a ketama continuum
        over a hard-coded set of spider ids, and returns ``cls(...)``.
        """
        server = redis.Redis(host=settings.get('REDIS_HOST'),
                             port=settings.get('REDIS_PORT'),
                             db=settings.get('REDIS_DB'))
        persist = settings.get('SCHEDULER_PERSIST', True)
        up_int = settings.get('SCHEDULER_QUEUE_REFRESH', 10)
        hits = settings.get('QUEUE_HITS', 10)
        window = settings.get('QUEUE_WINDOW', 60)
        mod = settings.get('QUEUE_MODERATED', False)
        timeout = settings.get('DUPEFILTER_TIMEOUT', 600)
        ip_refresh = settings.get('SCHEDULER_IP_REFRESH', 60)
        add_type = settings.get('SCHEDULER_TYPE_ENABLED', True)
        add_ip = settings.get('SCHEDULER_IP_ENABLED', False)
        # NOTE(review): 'SCHEUDLER' is misspelled — confirm whether deployed
        # configs rely on this exact spelling before correcting the key.
        retries = settings.get('SCHEUDLER_ITEM_RETRIES', 3)
        ip_regex = settings.get('IP_ADDR_REGEX', '.*')
        backlog_blacklist = settings.get('SCHEDULER_BACKLOG_BLACKLIST', True)
        queue_timeout = settings.get('SCHEDULER_QUEUE_TIMEOUT', 3600)

        # Logging configuration for the LogFactory singleton below.
        my_level = settings.get('SC_LOG_LEVEL', 'INFO')
        my_name = settings.get('SC_LOGGER_NAME', 'sc-logger')
        my_output = settings.get('SC_LOG_STDOUT', True)
        my_json = settings.get('SC_LOG_JSON', False)
        my_dir = settings.get('SC_LOG_DIR', 'logs')
        my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
        my_file = settings.get('SC_LOG_FILE', 'main.log')
        my_backups = settings.get('SC_LOG_BACKUPS', 5)

        logger = LogFactory.get_instance(json=my_json,
                                         name=my_name,
                                         stdout=my_output,
                                         level=my_level,
                                         dir=my_dir,
                                         file=my_file,
                                         bytes=my_bytes,
                                         backups=my_backups)

        # Fixed spider-id layout for the consistent-hash ring.
        # spider_ids = ['1', ]
        spider_ids = ['1', '2', '3']
        chose = ketama.Continuum(spider_ids)

        return cls(server, persist, up_int, timeout, retries, logger, hits,
                   window, mod, ip_refresh, add_type, add_ip, ip_regex,
                   backlog_blacklist, queue_timeout, chose)
Пример #9
0
  def test_points(self):
    ring = ketama.Continuum(self.server_list_file)

    for i in xrange(0, 100000):
      print i, ring.get_server(str(i))[1]
Пример #10
0
 def test_points(self):
     # 3 servers x 160 points each should populate the continuum.
     continuum = ketama.Continuum(self.valid_list_file)
     self.assertEqual(len(continuum.get_points()), 3 * 160)
Пример #11
0
 def test_server_modified_count(self):
     # All three servers from the valid list file must be registered.
     continuum = ketama.Continuum(self.valid_list_file)
     self.assertEqual(continuum.get_server_count(), 3)
Пример #12
0
 def test_removal(self):
     # Removing a known server must not raise.
     continuum = ketama.Continuum(self.valid_list_file)
     continuum.remove_server("127.0.0.1:11211")
     self.assertTrue(1)
Пример #13
0
 def test_hashing(self):
     # 'test' must always map to the same (point, server) pair.
     continuum = ketama.Continuum(self.valid_list_file)
     expected = (2959911115, '127.0.0.1:11211')
     self.assertEqual(continuum.get_server("test"), expected)
Пример #14
0
 def test_valid(self):
     # A valid list file yields an actual Continuum instance.
     continuum = ketama.Continuum(self.valid_list_file)
     self.assertEqual(type(continuum), ketama.Continuum)
Пример #15
0
import ketama
import sys

connections = {}
filename = ''
if len(sys.argv) < 2:
    print "Usage: test_python.py key_to_be_tested"
    sys.exit()
elif len(sys.argv) == 3:
    filename = sys.argv[2]
    print 'Testing file: ' + filename
    cont = ketama.Continuum(filename)
else:
    cont = ketama.Continuum('key:12324')

test_key = sys.argv[1]

print "Testing key: " + test_key
servers = open('../ketama.servers')
for server in servers:
    server_info = server.split()
    server_name = server_info[0]
    memory = int(server_info[1])
    cont.add_server(server_name, memory)
    print "Adding server: " + server_name + ":" + str(memory)

info = cont.get_info()
print info
cont.sync_servers("node1:1000,node2:1000,node3:1000,node4:1000")
print cont.get_info()
Пример #16
0
# Benchmark: compare key->server lookup speed of the ketama C binding
# against the pure-python uhashring HashRing in ketama-compat mode.
from time import time
from tempfile import NamedTemporaryFile
from uhashring import HashRing

num = 1000000
print('running {} key generation comparison'.format(num))

# ketama C binding
# NOTE(review): NamedTemporaryFile defaults to binary mode; writing str
# here only works on Python 2 — confirm the intended interpreter.
if ketama:
    with NamedTemporaryFile(prefix='benchmark_') as ketama_config_file:
        ketama_config_file.write("127.0.0.1:11211\t600\n")
        ketama_config_file.write("127.0.0.1:11212\t400\n")
        ketama_config_file.flush()

        kt = ketama.Continuum(ketama_config_file.name)
        pt = time()
        for i in range(num):
            key = 'myval-{}'.format(i)
            kt.get_server(key)
        print('ketama took {} ms'.format(time() - pt))

# pure python implementation
ring = HashRing(
    nodes={'127.0.0.1:11211': 600,
           '127.0.0.1:11212': 400},
    replicas=4,
    vnodes=40,
    compat=True)
pt = time()
# NOTE(review): the body of this loop is truncated in this source view.
for i in range(num):
Пример #17
0
    def test_adding(self):
        """Adding a server must grow the continuum's server count by one.

        Fix: the original body mixed tab- and space-indented lines, which
        is a TabError under Python 3; indentation is now uniform spaces.
        """
        cont = ketama.Continuum(self.valid_list_file)
        old_count = cont.get_server_count()
        cont.add_server("127.0.0.1:11213", 700)
        self.assertEqual(cont.get_server_count(), old_count + 1)