Example #1
    def test_1_consistent_while_slave_is_in_load(self):
        util.print_frame()
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(ip)
        gw.connect(ip, port)

        max_key = 5
        key_base = 'load_gen_key'
        for idx in range(max_key):
            cmd = 'set %s%d 0\r\n' % (key_base, idx)
            gw.write(cmd)
            gw.read_until('\r\n', 10)

        try_count = 9999
        for value in range(try_count):
            for idx in range(max_key):
                cmd = 'set %s%d %d\r\n' % (key_base, idx, value)
                gw.write(cmd)
                response = gw.read_until('\r\n', 10)
                self.assertEquals(response, '+OK\r\n')

                cmd = 'get %s%d\r\n' % (key_base, idx)
                gw.write(cmd)
                response = gw.read_until('\r\n', 10)
                response = gw.read_until('\r\n', 10)

                self.assertEquals(
                    response, '%s\r\n' % (value),
                    'fail! original_value:%d, return_from_slave:%s' %
                    (value, response[1:]))
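Note: the paired read_until('\r\n') calls after each get are needed because a
Redis bulk reply arrives as two CRLF-terminated lines: a length header
('$N\r\n') followed by the payload. A minimal sketch of that parsing, using a
hypothetical read_line() callable (not part of gateway_mgmt):

    def read_bulk_reply(read_line):
        # read_line() must return one CRLF-terminated line,
        # e.g. lambda: gw.read_until('\r\n', 10)
        header = read_line()          # length line, e.g. '$3\r\n'
        if header.startswith('$-1'):  # RESP encodes a missing key as '$-1\r\n'
            return None
        assert header.startswith('$'), 'not a bulk reply: %r' % header
        return read_line()            # payload line, e.g. '0\r\n'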
Example #2
    def test_migrate_empty_s3obj(self):
        util.print_frame()
        ip, port = util.get_rand_gateway(self.cluster)
        client = redis_sock.RedisClient(ip, port)

        # Fill some string and empty s3 objects
        keyprefix = 'test_migrate_empty_s3obj'
        for i in range(1000):
            ok, data = client.do_request('set %s_string_%d %d\r\n' %
                                         (keyprefix, i, i))
            assert (ok == True)
            ok, data = client.do_request(
                's3ladd ks %s_s3_%d svc key val 0\r\n' % (keyprefix, i))
            assert (ok == True and data == 1)
            ok, data = client.do_request('s3lrem ks %s_s3_%d svc key val\r\n' %
                                         (keyprefix, i))
            assert (ok == True and data == 1)

        ## migration pg0 -> pg1 then pg1 -> pg0
        ret = util.migration(self.cluster, 0, 1, 4096, 8191, 40000)
        self.assertEqual(True, ret, 'Migration Fail')
        ret = util.migration(self.cluster, 1, 0, 4096, 8191, 40000)
        self.assertEqual(True, ret, 'Migration Fail')

        # Check string object
        for i in range(1000):
            ok, data = client.do_request('get %s_string_%d\r\n' %
                                         (keyprefix, i))
            assert (ok == True and int(data) == i)

        client.close()
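Note: redis_sock.RedisClient is project-specific. As a rough, hypothetical
sketch of what a do_request-style call might do, assuming inline commands and
single-line replies (the real client must also parse bulk replies so that the
get loop above can return the stored value):

    import socket

    class MiniRedisClient(object):
        """Hypothetical stand-in for redis_sock.RedisClient; handles only
        +status, -error and :integer replies."""
        def __init__(self, ip, port):
            self.sock = socket.create_connection((ip, port))
            self.buf = b''

        def _read_line(self):
            while b'\r\n' not in self.buf:
                chunk = self.sock.recv(4096)
                if not chunk:
                    raise EOFError('connection closed')
                self.buf += chunk
            line, self.buf = self.buf.split(b'\r\n', 1)
            return line.decode()

        def do_request(self, cmd):
            self.sock.sendall(cmd.encode())
            line = self._read_line()
            if line.startswith('+'):
                return True, line[1:]
            if line.startswith(':'):
                return True, int(line[1:])
            return False, line

        def close(self):
            self.sock.close()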
Example #3
    def test_migrate_empty_s3obj(self):
        util.print_frame()
        ip, port = util.get_rand_gateway(self.cluster)
        client = redis_sock.RedisClient(ip, port)

        # Fill some string and empty s3 objects
        keyprefix = 'test_migrate_empty_s3obj'
        for i in range (1000):
            ok, data = client.do_request('set %s_string_%d %d\r\n' % (keyprefix, i, i))
            assert (ok == True)
            ok, data = client.do_request('s3ladd ks %s_s3_%d svc key val 0\r\n' % (keyprefix, i))
            assert (ok == True and data == 1)
            ok, data = client.do_request('s3lrem ks %s_s3_%d svc key val\r\n' % (keyprefix, i))
            assert (ok == True and data == 1)

        ## migration pg0 -> pg1 then pg1 -> pg0
        ret = util.migration(self.cluster, 0, 1, 4096, 8191, 40000)
        self.assertEqual(True, ret, 'Migration Fail')
        ret = util.migration(self.cluster, 1, 0, 4096, 8191, 40000)
        self.assertEqual(True, ret, 'Migration Fail')

        # Check string object
        for i in range (1000):
            ok, data = client.do_request('get %s_string_%d\r\n' % (keyprefix, i))
            assert (ok == True and int(data) == i)

        client.close()
Example #4
    def test_1_consistent_while_slave_is_in_load( self ):
        util.print_frame()
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( ip )
        gw.connect( ip, port )

        max_key = 5
        key_base = 'load_gen_key'
        for idx in range( max_key ):
            cmd = 'set %s%d 0\r\n' % (key_base, idx)
            gw.write( cmd )
            gw.read_until( '\r\n', 10 )

        try_count = 9999
        for value in range( try_count ):
            for idx in range( max_key ):
                cmd = 'set %s%d %d\r\n' % (key_base, idx, value)
                gw.write( cmd )
                response = gw.read_until( '\r\n', 10 )
                self.assertEquals( response, '+OK\r\n' )

                cmd = 'get %s%d\r\n' % (key_base, idx)
                gw.write( cmd )
                response = gw.read_until( '\r\n', 10 )
                response = gw.read_until( '\r\n', 10 )

                self.assertEquals( response, '%s\r\n' % (value), 'fail! original_value:%d, return_from_slave:%s' % (value, response[1:]) )
Example #5
    def pgs_add_and_del( self, upgrade_server, type ):
        util.print_frame()

        util.log( '[start] add and del pgs%d. type:%s' % (upgrade_server['id'], type) )
        util.log_server_state( self.cluster )

        # start load generator
        load_gen_list = {}
        for i in range( len(self.cluster['servers']) ):
            server = self.cluster['servers'][i]
            load_gen = load_generator.LoadGenerator(server['id'], server['ip'], server['gateway_port'])
            load_gen.start()
            load_gen_list[i] = load_gen

        # detach pgs from cluster
        cmd = 'pgs_leave %s %d\r\n' % (upgrade_server['cluster_name'], upgrade_server['id'])
        ret = util.cm_command( self.leader_cm['ip'], self.leader_cm['cm_port'], cmd )
        jobj = json.loads(ret)
        self.assertEqual( jobj['msg'], '+OK', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )
        util.log( 'succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )

        # set new values
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway( '0' )
        gw.connect( ip, port )
        for i in range( 0, 50 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values to gw(%s:%d). cmd:%s, res:%s' % (ip, port, cmd[:-2], res[:-2]) )

        # attach pgs to cluster
        cmd = 'pgs_join %s %d\r\n' % (upgrade_server['cluster_name'], upgrade_server['id'])
        ret = util.cm_command( self.leader_cm['ip'], self.leader_cm['cm_port'], cmd )
        jobj = json.loads(ret)
        self.assertEqual( jobj['msg'], '+OK', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret) )
        util.log( 'succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )
        time.sleep( 3 )

        # check new values
        redis = redis_mgmt.Redis( upgrade_server['id'] )
        ret = redis.connect( upgrade_server['ip'], upgrade_server['redis_port'] )
        self.assertEquals( ret, 0, 'failed : connect to smr%d(%s:%d)' % (upgrade_server['id'], upgrade_server['ip'], upgrade_server['redis_port']) )

        for i in range( 0, 50 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis.write( cmd )
            redis.read_until( '\r\n' )
            res = redis.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis%d. %s != %d' % (upgrade_server['id'], res, i) )
        util.log( 'succeeded : check values with get operations on pgs%d.' % (upgrade_server['id']) )

        # shutdown load generators
        for i in range( len(load_gen_list) ):
            load_gen_list[i].quit()
            load_gen_list[i].join()

        util.log_server_state( self.cluster )

        return 0
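Note: util.cm_command above sends one management command to the cluster
manager and returns its JSON reply. A hypothetical sketch of such a helper
over a raw socket (the name and timeout are assumptions, not the project's
API):

    import socket

    def cm_command_sketch(ip, port, cmd, timeout=10):
        # send the command and read one CRLF-terminated JSON reply line
        sock = socket.create_connection((ip, port), timeout)
        try:
            sock.sendall(cmd.encode())
            data = b''
            while not data.endswith(b'\r\n'):
                chunk = sock.recv(4096)
                if not chunk:
                    break
                data += chunk
            return data.decode()
        finally:
            sock.close()

    # jobj = json.loads(cm_command_sketch(ip, port, 'pgs_leave c0 0\r\n'))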
Example #6
    def test_basic_op_gateway(self):
        util.print_frame()
        ip, port = util.get_rand_gateway(self.cluster)
        f = open("%s/test_basicop_output_gw" % constant.logdir, 'w')
        p = util.exec_proc_async("../redis", "./runtest_gw --accurate --gw-port "+str(port), True, None, f, None)

        ret = p.wait()
        f.close()
        self.assertEquals(0, ret)
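Note: util.exec_proc_async launches the redis test suite as a child process,
and the test then blocks on p.wait(). A hypothetical sketch of such a helper
with subprocess (the extra None arguments of the real signature are omitted):

    import subprocess

    def exec_proc_async_sketch(workdir, cmd, out_file):
        # run a shell command in `workdir`, redirecting output to `out_file`
        return subprocess.Popen(cmd, shell=True, cwd=workdir,
                                stdout=out_file, stderr=subprocess.STDOUT)

    # p = exec_proc_async_sketch('../redis', './runtest_gw --accurate --gw-port 6379', f)
    # ret = p.wait()  # 0 means the suite passed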
Example #7
    def test_random_pgs_del_and_add(self):
        util.print_frame()

        # start load generator
        util.log("start load_generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            self.load_gen_thrd_list[i] = load_generator.LoadGenerator(
                i, ip, port)
            self.load_gen_thrd_list[i].start()

        util.log("started load_generator")

        servers = self.cluster['servers']
        gw_list = []
        for server in servers:
            gw = {}
            gw['mgmt'] = telnetlib.Telnet(server['ip'],
                                          server['gateway_port'] + 1)
            gw['normal'] = telnetlib.Telnet(server['ip'],
                                            server['gateway_port'])
            gw_list.append(gw)

        count = 10
        while count > 0:
            c = random.choice(servers)
            for gw in gw_list:
                gw['mgmt'].write("pgs_del %d %d\r\n" % (c['id'], c['pg_id']))
                gw['mgmt'].read_until("+OK\r\n")

            gw_list[0]['mgmt'].write("cluster_info\r\nping\r\n")
            print gw_list[0]['mgmt'].read_until("+PONG\r\n")

            for gw in gw_list:
                gw['mgmt'].write(
                    "pgs_add %d %d %s %d\r\n" %
                    (c['id'], c['pg_id'], c['ip'], c['redis_port']))
                gw['mgmt'].read_until("+OK\r\n")

            for gw in gw_list:
                while True:
                    gw['normal'].write("info gateway\r\n")
                    ret = gw['normal'].read_until("\r\n\r\n")
                    if "gateway_disconnected_redis:0\r\n" in ret:
                        break

            count -= 1

        # check consistency of load_generator
        for i in range(len(self.load_gen_thrd_list)):
            self.load_gen_thrd_list[i].quit()
        for i in range(len(self.load_gen_thrd_list)):
            self.load_gen_thrd_list[i].join()
            self.assertTrue(self.load_gen_thrd_list[i].isConsistent(),
                            'Inconsistent after gateway_mgmt test')
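Note: the inner while loop polls 'info gateway' until the gateway reports it
has reconnected to every redis. Wrapped as a hypothetical helper with a
timeout (the bare loop above can spin forever if recovery fails):

    import time

    def wait_redis_reconnected(conn, timeout=30):
        # conn is a telnetlib.Telnet to the gateway's normal port
        # (Python 2 telnetlib, so write/read_until take str)
        deadline = time.time() + timeout
        while time.time() < deadline:
            conn.write("info gateway\r\n")
            ret = conn.read_until("\r\n\r\n", 5)
            if "gateway_disconnected_redis:0\r\n" in ret:
                return
            time.sleep(0.1)
        raise RuntimeError("gateway still reports disconnected redis after %ds" % timeout)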
Example #8
    def test_basic_op_gateway(self):
        util.print_frame()
        ip, port = util.get_rand_gateway(self.cluster)
        f = open("%s/test_basicop_output_gw" % constant.logdir, 'w')
        p = util.exec_proc_async("../redis-2.8.8",
                            "./runtest_gw --accurate --gw-port "+str(port),
                            True, None, f, None)

        ret = p.wait()
        f.close()
        self.assertEquals(0, ret)
Example #9
    def test_random_pgs_del_and_add(self):
        util.print_frame()

        # start load generator
        util.log("start load_generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            self.load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
            self.load_gen_thrd_list[i].start()

        util.log("started load_generator")

        servers = self.cluster['servers']
        gw_list = []
        for server in servers:
            gw = {}
            gw['mgmt'] = telnetlib.Telnet(server['ip'], server['gateway_port']+1)
            gw['normal'] = telnetlib.Telnet(server['ip'], server['gateway_port'])
            gw_list.append(gw)

        count = 10
        while count > 0:
            c = random.choice(servers)
            for gw in gw_list:
                gw['mgmt'].write("pgs_del %d %d\r\n" % (c['id'], c['pg_id']))
                gw['mgmt'].read_until("+OK\r\n")

            gw_list[0]['mgmt'].write("cluster_info\r\nping\r\n")
            print gw_list[0]['mgmt'].read_until("+PONG\r\n")

            for gw in gw_list:
                gw['mgmt'].write("pgs_add %d %d %s %d\r\n" % (c['id'], c['pg_id'], c['ip'], c['redis_port']))
                gw['mgmt'].read_until("+OK\r\n")

            for gw in gw_list:
                while True:
                    gw['normal'].write("info gateway\r\n")
                    ret = gw['normal'].read_until("\r\n\r\n")
                    if "gateway_disconnected_redis:0\r\n" in ret:
                        break

            count -= 1

        # check consistency of load_generator
        for i in range(len(self.load_gen_thrd_list)):
            self.load_gen_thrd_list[i].quit()
        for i in range(len(self.load_gen_thrd_list)):
            self.load_gen_thrd_list[i].join()
            self.assertTrue(self.load_gen_thrd_list[i].isConsistent(), 'Inconsistent after gateway_mgmt test')
Example #10
    def test_migrate_all(self):
        util.print_frame()
        migration_count = 10
        # start load generator
        load_gen_thrd_list = {}
        util.log("start load_generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen_thrd_list[i] = load_generator.LoadGenerator(i,
                                                                 ip,
                                                                 port,
                                                                 ops_limit=500)
            load_gen_thrd_list[i].start()

        time.sleep(5)  # generate load for 5 sec

        # start migration
        for i in range(migration_count):
            # pg0 -> pg1
            ret = util.migration(self.cluster, 0, 1, 4096, 8191, 40000)
            self.assertEqual(True, ret, 'Migration Fail')
            # pg0 <- pg1
            ret = util.migration(self.cluster, 1, 0, 4096, 8191, 40000)
            self.assertEqual(True, ret, 'Migration Fail')

            ok = True
            for j in range(len(load_gen_thrd_list)):
                if load_gen_thrd_list[j].isConsistent() == False:
                    ok = False
                    break
            if not ok:
                break

        time.sleep(5)  # generate load for 5 sec
        # check consistency of load_generator
        for i in range(len(load_gen_thrd_list)):
            load_gen_thrd_list[i].quit()
        for i in range(len(load_gen_thrd_list)):
            load_gen_thrd_list[i].join()
            self.assertTrue(load_gen_thrd_list[i].isConsistent(),
                            'Inconsistent after migration')
Example #11
    def put_some_data( self ):
        # start load generator
        max_load_generator = 100
        load_gen_thrd_list = {}
        util.log('start load_generator')
        for i in range(max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
            load_gen_thrd_list[i].start()

        time.sleep(10) # generate some load

        util.log('end load_generator')

        # check consistency of load_generator
        for i in range(len(load_gen_thrd_list)):
            load_gen_thrd_list[i].quit()
        for i in range(len(load_gen_thrd_list)):
            load_gen_thrd_list[i].join()
            self.assertTrue(load_gen_thrd_list[i].isConsistent(), 'Data are inconsistent.')
        return 0
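Note: every test in this set relies on the same LoadGenerator contract:
start() begins a write/read loop, quit() plus join() stops it, and
isConsistent() reports whether each read returned the last written value.
A hypothetical minimal sketch of that contract (the real class takes a
gateway ip/port; here `client` is anything with set/get):

    import threading

    class LoadGeneratorSketch(threading.Thread):
        def __init__(self, ident, client):
            threading.Thread.__init__(self)
            self.client = client
            self.key = 'load_gen_key%d' % ident
            self.consistent = True
            self.running = True

        def run(self):
            value = 0
            while self.running:
                self.client.set(self.key, str(value))
                if self.client.get(self.key) != str(value):
                    self.consistent = False   # read did not match the last write
                value += 1

        def quit(self):
            self.running = False

        def isConsistent(self):
            return self.consistent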
Example #13
    def test_single_thread_input( self ):
        util.print_frame()
        self.cluster = config.clusters[0]
        result = {}

        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( ip )
        self.assertEquals( 0, gw.connect( ip, port ) )

        max = 5
        for idx in range( max ):
            cmd = 'set key%d 0\r\n' % (idx)
            gw.write( cmd )
            result[idx] = gw.read_until( '\r\n' )

        data_max = 65535
        for idx in range( max ):
            for cnt in range( 0, data_max ):
                gw.write( 'crc16 key%d %d\r\n' % (idx, cnt) )
                result[idx] = gw.read_until( '\r\n' )

        for idx in range( max - 1 ):
            self.assertEquals( result[idx], result[idx + 1] )
Example #14
    def test_single_thread_input(self):
        util.print_frame()
        self.cluster = config.clusters[0]
        result = {}

        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(ip)
        self.assertEquals(0, gw.connect(ip, port))

        max = 5
        for idx in range(max):
            cmd = 'set key%d 0\r\n' % (idx)
            gw.write(cmd)
            result[idx] = gw.read_until('\r\n')

        data_max = 65535
        for idx in range(max):
            for cnt in range(0, data_max):
                gw.write('crc16 key%d %d\r\n' % (idx, cnt))
                result[idx] = gw.read_until('\r\n')

        for idx in range(max - 1):
            self.assertEquals(result[idx], result[idx + 1])
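Note: the test feeds the same 65535 inputs to five different keys and then
asserts that all five accumulated results are equal. If the gateway's crc16
command uses the same polynomial as Redis Cluster key hashing, it is
CRC16-CCITT (XModem); a sketch under that assumption:

    def crc16_xmodem(data):
        # CRC16-CCITT (XModem); whether the gateway's crc16 command
        # matches this polynomial is an assumption
        crc = 0
        for byte in bytearray(data):
            crc ^= byte << 8
            for _ in range(8):
                crc = ((crc << 1) ^ 0x1021) if crc & 0x8000 else (crc << 1)
                crc &= 0xFFFF
        return crc

    # crc16_xmodem(b'123456789') == 0x31C3 (the standard check value)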
Example #15
    def test_migrate_all(self):
        util.print_frame()
        migration_count = 10
        # start load generator
        load_gen_thrd_list = {}
        util.log("start load_generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
            load_gen_thrd_list[i].start()

        time.sleep(5) # generate load for 5 sec

        # start migration
        for i in range(migration_count):
            # pg0 -> pg1
            ret = util.migration(self.cluster, 0, 1, 4096, 8191, 40000)
            self.assertEqual(True, ret, 'Migration Fail')
            # pg0 <- pg1
            ret = util.migration(self.cluster, 1, 0, 4096, 8191, 40000)
            self.assertEqual(True, ret, 'Migration Fail')

            ok = True
            for j in range(len(load_gen_thrd_list)):
                if load_gen_thrd_list[j].isConsistent() == False:
                    ok = False
                    break
            if not ok:
                break

        time.sleep(5) # generate load for 5 sec
        # check consistency of load_generator
        for i in range(len(load_gen_thrd_list)):
            load_gen_thrd_list[i].quit()
        for i in range(len(load_gen_thrd_list)):
            load_gen_thrd_list[i].join()
            self.assertTrue(load_gen_thrd_list[i].isConsistent(), 'Inconsistent after migration')
Example #16
    def test_delete_smrlog_after_scaleout(self):
        util.print_frame()

        # start load generator
        util.log("start load_generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            self.load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
            self.load_gen_thrd_list[i].start()

        time.sleep(5) # generate load for 5 sec
        util.log("started load_generator")

        # servers for scale out
        servers = [config.server4, config.server5, config.server6]
        leader_cm = self.cluster['servers'][0]

        # Scale out
        cluster = config.clusters[0]
        ret = util.pg_add(cluster, servers, leader_cm)
        self.assertEqual(True, ret, 'Scale out fail. util.pg_add returns false')

        time.sleep(5)
        # pg0 -> pg1
        cluster = config.clusters[1]
        ret = util.migration(cluster, 0, 1, 8000, 8191, 40000)
        self.assertEqual(True, ret, 'Migration Fail 0 -> 1')

        # get log file
        old_logs = {}
        for s in config.clusters[0]['servers']:
            parent_dir, log_dir = util.smr_log_dir(s['id'])
            path = '%s/%s' % (parent_dir, log_dir)
            old_logs[s['id']] = util.ls(path)

        # bgsave so that smr logs can be deleted.
        for s in config.clusters[0]['servers']:
            bgsave_ret = util.bgsave(s)
            self.assertTrue(bgsave_ret, 'failed to bgsave. pgs%d' % s['id'])
            util.log('bgsave pgs%d is done.' % s['id'])

        # check consistency
        ok = True
        for j in range(len(self.load_gen_thrd_list)):
            self.assertTrue(self.load_gen_thrd_list[j].isConsistent(),
                    'Inconsistent after migration')

        # does smr-replicator delete smrlogs?
        i = 0
        while i < 20:
            i += 1
            # get current log files
            cur_logs = {}
            for s in config.clusters[0]['servers']:
                parent_dir, log_dir = util.smr_log_dir(s['id'])
                path = '%s/%s' % (parent_dir, log_dir)
                cur_logs[s['id']] = util.ls(path)

            # compare old and new
            temp_old_logs = copy.deepcopy(old_logs)
            for id, nl in cur_logs.items():
                ol = temp_old_logs.get(id)
                self.assertNotEqual(ol, None, "failed to check logfiles. old logs for smr-replicator '%d' do not exist." % id)

                for log in nl:
                    if log in ol:
                        ol.remove(log)

            ok = True
            for id, ol in temp_old_logs.items():
                if len(ol) == 0:
                    ok = False

            util.log('Loop %d ---------------------------------------------------------' % i)
            util.log('deleted smrlog files: %s' % util.json_to_str(temp_old_logs))

            if ok:
                break

            time.sleep(10)

        self.assertTrue(ok, 'smr-replicator does not delete smrlogs.')
        util.log('smr-replicator deletes smrlogs.')

        # check consistency of load_generator
        for i in range(len(self.load_gen_thrd_list)):
            self.load_gen_thrd_list[i].quit()
        for i in range(len(self.load_gen_thrd_list)):
            self.load_gen_thrd_list[i].join()
            self.assertTrue(self.load_gen_thrd_list[i].isConsistent(), 'Inconsistent after migration')
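Note: the comparison loop above snapshots the smr log directory, then
repeatedly diffs it against the current listing; a file that was in the
snapshot but is gone now was deleted by smr-replicator. The same diff as a
hypothetical standalone function:

    import copy

    def deleted_files(old_logs, cur_logs):
        # per server id: old log files that no longer appear in the
        # current listing, i.e. deleted since the snapshot
        deleted = copy.deepcopy(old_logs)
        for sid, current in cur_logs.items():
            remaining = deleted.get(sid, [])
            for name in current:
                if name in remaining:
                    remaining.remove(name)
        return deleted

    # ok = all(len(v) > 0 for v in deleted_files(old_logs, cur_logs).values())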
Example #17
    def test_scaleout(self):
        util.print_frame()

        # start load generator
        util.log("start load_generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            self.load_gen_thrd_list[i] = load_generator.LoadGenerator(
                i, ip, port)
            self.load_gen_thrd_list[i].start()

        time.sleep(5)  # generate load for 5 sec
        util.log("started load_generator")

        # servers for scale out
        servers = [config.server4, config.server5, config.server6]
        leader_cm = self.cluster['servers'][0]

        # start migration
        migration_count = 5
        for i in range(migration_count):
            # Scale out
            cluster = config.clusters[0]
            ret = util.pg_add(cluster, servers, leader_cm)
            self.assertEqual(True, ret,
                             'Scale out fail. util.pg_add returns false')

            time.sleep(5)
            # pg0 -> pg1
            cluster = config.clusters[1]
            ret = util.migration(cluster, 0, 1, 4096, 8191, 40000)
            self.assertEqual(True, ret, 'Migration Fail 0 -> 1')

            # pg0 <- pg1
            cluster = config.clusters[1]
            ret = util.migration(cluster, 1, 0, 4096, 8191, 40000)
            self.assertEqual(True, ret, 'Migration Fail 1 <- 0')

            # Scale in

            #TODO Temporary
            #cluster = config.clusters[0]
            #for server in cluster['servers']:
            #    if testbase.request_to_shutdown_hbc(server) is not 0:
            #        util.log('scale in : failed to request to shutdown hbc')
            #        self.assertFalse('scale in : failed to request to shutdown hbc')
            #time.sleep(5)
            ###############

            cluster = config.clusters[1]
            ret = util.pg_del(cluster, servers, leader_cm)
            self.assertEqual(True, ret,
                             'Scale in fail. util.pg_del returns false')

            #TODO Temporary
            #cluster = config.clusters[0]
            #for server in cluster['servers']:
            #    if testbase.request_to_start_heartbeat_checker( server ) is not 0:
            #        util.log('scale in : failed to start hbc')
            #        self.assertFalse('scale in : failed to start hbc')
            #time.sleep(5)
            ###############

            # check consistency
            ok = True
            for j in range(len(self.load_gen_thrd_list)):
                if self.load_gen_thrd_list[j].isConsistent() == False:
                    ok = False
                    break
            if not ok:
                break

        time.sleep(5)  # generate load for 5 sec
        # check consistency of load_generator
        for i in range(len(self.load_gen_thrd_list)):
            self.load_gen_thrd_list[i].quit()
        for i in range(len(self.load_gen_thrd_list)):
            self.load_gen_thrd_list[i].join()
            self.assertTrue(self.load_gen_thrd_list[i].isConsistent(),
                            'Inconsistent after migration')
Example #18
    def state_transition(self):
        server = util.get_server_by_role(self.cluster['servers'], 'slave')
        self.assertNotEquals(server, None,
                             'failed to get_server_by_role-slave')

        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])

        # check initial state
        state = self.get_expected_smr_state(server, 'N')
        role = util.get_role_of_server(server)
        self.assertEquals(
            'N', state, 'server%d - state:%s, role:%s, expected:N' %
            (server['id'], state, role))

        # shutdown
        ret = testbase.request_to_shutdown_smr(server)
        self.assertEquals(ret, 0, 'failed to shutdown smr')
        ret = testbase.request_to_shutdown_redis(server)
        self.assertEquals(ret, 0, 'failed to shutdown redis')
        time.sleep(3)

        # check state F
        expected = 'F'
        state = self.get_expected_smr_state(server, expected)
        self.assertEquals(
            expected, state, 'server%d - state:%s, but expected:%s' %
            (server['id'], state, expected))

        # set value
        ret = gw.connect(ip, port)
        self.assertEquals(ret, 0,
                          'failed to connect to gateway, %s:%d' % (ip, port))
        timestamp = 0.0
        for i in range(0, 100):
            timestamp = time.time()
            key = 'new_key_haha'
            cmd = 'set %s %f\r\n' % (key, timestamp)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEquals(res, '+OK\r\n')
        gw.disconnect()

        # recovery
        ret = testbase.request_to_start_smr(server)
        self.assertEquals(ret, 0, 'failed to start smr')

        ret = testbase.request_to_start_redis(server)
        self.assertEquals(ret, 0, 'failed to start redis')

        ret = testbase.wait_until_finished_to_set_up_role(server, 10)
        self.assertEquals(ret, 0,
                          'failed to role change. smr_id:%d' % (server['id']))
        time.sleep(5)

        redis = redis_mgmt.Redis(server['id'])
        ret = redis.connect(server['ip'], server['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')

        # check state N
        expected = 'N'
        max_try = 20
        for i in range(0, max_try):
            state = self.get_expected_smr_state(server, expected)
            if state == expected:
                break
            time.sleep(1)
        role = util.get_role_of_server(server)
        self.assertEquals(
            expected, state, 'server%d - state:%s, role:%s, but expected:%s' %
            (server['id'], state, role, expected))
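Note: the "check state N" block is a generic poll-until-expected loop; the
same pattern appears in several tests here. As a hypothetical reusable
helper:

    import time

    def wait_for_state(probe, expected, max_try=20, interval=1):
        # poll probe() until it returns `expected` or attempts run out;
        # return the last observed state so the caller can assert on it
        state = probe()
        for _ in range(max_try):
            if state == expected:
                break
            time.sleep(interval)
            state = probe()
        return state

    # state = wait_for_state(lambda: self.get_expected_smr_state(server, 'N'), 'N')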
Example #19
    def consistent_after_failover(self):
        max = 10000
        wait_count = 15
        key = 'caf'

        # get master, slave1, and slave2
        master, slave1, slave2 = self.get_mss()

        # set value
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(ip)
        gw.connect(ip, port)

        for i in range(0, max):
            cmd = 'set %s%d %d\r\n' % (key, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEquals(res, '+OK\r\n')
        time.sleep(5)

        # shutdown
        servers = [master, slave1, slave2]
        for server in servers:

            util.log('before shutdown pgs%d' % server['id'])
            for s in servers:
                self.getseq_log(s)

            ret = testbase.request_to_shutdown_smr(server)
            self.assertEqual(
                ret, 0, 'failed to shutdown smr, server:%d' % server['id'])
            ret = testbase.request_to_shutdown_redis(server)
            self.assertEquals(ret, 0, 'failed to shutdown redis')
        time.sleep(5)

        # check state F
        for server in servers:
            state = self.get_expected_smr_state(server, 'F')
            self.assertEquals('F', state,
                              'server%d - state:%s' % (server['id'], state))

        # recovery
        for server in servers:
            ret = testbase.request_to_start_smr(server)
            self.assertEqual(ret, 0,
                             'failed to start smr, server:%d' % server['id'])

            ret = testbase.request_to_start_redis(server, False)
            self.assertEqual(ret, 0,
                             'failed to start redis, server:%d' % server['id'])

            util.log('after restart pgs%d' % server['id'])
            for s in servers:
                self.getseq_log(s)

        time.sleep(5)

        # wait for master election
        for i in xrange(10):
            ret = util.check_cluster(self.cluster['cluster_name'],
                                     self.leader_cm['ip'],
                                     self.leader_cm['cm_port'])
            if ret:
                break
            time.sleep(1)

        # check state
        for server in servers:
            ret = testbase.wait_until_finished_to_set_up_role(
                server, wait_count)
            self.assertEquals(
                ret, 0, 'failed to role change. server:%d' % (server['id']))

            state = self.get_expected_smr_state(server, 'N')
            role = util.get_role_of_server(server)
            self.assertEquals(
                'N', state,
                'server%d - state:%s, role:%s' % (server['id'], state, role))

        the_number_of_master = 0
        the_number_of_slave = 0
        for server in servers:
            role = util.get_role_of_server(server)
            if role == c.ROLE_MASTER:
                the_number_of_master = the_number_of_master + 1
            elif role == c.ROLE_SLAVE:
                the_number_of_slave = the_number_of_slave + 1
        self.assertTrue(
            1 == the_number_of_master and 2 == the_number_of_slave,
            'failed to set roles, the number of master:%d, the number of slave:%d'
            % (the_number_of_master, the_number_of_slave))

        # get master, slave1, and slave2
        master, slave1, slave2 = self.get_mss()

        # connect to a master's redis and set data
        redis = redis_mgmt.Redis(master['id'])
        ret = redis.connect(master['ip'], master['redis_port'])
        self.assertEquals(
            ret, 0, 'failed to connect to redis, server:%d' % master['id'])

        for i in range(max, max * 2):
            cmd = 'set %s%d %d\r\n' % (key, i, i)
            redis.write(cmd)
            res = redis.read_until('\r\n')
            self.assertEquals(
                res, '+OK\r\n',
                'failed to get response, server:%d' % master['id'])
        redis.disconnect()

        # check slaves' data
        slaves = [slave1, slave2]
        for slave in slaves:
            slave_redis = redis_mgmt.Redis(slave['id'])
            ret = slave_redis.connect(slave['ip'], slave['redis_port'])
            self.assertEquals(
                ret, 0, 'failed to connect to redis, server:%d' % slave['id'])

            for i in range(0, max * 2):
                cmd = 'get %s%d\r\n' % (key, i)
                slave_redis.write(cmd)
                trash = slave_redis.read_until('\r\n')
                res = slave_redis.read_until('\r\n')
                self.assertEquals(
                    res, '%d\r\n' % i,
                    'inconsistent, server:%d, expected %d but %s' %
                    (slave['id'], i, res))
            slave_redis.disconnect()
Example #20
    def test_migration_with_expire_command(self):
        util.print_frame()

        util.log("start load_generator")
        load_gen_thrd_list = {}
        for i in range(1):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
            load_gen_thrd_list[i].start()

        time.sleep(5) # generate load for 5 sec
        tps = 20000
        src_pg_id = 0
        dst_pg_id = 1
        leader_cm = self.cluster['servers'][0]
        src_master = util.get_server_by_role_and_pg(self.cluster['servers'], 'master', src_pg_id)
        dst_master = util.get_server_by_role_and_pg(self.cluster['servers'], 'master', dst_pg_id)

        smr = smr_mgmt.SMR(src_master['id'])
        ret = smr.connect(src_master['ip'], src_master['smr_mgmt_port'])
        if ret != 0:
            util.log('failed to connect to smr(source master)')
            return False

        src_redis = redis_mgmt.Redis(src_master['id'])
        ret = src_redis.connect(src_master['ip'], src_master['redis_port'] )
        self.assertEquals( ret, 0, 'failed to connect to redis' )

        dst_redis = redis_mgmt.Redis(dst_master['id'])
        ret = dst_redis.connect(dst_master['ip'], dst_master['redis_port'] )
        self.assertEquals( ret, 0, 'failed to connect to redis' )

        ts = time.time()
        self.setExpireKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:expired', 10)
        self.setExpireKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:persist', 20)
        self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired', 10)
        self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist', 20)

        self.setExpireS3Key(src_redis, 'S3:PermanentKey', 0)

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")

        util.log(">>> migrate test with expire command start(%s), ts:%d" % (time.asctime(), ts))

        ts = time.time()
        self.setExpireKey(src_redis, 'beforeCheckpoint~afterCheckpoint:expired', 10)
        self.setExpireKey(src_redis, 'beforeCheckpoint~afterCheckpoint:persist', 20)
        self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired', 10)
        self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist', 20)

        # notify dst_redis of migration start
        util.log(">>> notify dst_redis of migration start (%s)" % time.asctime())

        cmd = 'migconf migstart %d-%d\r\n' % (0, 8191)
        dst_redis.write(cmd)
        res = dst_redis.read_until('\r\n')
        self.assertEquals( res, '+OK\r\n' )

        # remote partial checkpoint
        util.log(">>> start remote checkpoint and load (%s)" % time.asctime())
        cmd = "./cluster-util --getandplay %s %d %s %d %d-%d %d" % (
                    src_master['ip'], src_master['redis_port'],
                    dst_master['ip'], dst_master['redis_port'],
                    0, 8191, tps)
        p = util.exec_proc_async(util.cluster_util_dir(src_master['id']), cmd, True, None, subprocess.PIPE, None)

        ret = p.wait()
        for line in p.stdout:
            if line.find("Checkpoint Sequence Number:") != -1:
                util.log("seqnumber : " + line[line.rfind(":")+1:])
                seq = int(line[line.rfind(":")+1:])
            util.log(">>>" + str(line.rstrip()))

        self.assertEqual(0, ret)
        util.log(">>> end remote checkpoint and load (%s)" % time.asctime())

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(src_redis, 'beforeCheckpoint~afterCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis, 'beforeCheckpoint~afterCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")

        # bgsave for testing later about recovery during migration
        util.log(">>> bgsave for testing later about recovery during migration (%s)" % time.asctime())
        cmd = 'bgsave\r\n'
        dst_redis.write(cmd)
        res = dst_redis.read_until('\r\n')
        self.assertEquals( res, '+Background saving started\r\n' )

        ts = time.time()
        self.setExpireKey(src_redis, 'afterCheckpoint~afterCheckpoint:expired', 10)
        self.setExpireKey(src_redis, 'afterCheckpoint~afterCheckpoint:persist', 20)
        self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:expired', 10)
        self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:persist', 20)

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(src_redis, 'afterCheckpoint~afterCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis, 'afterCheckpoint~afterCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        self.setExpireKey(src_redis, 'afterCheckpoint~duringCatchup:expired', 10)
        self.setExpireKey(src_redis, 'afterCheckpoint~duringCatchup:persist', 100)
        self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:expired', 10)
        self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:persist', 100)

        # remote catchup (smr log migration)
        util.log(">>> start remote catchup (%s)" % time.asctime())

        dst_host = dst_master['ip']
        dst_smr_port = dst_master['smr_base_port']
        rle = '1 8192'
        num_part = 8192

        smr.write('migrate start %s %d %d %d %d %s\r\n' % (dst_host, dst_smr_port,
                                                     seq, tps, num_part, rle))
        response = smr.read_until('\r\n')
        if response[:3] != '+OK':
            util.log('failed to execute migrate start command, response:%s' % response)
            return False

        while True:
            smr.write('migrate info\r\n')
            response = smr.read_until('\r\n')
            seqs = response.split()
            logseq = int(seqs[1].split(':')[1])
            mig = int(seqs[2].split(':')[1])
            util.log('migrate info: %s' % response)
            if (logseq-mig < 500000):
                util.log('Remote catchup almost done. try mig2pc')
                break
            time.sleep(1)

        util.log(">>> sleep until 90 sec pass")
        self.assertFalse(time.time() - ts >= 90)
        time.sleep(90 - (time.time() - ts))

        res = self.persistKey(src_redis, 'afterCheckpoint~duringCatchup:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis, 'afterCheckpoint~duringCatchup:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        self.setExpireKey(src_redis, 'duringCatchup~duringCatchup:expired', 10)
        self.setExpireKey(src_redis, 'duringCatchup~duringCatchup:persist', 20)
        self.setExpireS3Key(src_redis, 'S3:duringCatchup~duringCatchup:expired', 10)
        self.setExpireS3Key(src_redis, 'S3:duringCatchup~duringCatchup:persist', 20)

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(src_redis, 'duringCatchup~duringCatchup:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis, 'duringCatchup~duringCatchup:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(src_redis, 'S3:duringCatchup~duringCatchup:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(src_redis, 'S3:duringCatchup~duringCatchup:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        self.setExpireKey(src_redis, 'duringCatchup~afterMig2pc:expired', 10)
        self.setExpireKey(src_redis, 'duringCatchup~afterMig2pc:persist', 20)
        self.setExpireS3Key(src_redis, 'S3:duringCatchup~afterMig2pc:expired', 10)
        self.setExpireS3Key(src_redis, 'S3:duringCatchup~afterMig2pc:persist', 20)

        util.log(">>> remote catchup phase almost done (%s)" % time.asctime())

        # mig2pc
        util.log(">>> start mig2pc (%s)" % time.asctime())

        cmd = 'mig2pc %s %d %d %d %d' % (self.cluster['cluster_name'], src_pg_id, dst_pg_id,
                                         0, 8191)
        result = util.cm_command(leader_cm['ip'], leader_cm['cm_port'], cmd)
        util.log('mig2pc result : ' + result)
        if not result.startswith('{"state":"success","msg":"+OK"}\r\n'):
            util.log('failed to execute mig2pc command, result:%s' % result)
            return False

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(dst_redis, 'duringCatchup~afterMig2pc:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(dst_redis, 'duringCatchup~afterMig2pc:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(dst_redis, 'S3:duringCatchup~afterMig2pc:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(dst_redis, 'S3:duringCatchup~afterMig2pc:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        self.setExpireKey(dst_redis, 'afterMig2pc~migrateEnd:expired', 10)
        self.setExpireKey(dst_redis, 'afterMig2pc~migrateEnd:persist', 20)
        self.setExpireS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:expired', 10)
        self.setExpireS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:persist', 20)

        # finish migration
        smr.write('migrate interrupt\r\n')
        response = smr.read_until('\r\n')
        util.log('migrate interrupt: %s' % response)
        smr.disconnect()

        # notify dst_redis of migration end
        util.log(">>> notify dst_redis of migration end (%s)" % time.asctime())

        cmd = 'migconf migend\r\n'
        dst_redis.write(cmd)
        res = dst_redis.read_until('\r\n')
        self.assertEquals( res, '+OK\r\n' )

        cmd = 'migconf clearstart %d-%d\r\n' % (0, 8191)
        src_redis.write(cmd)
        res = src_redis.read_until('\r\n')
        self.assertEquals( res, '+OK\r\n' )

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(dst_redis, 'afterMig2pc~migrateEnd:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(dst_redis, 'afterMig2pc~migrateEnd:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        self.assertTrue(self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired'))

        self.assertTrue(self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:expired'))

        self.assertTrue(self.isExist(dst_redis, 'duringCatchup~duringCatchup:persist'))
        self.assertFalse(self.isExist(dst_redis, 'duringCatchup~duringCatchup:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:expired'))

        self.assertTrue(self.isExist(dst_redis, 'duringCatchup~afterMig2pc:persist'))
        self.assertFalse(self.isExist(dst_redis, 'duringCatchup~afterMig2pc:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:expired'))

        self.assertTrue(self.isExist(dst_redis, 'afterMig2pc~migrateEnd:persist'))
        self.assertFalse(self.isExist(dst_redis, 'afterMig2pc~migrateEnd:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:expired'))

        # delete the migrated range from the source (rangedel)
        util.log(">>> start rangedel (%s)" % time.asctime())
        cmd = "./cluster-util --rangedel %s %d %d-%d %d" % (
                    src_master['ip'], src_master['redis_port'],
                    0, 8191, tps)
        p = util.exec_proc_async(util.cluster_util_dir(src_master['id']), cmd, True, None, subprocess.PIPE, None)
        ret = p.wait()

        for line in p.stdout:
            util.log(">>>" + str(line.rstrip()))

        cmd = 'migconf clearend\r\n'
        src_redis.write(cmd)
        res = src_redis.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n')

        time.sleep(5) # generate load for 5 sec
        # check consistency of load_generator
        for i in range(len(load_gen_thrd_list)):
            load_gen_thrd_list[i].quit()
        for i in range(len(load_gen_thrd_list)):
            load_gen_thrd_list[i].join()
            self.assertTrue(load_gen_thrd_list[i].isConsistent(), 'Inconsistent after migration')

        # kill dst_redis and recover from bgsave
        util.log(">>> kill dst_redis and recover from bgsave (%s)" % time.asctime())

        dst_redis.disconnect()
        ret = testbase.request_to_shutdown_redis(dst_master)
        self.assertEquals( ret, 0, 'failed to shutdown redis' )
        ret = testbase.request_to_shutdown_smr(dst_master)
        self.assertEquals(ret, 0, 'failed to shutdown smr')
        time.sleep(5)

        ret = testbase.request_to_start_smr(dst_master)
        self.assertEqual( ret, 0, 'failed to start smr, server:%d' % dst_master['id'] )

        ret = testbase.request_to_start_redis(dst_master)
        self.assertEqual( ret, 0, 'failed to start redis, server:%d' % dst_master['id']  )

        ret = testbase.wait_until_finished_to_set_up_role(dst_master)
        self.assertEquals( ret, 0, 'failed to role change. server:%d' % (dst_master['id']) )

        dst_redis = redis_mgmt.Redis(dst_master['id'])
        ret = dst_redis.connect(dst_master['ip'], dst_master['redis_port'] )
        self.assertEquals( ret, 0, 'failed to connect to redis' )

        self.assertTrue(self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired'))

        self.assertTrue(self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:expired'))

        self.assertTrue(self.isExist(dst_redis, 'duringCatchup~duringCatchup:persist'))
        self.assertFalse(self.isExist(dst_redis, 'duringCatchup~duringCatchup:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:expired'))

        self.assertTrue(self.isExist(dst_redis, 'duringCatchup~afterMig2pc:persist'))
        self.assertFalse(self.isExist(dst_redis, 'duringCatchup~afterMig2pc:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:expired'))

        self.assertTrue(self.isExist(dst_redis, 'afterMig2pc~migrateEnd:persist'))
        self.assertFalse(self.isExist(dst_redis, 'afterMig2pc~migrateEnd:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:expired'))

        self.getS3TTL(dst_redis, 'S3:PermanentKey')

        # kill dst_slave redis and recover without dump file
        util.log(">>> kill dst_redis and recover without dump file (%s)" % time.asctime())
        dst_slave = util.get_server_by_role_and_pg(self.cluster['servers'], 'slave', dst_pg_id)

        ret = testbase.request_to_shutdown_redis(dst_slave)
        self.assertEquals( ret, 0, 'failed to shutdown redis' )
        ret = testbase.request_to_shutdown_smr(dst_slave)
        self.assertEquals(ret, 0, 'failed to shutdown smr')
        time.sleep(5)

        ret = testbase.request_to_start_smr(dst_slave)
        self.assertEqual( ret, 0, 'failed to start smr, server:%d' % dst_slave['id'] )

        ret = testbase.request_to_start_redis(dst_slave)
        self.assertEqual( ret, 0, 'failed to start redis, server:%d' % dst_slave['id']  )

        ret = testbase.wait_until_finished_to_set_up_role(dst_slave)
        self.assertEquals( ret, 0, 'failed to role change. server:%d' % (dst_slave['id']) )

        dst_redis_slave = redis_mgmt.Redis(dst_slave['id'])
        ret = dst_redis_slave.connect(dst_slave['ip'], dst_slave['redis_port'] )
        self.assertEquals( ret, 0, 'failed to connect to redis' )

        self.assertTrue(self.isExist(dst_redis_slave, 'beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(self.isExist(dst_redis_slave, 'beforeCheckpoint~beforeCheckpoint:expired'))
        self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~beforeCheckpoint:expired'))

        self.assertTrue(self.isExist(dst_redis_slave, 'beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(self.isExist(dst_redis_slave, 'beforeCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(self.isExist(dst_redis_slave, 'afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(self.isExist(dst_redis_slave, 'afterCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(self.isExist(dst_redis_slave, 'afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(self.isExist(dst_redis_slave, 'afterCheckpoint~duringCatchup:expired'))
        self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~duringCatchup:expired'))

        self.assertTrue(self.isExist(dst_redis_slave, 'duringCatchup~duringCatchup:persist'))
        self.assertFalse(self.isExist(dst_redis_slave, 'duringCatchup~duringCatchup:expired'))
        self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~duringCatchup:persist'))
        self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~duringCatchup:expired'))

        self.assertTrue(self.isExist(dst_redis_slave, 'duringCatchup~afterMig2pc:persist'))
        self.assertFalse(self.isExist(dst_redis_slave, 'duringCatchup~afterMig2pc:expired'))
        self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~afterMig2pc:persist'))
        self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~afterMig2pc:expired'))

        self.assertTrue(self.isExist(dst_redis_slave, 'afterMig2pc~migrateEnd:persist'))
        self.assertFalse(self.isExist(dst_redis_slave, 'afterMig2pc~migrateEnd:expired'))
        self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:afterMig2pc~migrateEnd:persist'))
        self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:afterMig2pc~migrateEnd:expired'))

        self.getS3TTL(dst_redis_slave, 'S3:PermanentKey')

        # Go back to initial configuration
        self.assertTrue(util.migration(self.cluster, dst_pg_id, src_pg_id, 0, 8191, 40000),
                'failed to rollback migration')
Example #21
    def test_5_transfer_pgs_to_another_machine(self):
        util.print_frame()

        self.load_gen_list = {}

        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # increase master generation number
        util.log('failover in order to increase master generation number.')
        next_key = 0
        for _ in range(5):
            key_base = 'key'
            for i in range(next_key, next_key + 10000):
                cmd = 'set %s%d %d\r\n' % (key_base, i, i)
                gw.write(cmd)
                res = gw.read_until('\r\n')
                self.assertEquals(res, '+OK\r\n')
            next_key = next_key + 10000

            m = util.get_server_by_role(self.cluster['servers'], 'master')
            util.log('failover pgs%d' % m['id'])
            ret = util.failover(m, self.leader_cm)
            self.assertTrue(ret, 'failed to failover pgs%d' % m['id'])

        # start load generator
        util.log("start load_generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            self.load_gen_list[i] = load_generator.LoadGenerator(i, ip, port)
            self.load_gen_list[i].start()

        time.sleep(5)  # generate load for 5 sec
        util.log("started load_generator")

        m, s1, s2 = util.get_mss(self.cluster)
        servers = [m, s1, s2]

        # bgsave
        for s in servers:
            ret = util.bgsave(s)
            self.assertTrue(ret, 'failed to bgsave. pgs%d' % s['id'])

        new_servers = [config.server4, config.server5]

        # add new slaves
        for s in new_servers:
            util.log('delete pgs%d`s check point.' % s['id'])
            util.del_dumprdb(s['id'])

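            # pull a fresh checkpoint for slots 0-8191 from the current master
            # so the new pgs can bootstrap from it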
            ret = util.cluster_util_getdump(s['id'], m['ip'], m['redis_port'],
                                            'dump.rdb', 0, 8191)
            self.assertEqual(
                True, ret,
                'failed : util.cluster_util_getdump returns false, src=%s:%d dest_pgsid=%d'
                % (m['ip'], m['redis_port'], s['id']))

            ret = util.install_pgs(self.cluster,
                                   s,
                                   self.leader_cm,
                                   0,
                                   rm_ckpt=False)
            self.assertEqual(
                True, ret,
                'failed : util.pgs_add returns false, pgsid=%d' % s['id'])
            util.log('succeeded : add a new slave, pgsid=%d' % s['id'])

            # check consistency
            ok = True
            for j in range(self.max_load_generator):
                if self.load_gen_list[j].isConsistent() == False:
                    ok = False
                    break
            if not ok:
                break

        for server_to_del in servers:
            for s in servers:
                util.pingpong(s['ip'], s['smr_mgmt_port'])
            for s in new_servers:
                util.pingpong(s['ip'], s['smr_mgmt_port'])
            self.__del_server(server_to_del)
            util.log('succeeded : delete pgs%d' % server_to_del['id'])

        new_m = util.get_server_by_role(new_servers, 'master')
        new_s = util.get_server_by_role(new_servers, 'slave')
        self.assertNotEqual(new_m, None, 'master is None.')
        self.assertNotEqual(new_s, None, 'slave is None.')

        for s in new_servers:
            util.pingpong(s['ip'], s['smr_mgmt_port'])

        time.sleep(5)  # generate load for 5 sec
        # check consistency of load_generator
        for i in range(self.max_load_generator):
            self.load_gen_list[i].quit()
        for i in range(self.max_load_generator):
            self.load_gen_list[i].join()
            self.assertTrue(self.load_gen_list[i].isConsistent(),
                            'Inconsistent after migration')
            self.load_gen_list.pop(i, None)

        # Go back to initial configuration
        # recover pgs
        for s in servers:
            self.assertTrue(
                util.install_pgs(self.cluster,
                                 s,
                                 self.leader_cm,
                                 rm_ckpt=False),
                'failed to recover pgs. (install_pgs)')

        # cleanup new slaves
        for s in new_servers:
            self.assertTrue(
                util.uninstall_pgs(self.cluster, s, self.leader_cm),
                'failed to cleanup pgs. (uninstall_pgs)')
Example #22
    def master_failover_while_hang(self):
        util.print_frame()

        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # set values
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        # get master, slave1, slave2
        if len(self.cluster['servers']) == 3:
            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')
            self.assertNotEqual(s2, None, 'slave2 is None.')
        else:
            m, s1 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')

        util.log('server state before hang')
        util.log_server_state(self.cluster)

        self.failover_while_hang(m)

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        redis1 = redis_mgmt.Redis(m['id'])
        ret = redis1.connect(m['ip'], m['redis_port'])
        self.assertEqual(
            ret, 0,
            'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']))

        # set new values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write(cmd)
            res = redis1.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values to redis1. cmd:%s, res:%s' %
                (cmd[:-2], res))

        if len(self.cluster['servers']) == 3:
            redis2 = redis_mgmt.Redis(s2['id'])
            ret = redis2.connect(s2['ip'], s2['redis_port'])
            self.assertEqual(
                ret, 0, 'failed to connect to redis(%s:%d).' %
                (s2['ip'], s2['redis_port']))

            # check new values
            for i in range(10000, 20000):
                cmd = 'get %s%d\r\n' % (self.key_base, i)
                redis2.write(cmd)
                redis2.read_until('\r\n')
                res = redis2.read_until('\r\n')
                self.assertEqual(
                    res, '%d\r\n' % i,
                    'failed to get values from redis2. %s != %d' % (res, i))
            util.log(
                'succeeded : check values with set/get operations with pgs%d and pgs%d.'
                % (m['id'], s2['id']))

        redis0 = redis_mgmt.Redis(m['id'])
        ret = redis0.connect(m['ip'], m['redis_port'])
        self.assertEquals(
            ret, 0,
            'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']))

        # check new values
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write(cmd)
            redis0.read_until('\r\n')
            res = redis0.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis0. %s != %d' % (res[:-2], i))

        # check consistency
        self.assertEqual(
            util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                               self.mgmt_port), True, 'role consistency fail')

        return 0
Example #23
    def master_hang(self):
        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # set values
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        # get master, slave1, slave2
        if len(self.cluster['servers']) == 3:
            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')
            self.assertNotEqual(s2, None, 'slave2 is None.')
        else:
            m, s1 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')

        util.log('server state before hang')
        util.log_server_state(self.cluster)

        # hang
        smr = smr_mgmt.SMR(m['id'])
        ret = smr.connect(m['ip'], m['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to master. %s:%d' %
            (m['ip'], m['smr_mgmt_port']))
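        # 'fi delay sleep 1 10000' injects an artificial delay (likely 10000 ms)
        # into smr to simulate a hang; the check below detects builds without
        # fault-injection support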
        smr.write('fi delay sleep 1 10000\r\n')
        reply = smr.read_until('\r\n', 1)
        if reply is not None and reply.find('-ERR not supported') != -1:
            self.fail('make sure that smr has been compiled with the gcov option.')

        time.sleep(5)

        # wait for forced master election
        success = False
        for i in range(20):
            role = util.get_role_of_server(s1)
            if role == c.ROLE_MASTER:
                success = True
                break

            if len(self.cluster['servers']) == 3:
                role = util.get_role_of_server(s2)
                if role == c.ROLE_MASTER:
                    success = True
                    break
            time.sleep(1)

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        self.assertEqual(success, True, 'failed to forced master election')

        redis1 = redis_mgmt.Redis(s1['id'])
        ret = redis1.connect(s1['ip'], s1['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s1['ip'], s1['redis_port']))

        # set new values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write(cmd)
            res = redis1.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values to redis1. cmd:%s, res:%s' %
                (cmd[:-2], res))

        if len(self.cluster['servers']) == 3:
            redis2 = redis_mgmt.Redis(s2['id'])
            ret = redis2.connect(s2['ip'], s2['redis_port'])
            self.assertEqual(
                ret, 0, 'failed to connect to redis(%s:%d).' %
                (s2['ip'], s2['redis_port']))

            # check new values
            for i in range(10000, 20000):
                cmd = 'get %s%d\r\n' % (self.key_base, i)
                redis2.write(cmd)
                redis2.read_until('\r\n')
                res = redis2.read_until('\r\n')
                self.assertEqual(
                    res, '%d\r\n' % i,
                    'failed to get values from redis2. %s != %d' % (res, i))

        # check if the hanging server recovered and joined as a slave
        time.sleep(7)
        role = util.get_role_of_server(m)
        self.assertEqual(role, c.ROLE_SLAVE, 'failed to join as a slave')

        redis0 = redis_mgmt.Redis(m['id'])
        ret = redis0.connect(m['ip'], m['redis_port'])
        self.assertEquals(
            ret, 0,
            'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']))

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        # check new values
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write(cmd)
            redis0.read_until('\r\n')
            res = redis0.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis0. %s != %d' % (res[:-2], i))

        # check consistency
        self.assertEqual(
            util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                               self.mgmt_port), True, 'role consistency fail')

        return 0
Example #24
    def test_all_pgs_hang(self):
        util.print_frame()

        self.setup_test_cluster(self.cluster_3copy)

        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # set values
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')

        util.log('server state before hang')
        util.log_server_state(self.cluster)

        # hang
        smr_master = smr_mgmt.SMR(m['id'])
        ret = smr_master.connect(m['ip'], m['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to master. %s:%d' %
            (m['ip'], m['smr_mgmt_port']))
        smr_slave1 = smr_mgmt.SMR(s1['id'])
        ret = smr_slave1.connect(s1['ip'], s1['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to slave1. %s:%d' %
            (s1['ip'], s1['smr_mgmt_port']))
        smr_slave2 = smr_mgmt.SMR(s2['id'])
        ret = smr_slave2.connect(s2['ip'], s2['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to slave2. %s:%d' %
            (s2['ip'], s2['smr_mgmt_port']))

        m_ts = util.get_timestamp_of_pgs(m)
        s1_ts = util.get_timestamp_of_pgs(s1)
        s2_ts = util.get_timestamp_of_pgs(s2)

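        # freeze the master, then both slaves, for 8000 ms each via fault
        # injection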
        smr_master.write('fi delay sleep 1 8000\r\n')
        reply = smr_master.read_until('\r\n', 1)
        if reply is not None and reply.find('-ERR not supported') != -1:
            self.fail('make sure that smr has been compiled with the gcov option.')

        smr_slave1.write('fi delay sleep 1 8000\r\n')
        smr_slave2.write('fi delay sleep 1 8000\r\n')

        time.sleep(10)

        # check consistency
        ok = False
        for try_cnt in xrange(20):
            ok = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                                    self.mgmt_port)
            if ok:
                break
            time.sleep(0.5)
        self.assertTrue(ok, 'Unstable cluster state')

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        redis0 = redis_mgmt.Redis(m['id'])
        ret = redis0.connect(m['ip'], m['redis_port'])
        self.assertEqual(
            ret, 0,
            'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']))

        # set values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis0.write(cmd)
            res = redis0.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        redis1 = redis_mgmt.Redis(s1['id'])
        ret = redis1.connect(s1['ip'], s1['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s1['ip'], s1['redis_port']))

        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s2['ip'], s2['redis_port']))

        # check new values (m)
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write(cmd)
            redis0.read_until('\r\n')
            res = redis0.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis(id:%d). %s != %d' %
                (m['id'], res, i))

        # check new values (s1)
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis1.write(cmd)
            redis1.read_until('\r\n')
            res = redis1.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis(id:%d). %s != %d' %
                (s1['id'], res[:-2], i))

        # check new values (s2)
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write(cmd)
            redis2.read_until('\r\n')
            res = redis2.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis(id:%d). %s != %d' %
                (s2['id'], res[:-2], i))

        # check consistency
        ok = False
        for try_cnt in range(0, 10):
            ok = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                                    self.mgmt_port)
            util.log('check_cluster: %s' % ok)
            if ok:
                break
            time.sleep(1)
        self.assertEqual(ok, True, 'role consistency fail')

        return 0
Example #25
    def test_two_slaves_hang(self):
        util.print_frame()

        self.setup_test_cluster(self.cluster_3copy)

        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # set values
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')

        util.log('server state before hang')
        util.log_server_state(self.cluster)

        # timestamp before hang
        ts_before1 = util.get_timestamp_of_pgs(s1)
        self.assertNotEqual(
            ts_before1, -1,
            'failed to get a timestamp of pgs(%d), ts_before:%d' %
            (s1['id'], ts_before1))

        ts_before2 = util.get_timestamp_of_pgs(s2)
        self.assertNotEqual(
            ts_before2, -1,
            'failed to get a timestamp of pgs(%d), ts_before:%d' %
            (s2['id'], ts_before2))

        # hang
        smr1 = smr_mgmt.SMR(s1['id'])
        ret = smr1.connect(s1['ip'], s1['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to slave1. %s:%d' %
            (s1['ip'], s1['smr_mgmt_port']))

        smr2 = smr_mgmt.SMR(s2['id'])
        ret = smr2.connect(s2['ip'], s2['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to slave2. %s:%d' %
            (s2['ip'], s2['smr_mgmt_port']))

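        # freeze both slaves for 8000 ms; the master keeps running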
        smr1.write('fi delay sleep 1 8000\r\n')
        reply = smr1.read_until('\r\n', 1)
        if reply is not None and reply.find('-ERR not supported') != -1:
            self.fail('make sure that smr has been compiled with the gcov option.')

        smr2.write('fi delay sleep 1 8000\r\n')
        time.sleep(7)

        success = False
        for i in xrange(20):
            ret = util.check_cluster(self.cluster['cluster_name'],
                                     self.mgmt_ip,
                                     self.mgmt_port,
                                     check_quorum=True)
            if ret:
                success = True
                break
            time.sleep(1)
        self.assertEqual(success, True, 'unstable cluster')

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss(self.cluster)

        redis1 = redis_mgmt.Redis(s1['id'])
        ret = redis1.connect(s1['ip'], s1['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s1['ip'], s1['redis_port']))

        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s2['ip'], s2['redis_port']))

        # set new values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write(cmd)
            res = redis1.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values to redis1. cmd:%s, res:%s' %
                (cmd[:-2], res))

        # check new values
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write(cmd)
            redis2.read_until('\r\n')
            res = redis2.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis2. %s != %d' % (res, i))

        # check consistency
        self.assertEqual(
            util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                               self.mgmt_port), True, 'role consistency fail')

        return 0
Example #26
    def master_and_slave_hang(self):
        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # set values
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        # get master, slave1, slave2
        if len(self.cluster['servers']) == 3:
            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')
            self.assertNotEqual(s2, None, 'slave2 is None.')
        else:
            m, s1 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')

        util.log('server state before hang')
        util.log_server_state(self.cluster)

        # hang
        smr_master = smr_mgmt.SMR(m['id'])
        ret = smr_master.connect(m['ip'], m['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to master. %s:%d' %
            (m['ip'], m['smr_mgmt_port']))
        smr_slave = smr_mgmt.SMR(s1['id'])
        ret = smr_slave.connect(s1['ip'], s1['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to slave. %s:%d' %
            (s1['ip'], s1['smr_mgmt_port']))

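        # freeze the master and one slave for 10000 ms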
        smr_master.write('fi delay sleep 1 10000\r\n')
        reply = smr_master.read_until('\r\n', 1)
        if reply is not None and reply.find('-ERR not supported') != -1:
            self.fail('make sure that smr has been compiled with the gcov option.')

        smr_slave.write('fi delay sleep 1 10000\r\n')

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        time.sleep(5)

        if len(self.cluster['servers']) == 3:
            # wait for forced master election
            success = True
            for i in range(15):
                state = []
                util.check_cluster(self.cluster['cluster_name'],
                                   self.leader_cm['ip'],
                                   self.leader_cm['cm_port'], state)
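                # pick s2's entry out of the reported cluster state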
                s2_state = filter(lambda s: s['pgs_id'] == s2['id'], state)[0]
                role = s2_state['active_role']
                if role != 'M':
                    success = False
                    break
                time.sleep(1)

            util.log('')
            util.log('It expects that pgs2 is a master. PG.COPY: 3, PG.Q: 2')
            util.log('')
            util.log_server_state(self.cluster)

            self.assertEqual(success, True, 'failed to check copy-quorum')

            ok = False
            for i in xrange(10):
                ok = util.check_cluster(self.cluster['cluster_name'],
                                        self.leader_cm['ip'],
                                        self.leader_cm['cm_port'])
                if ok:
                    break
            self.assertTrue(ok, 'Cluster state is not normal!')

            redis2 = redis_mgmt.Redis(s2['id'])
            ret = redis2.connect(s2['ip'], s2['redis_port'])
            self.assertEqual(
                ret, 0, 'failed to connect to redis(%s:%d).' %
                (s2['ip'], s2['redis_port']))

            # set new values
            for i in range(10000, 20000):
                cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
                redis2.write(cmd)
                res = redis2.read_until('\r\n')
                self.assertEqual(
                    res, '+OK\r\n',
                    'failed to set values to redis2. cmd:%s, res:%s' %
                    (cmd[:-2], res))

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        redis0 = redis_mgmt.Redis(m['id'])
        ret = redis0.connect(m['ip'], m['redis_port'])
        self.assertEqual(
            ret, 0,
            'failed to connect to redis0(%s:%d).' % (m['ip'], m['redis_port']))

        redis1 = redis_mgmt.Redis(s1['id'])
        ret = redis1.connect(s1['ip'], s1['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis1(%s:%d).' %
            (s1['ip'], s1['redis_port']))

        if len(self.cluster['servers']) != 3:
            # set new values
            for i in range(10000, 20000):
                cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
                redis0.write(cmd)
                res = redis0.read_until('\r\n')
                self.assertEqual(
                    res, '+OK\r\n',
                    'failed to set values to redis0. cmd:%s, res:%s' %
                    (cmd[:-2], res))

        # check new values (m)
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write(cmd)
            redis0.read_until('\r\n')
            res = redis0.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis(id:%d). %s != %d' %
                (m['id'], res, i))

        # check new values (s1)
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis1.write(cmd)
            redis1.read_until('\r\n')
            res = redis1.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis(id:%d). %s != %d' %
                (s1['id'], res[:-2], i))

        # check consistency
        self.assertEqual(
            util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                               self.mgmt_port), True, 'role consistency fail')

        return 0
Example #27
    def role_change_with_hanging_pgs(self, hanging_servers, running_servers,
                                     target_id, master):
        util.log('hanging_servers:%s' % hanging_servers)
        util.log('running_servers:%s' % running_servers)
        util.log('target_id:%s' % target_id)

        # Initial data
        util.put_some_data(self.cluster, 3, 10)

        util.log("States (before role change)")
        util.log_server_state(self.cluster)

        # Get old timestamp
        old_timestamps = {}
        for s in self.cluster['servers']:
            ts = util.get_timestamp_of_pgs(s)
            old_timestamps[s['id']] = ts

        # hang
        for s in hanging_servers:
            smr = smr_mgmt.SMR(s['id'])
            ret = smr.connect(s['ip'], s['smr_mgmt_port'])
            self.assertEqual(
                ret, 0, 'failed to connect to smr. %s:%d' %
                (s['ip'], s['smr_mgmt_port']))
            util.log("PGS '%d' hang" % s['id'])

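            # inject a 13000 ms delay into each hanging pgs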
            smr.write('fi delay sleep 1 13000\r\n')
            reply = smr.read_until('\r\n', 1)
            if reply is not None and reply.find('-ERR not supported') != -1:
                self.fail('make sure that smr has been compiled with the gcov option.')
            smr.disconnect()

        # Role change
        master_id = util.role_change(self.leader_cm,
                                     self.cluster['cluster_name'], target_id)
        self.assertEqual(master_id, -1,
                         'We expected that role_change failed, but success')

        # Check rollback - check quorum
        if master not in hanging_servers:
            expected = 2
            ok = self.__check_quorum(master, expected)
            self.assertTrue(ok,
                            'rollback quorum fail. expected:%s' % (expected))

        # Check rollback - get new timestamp
        new_timestamps_in_hang = {}
        for s in running_servers:
            ts = util.get_timestamp_of_pgs(s)
            new_timestamps_in_hang[s['id']] = ts

        # Check rollback - compare old timestamps and new timestamps
        for s in running_servers:
            old_ts = old_timestamps[s['id']]
            new_ts = new_timestamps_in_hang[s['id']]
            self.assertEqual(
                old_ts, new_ts,
                'Timestamp of a running server has changed. %d->%d' %
                (old_ts, new_ts))

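        # wait out the injected 13-second delay so the hanging pgs wake up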
        time.sleep(16)
        util.log("States (after role change)")
        util.log_server_state(self.cluster)

        self.load_gen_list = {}

        # Start load generator
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen = load_generator.LoadGenerator(i, ip, port)
            load_gen.start()
            self.load_gen_list[i] = load_gen

        # Check quorum
        if master in hanging_servers:
            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')
            self.assertNotEqual(s2, None, 'slave2 is None.')

            expected = 2
            ok = self.__check_quorum(m, expected)
            self.assertTrue(ok,
                            'rollback quorum fail. expected:%s' % (expected))

        # Check cluster state
        normal_state = False
        for i in xrange(20):
            normal_state = util.check_cluster(self.cluster['cluster_name'],
                                              self.leader_cm['ip'],
                                              self.leader_cm['cm_port'],
                                              check_quorum=True)
            if normal_state:
                break
            time.sleep(0.5)
        self.assertTrue(normal_state, "Unstable cluster state")

        # Check consistency
        for i in range(self.max_load_generator):
            self.load_gen_list[i].quit()
        for i in range(self.max_load_generator):
            self.load_gen_list[i].join()
            self.assertTrue(self.load_gen_list[i].isConsistent(),
                            'Inconsistent after migration')
            self.load_gen_list.pop(i, None)
Example #28
    def test_restart_recovery_with_remote_checkpoint_and_remote_log(self):
        util.print_frame()
        key_base = 'key'
        target = util.get_server_by_role(self.cluster['servers'], 'slave')
        master = util.get_server_by_role(self.cluster['servers'], 'master')

        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(master['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0, 'failed to connect to gateway')

        # set initial data in order to make an elapsed time for bgsave longer
        self.put_some_data()

        # generate some data
        for i in range(0, 100):
            key = '%s%d' % (key_base, i)
            cmd = 'set %s %d\r\n' % (key, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEquals(res, '+OK\r\n')
        gw.disconnect()

        # delete a local checkpoint
        util.log('delete pgs%d`s check point.' % target['id'])
        util.del_dumprdb(target['id'])

        # generate a remote check point
        bgsave_ret = util.bgsave(master)
        self.assertTrue(bgsave_ret, 'failed to bgsave. pgs%d' % master['id'])

        # shutdown
        util.log('shutdown target')
        ret = testbase.request_to_shutdown_smr(target)
        self.assertEqual(ret, 0, 'failed to shutdown smr')

        time.sleep(10)

        # generate some data
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0, 'failed to connect to gateway')
        for i in range(100, 200):
            key = '%s%d' % (key_base, i)
            cmd = 'set %s %d\r\n' % (key, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEquals(res, '+OK\r\n')
        gw.disconnect()

        # recovery
        util.log('recovery target')
        ret = testbase.request_to_start_smr(target)
        self.assertEqual(ret, 0, 'failed to start smr')

        ret = testbase.request_to_start_redis(target)
        self.assertEqual(ret, 0, 'failed to start redis')
        time.sleep(5)

        ret = testbase.wait_until_finished_to_set_up_role(target)
        self.assertEquals(ret, 0,
                          'failed to role change. smr_id:%d' % (target['id']))

        # check value
        recovered_redis = redis_mgmt.Redis(target['id'])
        ret = recovered_redis.connect(target['ip'], target['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')

        for i in range(0, 200):
            key = '%s%d' % (key_base, i)
            cmd = 'get %s\r\n' % (key)
            recovered_redis.write(cmd)
            recovered_redis.read_until('\r\n')
            response = recovered_redis.read_until('\r\n')
            self.assertEqual(response, '%d\r\n' % i,
                             'inconsistent %s, %d' % (response, i))
Example #29
    def recovery_with_local_checkpoint_and_remote_log(self, role):
        server = util.get_server_by_role(self.cluster['servers'], role)

        # set initial data in order to make an elapsed time for bgsave longer
        self.put_some_data()

        # set value
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(server['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, id:%d' % server['id'])
        timestamp = {}
        key_base = 'key0000000000111111111122222222223333333333444444444455555555556666666666777777777788888888889999999999'
        for i in range(0, 50000):
            timestamp[i] = time.time()
            k = '%s_%d' % (key_base, i)
            cmd = 'set %s %f\r\n' % (k, timestamp[i])
            gw.write(cmd)
            response = gw.read_until('\r\n')
            self.assertNotEqual(response.find('+OK'), -1,
                                'failed to set key value through gateway')

        # generate a check point
        bgsave_ret = util.bgsave(server)
        self.assertTrue(bgsave_ret, 'failed to bgsave. pgs%d' % server['id'])

        # shutdown
        ret = testbase.request_to_shutdown_smr(server)
        self.assertEqual(ret, 0, 'failed to shutdown smr')
        ret = testbase.request_to_shutdown_redis(server)
        self.assertEqual(ret, 0, 'failed to shutdown redis')
        util.log('succeeded : shutdown pgs%d' % (server['id']))

        # delete smr_logs
        ret = util.delete_smr_logs(server['id'])
        self.assertEqual(ret, 0,
                         'failed to delete smr log, id:%d' % server['id'])
        util.log('succeeded : delete replication logs')

        time.sleep(5)

        # set value
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0, 'failed to connect to gateway')
        for i in range(50000, 100000):
            timestamp[i] = time.time()
            k = '%s_%d' % (key_base, i)
            cmd = 'set %s %f\r\n' % (k, timestamp[i])
            gw.write(cmd)
            response = gw.read_until('\r\n')
            self.assertNotEqual(response.find('+OK'), -1,
                                'failed to set key value through gateway')

        # recovery
        ret = testbase.request_to_start_smr(server)
        self.assertEqual(ret, 0, 'failed to start smr')

        ret = testbase.request_to_start_redis(server)
        self.assertEqual(ret, 0, 'failed to start redis')
        time.sleep(5)

        ret = testbase.wait_until_finished_to_set_up_role(server)
        self.assertEquals(ret, 0,
                          'failed to role change. smr_id:%d' % (server['id']))
        util.log('succeeded : recover pgs%d' % server['id'])

        # check value
        recovered_redis = redis_mgmt.Redis(server['id'])
        ret = recovered_redis.connect(server['ip'], server['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')

        for i in range(0, 100000):
            k = '%s_%d' % (key_base, i)
            cmd = 'get %s\r\n' % (k)
            recovered_redis.write(cmd)
            recovered_redis.read_until('\r\n')
            response = recovered_redis.read_until('\r\n')
            self.assertEqual(response, '%f\r\n' % (timestamp[i]),
                             'inconsistent %s, %f' % (response, timestamp[i]))
Example #30
    def test_4_PGS_mgen_is_less_than_PG_mgen( self ):
        util.print_frame()

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # initial data
        util.put_some_data(self.cluster)

        # shutdown
        server_to_join = util.get_server_by_role( self.cluster['servers'], 'master' )
        ret = testbase.request_to_shutdown_smr( server_to_join )
        self.assertEqual( ret, 0, 'failed to shutdown smr' )
        ret = testbase.request_to_shutdown_redis( server_to_join )
        self.assertEquals( ret, 0, 'failed to shutdown redis' )

        # check state F
        max_try = 20
        expected = 'F'
        for i in range( 0, max_try):
            state = util.get_smr_state( server_to_join, self.leader_cm )
            if expected == state:
                break
            time.sleep( 1 )
        self.assertEquals( expected , state,
                           'server%d - state:%s, expected:%s' % (server_to_join['id'], state, expected) )

        # set value
        key_base = 'mw'
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEquals( res, '+OK\r\n' )

        # master failover 1 (master generation + 1)
        util.log('master failover 1')
        server = util.get_server_by_role( self.cluster['servers'], 'master' )
        self.failover( server )

        # check quorum (copy:3, quorum:1, available:2)
        ok = False
        for i in xrange(10):
            ok = util.check_quorum(self.cluster['cluster_name'],
                    self.leader_cm['ip'], self.leader_cm['cm_port'])
            if ok:
                break
            else:
                time.sleep(1)
        self.assertTrue( ok, 'Check quorum fail.' )

        # master failover 2 (master generation + 1)
        util.log('master failover 2')
        server = util.get_server_by_role( self.cluster['servers'], 'master' )
        self.failover( server )

        # recovery
        util.log('master recovery start.')
        ret = testbase.request_to_start_smr( server_to_join )
        self.assertEqual( ret, 0, 'failed to start smr' )

        ret = testbase.request_to_start_redis( server_to_join )
        self.assertEqual( ret, 0, 'failed to start redis' )

        ret = testbase.wait_until_finished_to_set_up_role( server_to_join, 10 )
        self.assertEquals( ret, 0, 'failed to role change. smr_id:%d' % (server_to_join['id']) )
        util.log('master recovery end successfully.')

        # check state N
        max_try = 20
        expected = 'N'
        for i in range( 0, max_try):
            state = util.get_smr_state( server, self.leader_cm )
            if expected == state:
                break
            time.sleep( 1 )
        role = util.get_role_of_server( server )
        self.assertEquals( expected , state,
                           'server%d - state:%s, expected:%s, role:%s' % (server['id'], state, expected, role) )

        time.sleep( 5 )

        # set value
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEquals( res, '+OK\r\n' )

        server = util.get_server_by_role( self.cluster['servers'], 'master' )

        redis = redis_mgmt.Redis( server_to_join['id'] )
        ret = redis.connect( server_to_join['ip'], server_to_join['redis_port'] )
        self.assertEquals( ret, 0, 'failed to connect to redis' )

        # check value
        for i in range(0, 20000):
            cmd = 'get %s%d\r\n' % (key_base, i)
            redis.write( cmd )
            redis.read_until( '\r\n'  )
            response = redis.read_until( '\r\n'  )
            self.assertEqual( response, '%d\r\n' % (i), 'inconsistent %s, %d' % (response[:-2], i) )

        gw.disconnect()
        return 0
Example #31
    def test_rdb_backups(self):
        util.print_frame()
        bgsave_count = 50
        org_path = os.getcwd()
        os.chdir(util.redis_dir(0))
        server0 = self.cluster['servers'][0]
        redis0 = telnetlib.Telnet(server0['ip'], server0['redis_port'])

        util.log("Starting load generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            self.load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
            self.load_gen_thrd_list[i].start()

        util.log("Set the number of rdb backups = 24")
        redis0.write("config set number-of-rdb-backups 24\r\n")
        redis0.read_until("+OK\r\n")

        util.log("Clear old rdb backups\r\n")
        for f in os.listdir('.'):
            if (f.endswith('.rdb')):
                os.remove(f)

        util.log("Bgsaving continuously and counting the number of rdb backups")
        for i in range(bgsave_count):
            # Save current time before Bgsaving
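            # TIME replies with a 2-element array (seconds, microseconds);
            # skip the RESP array/bulk headers and keep the seconds field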
            redis0.write('time\r\n')
            redis0.read_until('\r\n', 1)
            redis0.read_until('\r\n', 1)
            ret = redis0.read_until('\r\n', 1)
            redis_server_time = int(ret.strip())
            redis0.read_until('\r\n', 1)
            redis0.read_until('\r\n', 1)

            time.sleep(1.1)

            redis0.write('time\r\n')
            redis0.read_until('\r\n', 1)
            redis0.read_until('\r\n', 1)
            ret = redis0.read_until('\r\n', 1)
            self.assertNotEqual(redis_server_time, int(ret.strip()))
            redis0.read_until('\r\n', 1)
            redis0.read_until('\r\n', 1)

            util.log("%d ~ %d" % (redis_server_time, int(ret.strip())))

            # Bgsave
            redis0.write("bgsave\r\n")
            ret = redis0.read_until('\r\n', 1)
            self.assertEqual('+Background saving started\r\n', ret)

            # Wait for bgsave to finish
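            # LASTSAVE returns an integer reply ':<unix-ts>\r\n'; poll until it
            # advances past the time recorded before bgsave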
            while True:
                redis0.write('lastsave\r\n')
                ret = redis0.read_until('\r\n', 1)
                lastsave_time = int(ret[1:].strip())
                if lastsave_time > redis_server_time: break
                time.sleep(0.1)

            # Count the number of rdb backups
            rdb_list = [name for name in os.listdir('.')
                if os.path.isfile(name)
                    and name.startswith('dump') and name.endswith('.rdb')]

            util.log(rdb_list)
            util.log("Iteration:%d, rdb Backups:%d" % (i+1, len(rdb_list)))

            self.assertTrue((i+1 > 24 and len(rdb_list) == 25) or len(rdb_list) == i+1)
            self.assertTrue('dump.rdb' in rdb_list)


        util.log("\nSet the number of rdb backups = 5")
        redis0.write("config set number-of-rdb-backups 5\r\n")
        redis0.read_until("+OK\r\n")

        for i in range(3):
            # Save current time before Bgsaving
            redis0.write('time\r\n')
            redis0.read_until('\r\n', 1)
            redis0.read_until('\r\n', 1)
            ret = redis0.read_until('\r\n', 1)
            redis_server_time = int(ret.strip())
            redis0.read_until('\r\n', 1)
            redis0.read_until('\r\n', 1)

            time.sleep(1.1)

            # Bgsave
            redis0.write("bgsave\r\n")
            ret = redis0.read_until('\r\n', 1)
            self.assertEqual('+Background saving started\r\n', ret)

            # Wait for bgsave to finish
            while True:
                redis0.write('lastsave\r\n')
                ret = redis0.read_until('\r\n', 1)
                lastsave_time = int(ret[1:].strip())
                if lastsave_time > redis_server_time: break
                time.sleep(0.1)

            # Count the number of rdb backups
            rdb_list = [name for name in os.listdir('.')
                if os.path.isfile(name)
                    and name.startswith('dump') and name.endswith('.rdb')]

            util.log(rdb_list)
            util.log("Iteration:%d, rdb Backups:%d" % (i+1, len(rdb_list)))

            self.assertTrue(len(rdb_list) == 6)
            self.assertTrue('dump.rdb' in rdb_list)


        # check consistency of load_generator
        for i in range(len(self.load_gen_thrd_list)):
            self.load_gen_thrd_list[i].quit()
        for i in range(len(self.load_gen_thrd_list)):
            self.load_gen_thrd_list[i].join()
            self.assertTrue(self.load_gen_thrd_list[i].isConsistent(), 'Inconsistent after gateway_mgmt test')
        os.chdir(org_path)
Example #32
    def deprecated_test_5_PGS_commit_is_greater_than_PG_commit( self ):
        util.print_frame()

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # initial data
        util.put_some_data(self.cluster)

        master, s1, s2 = util.get_mss(self.cluster)

        server_to_join = [s1, s2]
        # shutdown slaves
        for i in range(0, 2):
            ret = testbase.request_to_shutdown_smr( server_to_join[i] )
            self.assertEqual( ret, 0, 'failed to shutdown smr%d' % server_to_join[i]['id'])
            util.log('succeeded to shutdown smr%d' % server_to_join[i]['id'])

            ret = testbase.request_to_shutdown_redis( server_to_join[i] )
            self.assertEquals( ret, 0, 'failed to shutdown redis' )
            util.log('succeeded to shutdown redis%d' % server_to_join[i]['id'])

            # check state F
            max_try = 20
            expected = 'F'
            for j in range( 0, max_try):
                state = util.get_smr_state( server_to_join[i], self.leader_cm )
                if expected == state:
                    break
                time.sleep( 1 )
            self.assertEquals( expected , state,
                               'server%d - state:%s, expected:%s' % (server_to_join[i]['id'], state, expected) )

        # put more data
        util.put_some_data(self.cluster, 10, 256)

        # bgsave
        ret = util.bgsave(master)
        self.assertTrue(ret, 'failed to bgsave. pgs%d' % master['id'])

        # shutdown master
        ret = testbase.request_to_shutdown_smr( master )
        self.assertEqual( ret, 0, 'failed to shutdown smr' )
        util.log('succeeded to shutdown master smr, id=%d' % master['id'])
        ret = testbase.request_to_shutdown_redis( master )
        self.assertEquals( ret, 0, 'failed to shutdown redis' )
        util.log('succeeded to shutdown master redis, id=%d' % master['id'])

        # check state F
        max_try = 20
        expected = 'F'
        for i in range( 0, max_try):
            state = util.get_smr_state( master, self.leader_cm )
            if expected == state:
                break
            time.sleep( 1 )
        self.assertEquals( expected , state,
                           'server%d - state:%s, expected:%s' % (master['id'], state, expected) )

        # recovery slaves
        for i in range(0, 2):
            ret = testbase.request_to_start_smr( server_to_join[i] )
            self.assertEqual( ret, 0, 'failed to start smr' )

            ret = testbase.request_to_start_redis( server_to_join[i] )
            self.assertEqual( ret, 0, 'failed to start redis' )

            ret = testbase.wait_until_finished_to_set_up_role( server_to_join[i], 10 )
            self.assertEquals( ret, 0, 'failed to role change. smr_id:%d' % (server_to_join[i]['id']) )

            # check state N
            max_try = 20
            expected = 'N'
            for j in range( 0, max_try):
                state = util.get_smr_state( server_to_join[i], self.leader_cm )
                if expected == state:
                    break
                time.sleep( 1 )
            role = util.get_role_of_server( server_to_join[i] )
            self.assertEquals( expected , state,
                               'server%d - state:%s, expected:%s, role:%s' % (server_to_join[i]['id'], state, expected, role) )

        # set value
        s = random.choice(server_to_join)
        redis = redis_mgmt.Redis( s['id'] )
        ret = redis.connect( s['ip'], s['redis_port'] )
        self.assertEquals( ret, 0, 'failed to connect to redis' )

        key_base = 'key_test'
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (key_base, i, i)
            redis.write( cmd )
            res = redis.read_until( '\r\n' )
            self.assertEquals( res, '+OK\r\n' )
        redis.disconnect()

        for i in range(0, 2):
            redis = redis_mgmt.Redis( server_to_join[i]['id'] )
            ret = redis.connect( server_to_join[i]['ip'], server_to_join[i]['redis_port'] )
            self.assertEquals( ret, 0, 'failed to connect to redis' )

            # check value
            for j in range(0, 10000):
                cmd = 'get %s%d\r\n' % (key_base, j)
                redis.write( cmd )
                redis.read_until( '\r\n'  )
                response = redis.read_until( '\r\n'  )
                self.assertEqual( response, '%d\r\n' % (j), 'inconsistent %s, %d' % (response[:-2], j) )

        # try to recover master, but failed
        ret = testbase.request_to_start_smr( master )
        self.assertEqual( ret, 0, 'failed to start smr' )

        ret = testbase.request_to_start_redis( master, False )
        self.assertEqual( ret, 0, 'failed to start redis' )

        max_try = 3
        expected = 'N'
        for i in range( 0, max_try):
            state = util.get_smr_state( master, self.leader_cm )
            if expected == state:
                break
            time.sleep( 1 )
        role = util.get_role_of_server( master )
        self.assertNotEqual( expected, state,
                             'server%d - state:%s, expected:not %s, role:%s' % (master['id'], state, expected, role) )
        util.log('success : the old master that has a greater commit-seq than the current master tried to join as a slave, but it is blocked successfully.')

        gw.disconnect()
        return 0
Example #33
    def start_load_generator(self, num):
        for i in range(num):
            ip, port = util.get_rand_gateway(self.cluster)
            self.load_gen_thrd_list[i] = load_generator.LoadGenerator(
                i, ip, port)
            self.load_gen_thrd_list[i].start()
Example #34
    def failure_recovery( self, role, wait_count=10, redis_only=False ):
        time.sleep( 2 )

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # set value
        key = 'new_key_haha'
        cmd = 'set %s 12345\r\n' % (key)
        gw.write( cmd )
        res = gw.read_until( '\r\n' )
        self.assertEquals( res, '+OK\r\n' )

        # shutdown
        server = util.get_server_by_role( self.cluster['servers'], role )

        if redis_only == False:
            ret = testbase.request_to_shutdown_smr( server )
            self.assertEqual( ret, 0, 'failed to shutdown smr' )

        ret = testbase.request_to_shutdown_redis( server )
        self.assertEquals( ret, 0, 'failed to shutdown redis' )

        # check state F
        max_try = 20
        expected = 'F'
        for i in range( 0, max_try):
            state = util.get_smr_state( server, self.leader_cm )
            if expected == state:
                break
            time.sleep( 1 )
        self.assertEquals( expected , state,
                           'server%d - state:%s, expected:%s' % (server['id'], state, expected) )

        # set value
        check_value = '54321'
        cmd = 'set %s %s\r\n' % (key, check_value)
        gw.write( cmd )
        res = gw.read_until( '\r\n' )
        self.assertEquals( res, '+OK\r\n' )
        gw.disconnect()

        # recovery
        if redis_only == False:
            ret = testbase.request_to_start_smr( server )
            self.assertEqual( ret, 0, 'failed to start smr' )

        ret = testbase.request_to_start_redis( server )
        self.assertEqual( ret, 0, 'failed to start redis' )

        ret = testbase.wait_until_finished_to_set_up_role( server, wait_count )
        self.assertEquals( ret, 0, 'failed to role change. smr_id:%d' % (server['id']) )

        redis = redis_mgmt.Redis( server['id'] )
        ret = redis.connect( server['ip'], server['redis_port'] )
        self.assertEquals( ret, 0, 'failed to connect to redis' )

        # check state N
        max_try = 20
        expected = 'N'
        for i in range( 0, max_try):
            state = util.get_smr_state( server, self.leader_cm )
            if expected == state:
                break
            time.sleep( 1 )
        role = util.get_role_of_server( server )
        self.assertEquals( expected , state,
                           'server%d - state:%s, expected:%s, role:%s' % (server['id'], state, expected, role) )

        # check value
        cmd = 'get %s\r\n' % (key)
        redis.write( cmd )
        redis.read_until( '\r\n'  )
        response = redis.read_until( '\r\n'  )
        self.assertEqual( response, '%s\r\n' % (check_value), 'inconsistent %s, %s' % (response, check_value) )
Example #35
0
    def elect_master_randomly(self):
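        # Repeatedly demote the current master with 'role lconn' until every
        # PGS in the cluster has served as master at least once. After each
        # election, verify that exactly one master and two slaves exist and
        # that the roles reported by the cluster controller (cc) match the
        # real roles of the processes.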
        # set data
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway('0')
        gw.connect(ip, port)
        for i in range(0, 1000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values to gw(%s:%d). cmd:%s, res:%s' %
                (ip, port, cmd[:-2], res[:-2]))

        server_ids = []
        for server in self.cluster['servers']:
            server_ids.append(server['id'])

        for try_cnt in range(30):
            # get master, slave1, slave2
            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')
            self.assertNotEqual(s2, None, 'slave2 is None.')
            util.log('master id : %d' % m['id'])

            if try_cnt != 0:
                if m['id'] in server_ids:
                    server_ids.remove(m['id'])

            smr = smr_mgmt.SMR(m['id'])
            ret = smr.connect(m['ip'], m['smr_mgmt_port'])
            self.assertEqual(
                ret, 0, 'failed to connect to master. %s:%d' %
                (m['ip'], m['smr_mgmt_port']))
            cmd = 'role lconn\r\n'
            smr.write(cmd)
            reply = smr.read_until('\r\n')
            self.assertEqual(
                reply, '+OK\r\n',
                'failed : cmd="%s", reply="%s"' % (cmd[:-2], reply[:-2]))
            util.log('succeeded : cmd="%s", reply="%s"' %
                     (cmd[:-2], reply[:-2]))

            # wait until role-change is finished
            for role_change_try_cnt in range(5):
                count_master = 0
                count_slave = 0
                for server in self.cluster['servers']:
                    real_role = util.get_role_of_server(server)
                    real_role = util.roleNumberToChar(real_role)
                    if real_role == 'M':
                        count_master = count_master + 1
                    elif real_role == 'S':
                        count_slave = count_slave + 1
                if count_master == 1 and count_slave == 2:
                    break
                time.sleep(1)

            # check the number of master and slave
            self.assertEqual(
                count_master, 1,
                'failed : the number of master is not 1, count_master=%d, count_slave=%d'
                % (count_master, count_slave))
            self.assertEqual(
                count_slave, 2,
                'failed : the number of slave is not 2, count_master=%d, count_slave=%d'
                % (count_master, count_slave))
            util.log(
                'succeeded : the number of master is 1 and the number of slave is 2'
            )

            # check states of all pgs in pg
            for try_cnt in range(3):
                ok = True
                for s in self.cluster['servers']:
                    real_role = util.get_role_of_server(s)
                    real_role = util.roleNumberToChar(real_role)
                    smr_info = util.get_smr_info(s, self.leader_cm)
                    cc_role = smr_info['smr_Role']
                    cc_hb = smr_info['hb']

                    if cc_hb != 'Y':
                        ok = False
                    if real_role != cc_role:
                        ok = False

                    if ok:
                        util.log(
                            'succeeded : a role of real pgs is the same with a role in cc, id=%d, real=%s, cc=%s, hb=%s'
                            % (s['id'], real_role, cc_role, cc_hb))
                    else:
                        util.log(
                            '\n\n**********************************************************\n\nretry: a role of real pgs is not the same with a role in cc, id=%d, real=%s, cc=%s, hb=%s'
                            % (s['id'], real_role, cc_role, cc_hb))

                if ok == False:
                    time.sleep(0.5)
                else:
                    break

            self.assertTrue(ok, 'failed : role check')

            if len(server_ids) == 0:
                util.log('succeeded : all smrs have been as a master')
                return 0

        self.assertEqual(
            0, len(server_ids), 'failed : remaining server ids=[%s]' %
            (','.join('%d' % id for id in server_ids)))
        return 0
Example #36
0
    def consistent_after_failover( self ):
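        # Write `max` keys through a gateway, shut down all three PGS at
        # once, restart them, and wait for a new master to be elected. Then
        # write another `max` keys directly to the new master and verify
        # that the whole key space can be read back from each slave.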
        max = 10000
        wait_count = 15
        key = 'caf'

        # get master, slave1, and slave2
        master, slave1, slave2 = self.get_mss()

        # set value
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( ip )
        gw.connect( ip, port )

        for i in range( 0, max ):
            cmd = 'set %s%d %d\r\n' % (key, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEquals( res, '+OK\r\n' )
        time.sleep( 5 )

        # shutdown
        servers = [master, slave1, slave2]
        for server in servers:

            util.log('before shutdown pgs%d' % server['id'])
            for s in servers:
                self.getseq_log(s)

            ret = testbase.request_to_shutdown_smr( server )
            self.assertEqual( ret, 0, 'failed to shutdown smr, server:%d' % server['id'] )
            ret = testbase.request_to_shutdown_redis( server )
            self.assertEquals( ret, 0, 'failed to shutdown redis' )
        time.sleep( 5 )

        # check state F
        for server in servers:
            state = self.get_expected_smr_state( server, 'F' )
            self.assertEquals( 'F', state,
                               'server%d - state:%s' % (server['id'], state) )

        # recovery
        for server in servers:
            ret = testbase.request_to_start_smr( server )
            self.assertEqual( ret, 0, 'failed to start smr, server:%d' % server['id'] )

            ret = testbase.request_to_start_redis( server, False )
            self.assertEqual( ret, 0, 'failed to start redis, server:%d' % server['id']  )

            util.log('after restart pgs%d' % server['id'])
            for s in servers:
                self.getseq_log(s)

        time.sleep( 5 )

        # wait for master election
        for i in xrange(10):
            ret = util.check_cluster( self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'] )
            if ret:
                break
            time.sleep(1)

        # check state
        for server in servers:
            ret = testbase.wait_until_finished_to_set_up_role( server, wait_count )
            self.assertEquals( ret, 0, 'failed to role change. server:%d' % (server['id']) )

            state = self.get_expected_smr_state( server, 'N' )
            role = util.get_role_of_server( server )
            self.assertEquals( 'N', state,
                               'server%d - state:%s, role:%s' % (server['id'], state, role) )

        the_number_of_master = 0
        the_number_of_slave = 0
        for server in servers:
            role = util.get_role_of_server( server )
            if role == c.ROLE_MASTER:
                the_number_of_master = the_number_of_master + 1
            elif role == c.ROLE_SLAVE:
                the_number_of_slave = the_number_of_slave + 1
        self.assertTrue( 1 == the_number_of_master and 2 == the_number_of_slave,
                           'failed to set roles, the number of master:%d, the number of slave:%d' %
                           (the_number_of_master, the_number_of_slave) )

        # get master, slave1, and slave2
        master, slave1, slave2 = self.get_mss()

        # connect to a master`s redis and set data
        redis = redis_mgmt.Redis( master['id'] )
        ret = redis.connect( master['ip'], master['redis_port'] )
        self.assertEquals( ret, 0, 'failed to connect to redis, server:%d' % master['id'] )

        for i in range( max, max*2 ):
            cmd = 'set %s%d %d\r\n' % (key, i, i)
            redis.write( cmd )
            res = redis.read_until( '\r\n' )
            self.assertEquals( res, '+OK\r\n',
                               'failed to get response, server:%d' % master['id'] )
        redis.disconnect()

        # check slaves`s data
        slaves = [slave1, slave2]
        for slave in slaves:
            slave_redis = redis_mgmt.Redis( slave['id'] )
            ret = slave_redis.connect( slave['ip'], slave['redis_port'] )
            self.assertEquals( ret, 0, 'failed to connect to redis, server:%d' % slave['id'] )

            for i in range( 0, max*2 ):
                cmd = 'get %s%d\r\n' % (key, i)
                slave_redis.write( cmd )
                trash = slave_redis.read_until( '\r\n' )
                res = slave_redis.read_until( '\r\n' )
                self.assertEquals( res, '%d\r\n' % i,
                                   'inconsistent, server:%d, expected %d but %s' % (slave['id'], i, res)  )
            slave_redis.disconnect()
Example #37
0
    def test_delete_smrlog_after_scaleout(self):
        util.print_frame()
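        # Scale out with three extra servers, migrate slots 8000-8191 to the
        # new PG, force a bgsave on the original servers, and poll (up to 20
        # times, 10 seconds apart) until the smr-replicator has deleted at
        # least one old log file on each of them.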

        # start load generator
        util.log("start load_generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            self.load_gen_thrd_list[i] = load_generator.LoadGenerator(
                i, ip, port)
            self.load_gen_thrd_list[i].start()

        time.sleep(5)  # generate load for 5 sec
        util.log("started load_generator")

        # servers for scale out
        servers = [config.server4, config.server5, config.server6]
        leader_cm = self.cluster['servers'][0]

        # Scale out
        cluster = config.clusters[0]
        ret = util.pg_add(cluster, servers, leader_cm)
        self.assertEqual(True, ret,
                         'Scale out fail. util.pg_add returns false')

        time.sleep(5)
        # pg0 -> pg1
        cluster = config.clusters[1]
        ret = util.migration(cluster, 0, 1, 8000, 8191, 40000)
        self.assertEqual(True, ret, 'Migration Fail 0 -> 1')

        # get log file
        old_logs = {}
        for s in config.clusters[0]['servers']:
            parent_dir, log_dir = util.smr_log_dir(s['id'])
            path = '%s/%s' % (parent_dir, log_dir)
            old_logs[s['id']] = util.ls(path)

        # bgsave in order to make smrlogs deleted.
        for s in config.clusters[0]['servers']:
            bgsave_ret = util.bgsave(s)
            self.assertTrue(bgsave_ret, 'failed to bgsave. pgs%d' % s['id'])
            util.log('bgsave pgs%d is done.' % s['id'])

        # check consistency
        for j in range(len(self.load_gen_thrd_list)):
            self.assertTrue(self.load_gen_thrd_list[j].isConsistent(),
                            'Inconsistent after migration')

        # does smr-replicator delete smrlogs?
        i = 0
        while i < 20:
            i += 1
            # get current log files
            cur_logs = {}
            for s in config.clusters[0]['servers']:
                parent_dir, log_dir = util.smr_log_dir(s['id'])
                path = '%s/%s' % (parent_dir, log_dir)
                cur_logs[s['id']] = util.ls(path)

            # compare old and new
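            # Remove every file that still exists from the old snapshot;
            # whatever remains in temp_old_logs[id] is the set of log files
            # that smr-replicator has deleted since the snapshot was taken.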
            temp_old_logs = copy.deepcopy(old_logs)
            for id, nl in cur_logs.items():
                ol = temp_old_logs.get(id)
                self.assertNotEqual(
                    ol, None,
                    "failed to check logfiles. old logs for smr-replicator '%d' is not exist."
                    % id)

                for log in nl:
                    if log in ol:
                        ol.remove(log)

            ok = True
            for id, ol in temp_old_logs.items():
                if len(ol) == 0:
                    ok = False

            util.log(
                'Loop %d ---------------------------------------------------------'
                % i)
            util.log('deleted smrlog files: %s' %
                     util.json_to_str(temp_old_logs))

            if ok:
                break

            time.sleep(10)

        self.assertTrue(ok, 'smr-replicator does not delete smrlogs.')
        util.log('smr-replicator deletes smrlogs.')

        # check consistency of load_generator
        for i in range(len(self.load_gen_thrd_list)):
            self.load_gen_thrd_list[i].quit()
        for i in range(len(self.load_gen_thrd_list)):
            self.load_gen_thrd_list[i].join()
            self.assertTrue(self.load_gen_thrd_list[i].isConsistent(),
                            'Inconsistent after migration')
Example #38
0
    def elect_master_randomly( self ):
        # set data
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway( '0' )
        gw.connect( ip, port )
        for i in range( 0, 1000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values to gw(%s:%d). cmd:%s, res:%s' % (ip, port, cmd[:-2], res[:-2]) )

        server_ids = []
        for server in self.cluster['servers']:
            server_ids.append( server['id'] )

        for try_cnt in range( 30 ):
            # get master, slave1, slave2
            m, s1, s2 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )
            self.assertNotEqual( s2, None, 'slave2 is None.' )
            util.log( 'master id : %d' % m['id'] )

            if try_cnt != 0:
                if m['id'] in server_ids:
                    server_ids.remove( m['id'] )

            smr = smr_mgmt.SMR( m['id'] )
            ret = smr.connect( m['ip'], m['smr_mgmt_port'] )
            self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']) )
            cmd = 'role lconn\r\n'
            smr.write( cmd )
            reply = smr.read_until( '\r\n' )
            self.assertEqual( reply, '+OK\r\n', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], reply[:-2]) )
            util.log( 'succeeded : cmd="%s", reply="%s"' % (cmd[:-2], reply[:-2]) )

            # wait until role-change is finished
            for role_change_try_cnt in range( 5 ):
                count_master = 0
                count_slave = 0
                for server in self.cluster['servers']:
                    real_role = util.get_role_of_server( server )
                    real_role = util.roleNumberToChar( real_role )
                    if real_role == 'M':
                        count_master = count_master + 1
                    elif real_role == 'S':
                        count_slave = count_slave + 1
                if count_master == 1 and count_slave == 2:
                    break
                time.sleep( 1 )

            # check the number of master and slave
            self.assertEqual( count_master, 1, 'failed : the number of master is not 1, count_master=%d, count_slave=%d' % (count_master, count_slave) )
            self.assertEqual( count_slave, 2, 'failed : the number of slave is not 2, count_master=%d, count_slave=%d' % (count_master, count_slave) )
            util.log( 'succeeded : the number of master is 1 and the number of slave is 2' )

            # check states of all pgs in pg
            for try_cnt in range( 3 ):
                ok = True
                for s in self.cluster['servers']:
                    real_role = util.get_role_of_server( s )
                    real_role = util.roleNumberToChar( real_role )
                    smr_info = util.get_smr_info( s, self.leader_cm )
                    cc_role = smr_info['smr_Role']
                    cc_hb = smr_info['hb']

                    if cc_hb != 'Y':
                        ok = False
                    if real_role != cc_role:
                        ok = False

                    if ok:
                        util.log( 'succeeded : a role of real pgs is the same with a role in cc, id=%d, real=%s, cc=%s, hb=%s' % (s['id'], real_role, cc_role, cc_hb) )
                    else:
                        util.log( '\n\n**********************************************************\n\nretry: a role of real pgs is not the same with a role in cc, id=%d, real=%s, cc=%s, hb=%s' % (s['id'], real_role, cc_role, cc_hb) )

                if ok == False:
                    time.sleep( 0.5 )
                else:
                    break

            self.assertTrue( ok, 'failed : role check' )

            if len( server_ids ) == 0:
                util.log( 'succeeded : all smrs have been as a master' )
                return 0

        self.assertEqual( 0, len( server_ids ), 'failed : remaining server ids=[%s]' % (','.join('%d' % id for id in server_ids)) )
        return 0
Example #39
0
    def test_moving_pgs(self):
        util.print_frame()

        # start load generator
        util.log("start load_generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            self.load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
            self.load_gen_thrd_list[i].start()

        util.log("started load_generator")

        servers = self.cluster['servers']

        gw_list = []
        for server in servers:
            gw = {}
            gw['mgmt'] = telnetlib.Telnet(server['ip'], server['gateway_port']+1)
            gw['normal'] = telnetlib.Telnet(server['ip'], server['gateway_port'])
            gw_list.append(gw)

        n = 0
        step = 0
        iter = 30
        while iter > 0:
            if n == 0 or random.randint(0, 1) == 0:
                step = random.randint(1, 10)
            else:
                step = -1 * random.randint(1, n)

            print "<<< ITER = %d, PG%d -> PG%d, PG%d -> PG%d >>>" % (iter, n*2, (n+step)*2, n*2+1, (n+step)*2+1)
            gw = gw_list[0]
            self.pgs_del_server(gw['mgmt'], servers[0], n)
            self.pgs_del_server(gw['mgmt'], servers[1], n)
            self.pgs_del_server(gw['mgmt'], servers[5], n)

            gw['mgmt'].write("pg_add %d\r\n" % ((n+step)*2))
            gw['mgmt'].read_until("+OK\r\n")
            gw['mgmt'].write("pg_add %d\r\n" % ((n+step)*2+1))
            gw['mgmt'].read_until("+OK\r\n")

            self.pgs_add_server(gw['mgmt'], servers[0], n+step)
            self.pgs_add_server(gw['mgmt'], servers[1], n+step)
            self.pgs_add_server(gw['mgmt'], servers[5], n+step)

            while True:
                gw['normal'].write("info gateway\r\n")
                ret = gw['normal'].read_until("\r\n", 1)
                if "-ERR" in ret:
                    continue
                ret = gw['normal'].read_until("\r\n\r\n", 1)
                #print ret
                if "gateway_disconnected_redis:0\r\n" in ret:
                    break

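            # What follows appears to be the gateway-side migration protocol:
            # 'delay' holds incoming requests for a slot range, and
            # 'redirect' repoints the held ranges to the newly added PGs
            # before the old PGs are deleted.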
            gw['mgmt'].write("delay 0 4095\r\n")
            gw['mgmt'].read_until("+OK\r\n")
            gw['mgmt'].write("delay 4096 8191\r\n")
            gw['mgmt'].read_until("+OK\r\n")

            gw['mgmt'].write("redirect 0 4095 %d\r\n" % ((n+step)*2))
            gw['mgmt'].read_until("+OK\r\n")
            gw['mgmt'].write("redirect 4096 8191 %d\r\n" % ((n+step)*2+1))
            gw['mgmt'].read_until("+OK\r\n")

            gw_list[0]['mgmt'].write("cluster_info\r\nping\r\n")
            print gw_list[0]['mgmt'].read_until("+PONG\r\n")

            self.pgs_del_server(gw['mgmt'], servers[2], n)
            self.pgs_del_server(gw['mgmt'], servers[3], n)
            self.pgs_del_server(gw['mgmt'], servers[4], n)

            self.pgs_add_server(gw['mgmt'], servers[2], n+step)
            self.pgs_add_server(gw['mgmt'], servers[3], n+step)
            self.pgs_add_server(gw['mgmt'], servers[4], n+step)

            while True:
                gw['normal'].write("info gateway\r\n")
                ret = gw['normal'].read_until("\r\n", 1)
                if "-ERR" in ret:
                    continue
                ret = gw['normal'].read_until("\r\n\r\n", 1)
                #print ret
                if "gateway_disconnected_redis:0\r\n" in ret:
                    break

            gw['mgmt'].write("pg_del %d\r\n" % (n*2))
            gw['mgmt'].read_until("+OK\r\n")
            gw['mgmt'].write("pg_del %d\r\n" % (n*2+1))
            gw['mgmt'].read_until("+OK\r\n")

            n += step

            gw_list[0]['mgmt'].write("cluster_info\r\nping\r\n")
            print gw_list[0]['mgmt'].read_until("+PONG\r\n")
            iter -= 1

        # check consistency of load_generator
        for i in range(len(self.load_gen_thrd_list)):
            self.load_gen_thrd_list[i].quit()
        for i in range(len(self.load_gen_thrd_list)):
            self.load_gen_thrd_list[i].join()
            self.assertTrue(self.load_gen_thrd_list[i].isConsistent(), 'Inconsistent after gateway_mgmt test')
Example #40
0
    def state_transition( self ):
        server = util.get_server_by_role( self.cluster['servers'], 'slave' )
        self.assertNotEquals( server, None, 'failed to get_server_by_role-slave' )

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )

        # check initial state
        state = self.get_expected_smr_state( server, 'N' )
        role = util.get_role_of_server( server )
        self.assertEquals( 'N', state,
                           'server%d - state:%s, role:%s, expected:N' % (server['id'], state, role) )

        # shutdown
        ret = testbase.request_to_shutdown_smr( server )
        self.assertEquals( ret, 0, 'failed to shutdown smr' )
        ret = testbase.request_to_shutdown_redis( server )
        self.assertEquals( ret, 0, 'failed to shutdown redis' )
        time.sleep( 3 )

        # check state F
        expected = 'F'
        state = self.get_expected_smr_state( server, expected )
        self.assertEquals( expected, state,
                           'server%d - state:%s, but expected:%s' % (server['id'], state, expected) )

        # set value
        ret = gw.connect( ip, port )
        self.assertEquals( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )
        timestamp = 0.0
        for i in range( 0, 100 ):
            timestamp = time.time()
            key = 'new_key_haha'
            cmd = 'set %s %f\r\n' % (key, timestamp)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEquals( res, '+OK\r\n' )
        gw.disconnect()

        # recovery
        ret = testbase.request_to_start_smr( server )
        self.assertEquals( ret, 0, 'failed to start smr' )

        ret = testbase.request_to_start_redis( server )
        self.assertEquals( ret, 0, 'failed to start redis' )

        ret = testbase.wait_until_finished_to_set_up_role( server, 10 )
        self.assertEquals( ret, 0, 'failed to role change. smr_id:%d' % (server['id']) )
        time.sleep( 5 )

        redis = redis_mgmt.Redis( server['id'] )
        ret = redis.connect( server['ip'], server['redis_port'] )
        self.assertEquals( ret, 0, 'failed to connect to redis' )

        # check state N
        expected = 'N'
        max_try = 20
        for i in range( 0, max_try ):
            state = self.get_expected_smr_state( server, expected )
            if state == expected:
                break
            time.sleep( 1 )
        role = util.get_role_of_server( server )
        self.assertEquals( expected, state,
                           'server%d - state:%s, role:%s, but expected:%s' % (server['id'], state, role, expected) )
Example #41
0
    def test_scaleout(self):
        util.print_frame()
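        # Under constant load, repeat scale-out -> migrate pg0 to pg1 ->
        # migrate back -> scale-in five times, then verify that none of the
        # load generators observed an inconsistency.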

        # start load generator
        util.log("start load_generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            self.load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
            self.load_gen_thrd_list[i].start()

        time.sleep(5) # generate load for 5 sec
        util.log("started load_generator")

        # servers for scale out
        servers = [config.server4, config.server5, config.server6]
        leader_cm = self.cluster['servers'][0]

        # start migration
        migration_count = 5
        for i in range(migration_count):
            # Scale out
            cluster = config.clusters[0]
            ret = util.pg_add(cluster, servers, leader_cm)
            self.assertEqual(True, ret, 'Scale out fail. util.pg_add returns false')

            time.sleep(5)
            # pg0 -> pg1
            cluster = config.clusters[1]
            ret = util.migration(cluster, 0, 1, 4096, 8191, 40000)
            self.assertEqual(True, ret, 'Migration Fail 0 -> 1')

            # pg0 <- pg1
            cluster = config.clusters[1]
            ret = util.migration(cluster, 1, 0, 4096, 8191, 40000)
            self.assertEqual(True, ret, 'Migration Fail 1 <- 0')

            # Scale in

            #TODO Temporary
            #cluster = config.clusters[0]
            #for server in cluster['servers']:
            #    if testbase.request_to_shutdown_hbc(server) is not 0:
            #        util.log('scale in : failed to request to shutdown hbc')
            #        self.assertFalse('scale in : failed to request to shutdown hbc')
            #time.sleep(5)
            ###############

            cluster = config.clusters[1]
            ret = util.pg_del(cluster, servers, leader_cm)
            self.assertEqual(True, ret, 'Scale in fail. util.pg_del returns false')

            #TODO Temporary
            #cluster = config.clusters[0]
            #for server in cluster['servers']:
            #    if testbase.request_to_start_heartbeat_checker( server ) is not 0:
            #        util.log('scale in : failed to start hbc')
            #        self.assertFalse('scale in : failed to start hbc')
            #time.sleep(5)
            ###############

            # check consistency
            ok = True
            for j in range(len(self.load_gen_thrd_list)):
                if self.load_gen_thrd_list[j].isConsistent() == False:
                    ok = False
                    break
            if not ok:
                break

        time.sleep(5) # generate load for 5 sec
        # check consistency of load_generator
        for i in range(len(self.load_gen_thrd_list)):
            self.load_gen_thrd_list[i].quit()
        for i in range(len(self.load_gen_thrd_list)):
            self.load_gen_thrd_list[i].join()
            self.assertTrue(self.load_gen_thrd_list[i].isConsistent(), 'Inconsistent after migration')
Example #42
0
    def test_random_migrate(self):
        util.print_frame()

        # start load generator
        load_gen_thrd_list = {}
        util.log("start load_generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
            load_gen_thrd_list[i].start()

        ret = util.migration(self.cluster, 0, 1, 4096, 8191, 40000)
        self.assertEqual(True, ret, 'Migration Fail')

        leader_cm = self.cluster['servers'][0]
        cluster_name = self.cluster['cluster_name']
        mapping = [-1] * 8192

        count = 50
        while count > 0:
            # get PN -> PG map
            cmd = 'cluster_info %s' % cluster_name
            result = util.cm_command(leader_cm['ip'], leader_cm['cm_port'],
                                     cmd)
            ret = json.loads(result)
            rle = ret['data']['cluster_info']['PN_PG_Map']
            print "PN_PG_MAP = %s" % rle
            sp = rle.split()
            index = 0
            for i in range(len(sp) / 2):
                for j in range(int(sp[i * 2 + 1])):
                    mapping[index] = int(sp[i * 2])
                    index += 1

            slot = random.randint(0, 8191)
            src_pgid = mapping[slot]
            dst_pgid = (src_pgid + 1) % 2
            slot_end = slot
            while random.randint(0, 5) <= 4:
                if slot_end < 8191 and mapping[slot_end + 1] == src_pgid:
                    slot_end += 1
                else:
                    break

            print "SLOT=%d, SRC_PGID=%d, DST_PGID=%d" % (slot, src_pgid,
                                                         dst_pgid)
            ret = util.migration(self.cluster, src_pgid, dst_pgid, slot,
                                 slot_end, 40000)
            self.assertEqual(True, ret, 'Migration Fail')

            ok = True
            for j in range(len(load_gen_thrd_list)):
                if load_gen_thrd_list[j].isConsistent() == False:
                    ok = False
                    break
            if not ok:
                break

            count -= 1

        # check consistency of load_generator
        for i in range(len(load_gen_thrd_list)):
            load_gen_thrd_list[i].quit()
        for i in range(len(load_gen_thrd_list)):
            load_gen_thrd_list[i].join()
            self.assertTrue(load_gen_thrd_list[i].isConsistent(),
                            'Inconsistent after migration')

        # Go back to initial configuration
        cinfo = util.cluster_info(leader_cm['ip'], leader_cm['cm_port'],
                                  cluster_name)
        for slot in util.get_slots(cinfo['cluster_info']['PN_PG_Map'], 1):
            self.assertTrue(
                util.migration(self.cluster, 1, 0, slot['begin'], slot['end'],
                               40000), 'failed to rollback migration')
Example #43
0
    def test_1_role_change(self):
        util.print_frame()
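        # Exercise util.role_change 30 times against a slave with 3 copies,
        # then remove one server and repeat with 2 copies, checking that each
        # PGS reports a new timestamp after every election and that the load
        # generators stay consistent throughout.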

        self.load_gen_list = {}

        # Start load generator
        util.log("Start load_generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen = load_generator.LoadGenerator(i, ip, port)
            load_gen.start()
            self.load_gen_list[i] = load_gen

        # Loop (smr: 3 copy)
        for i in range(30):
            target_server = util.get_server_by_role(self.cluster['servers'],
                                                    'slave')
            self.assertNotEquals(target_server, None, 'Get slave fail.')
            target = target_server['id']

            print ''
            util.log("(3 copy) Loop:%d, target pgs:%d" % (i, target))

            # Get old timestamp
            util.log_server_state(self.cluster)
            old_timestamp_list = []
            for s in self.cluster['servers']:
                ts = util.get_timestamp_of_pgs(s)
                old_timestamp_list.append(ts)

            # Role change
            master = util.role_change(self.leader_cm,
                                      self.cluster['cluster_name'], target)
            self.assertNotEqual(master, -1, 'role_change error.')
            while target == master:
                target = (target + 1) % 3
            util.log('Change role success.')

            # Wait until role change finished
            for s in self.cluster['servers']:
                max_try_cnt = 20
                ok = False
                for try_cnt in range(max_try_cnt):
                    try:
                        pong = util.pingpong(s['ip'], s['redis_port'])
                        if pong != None and pong == '+PONG\r\n':
                            ok = True
                            break
                    except:
                        pass
                    time.sleep(0.2)
                self.assertTrue(ok, 'redis state error.')

            # Get new timestamp
            util.log_server_state(self.cluster)
            new_timestamp_list = []
            for s in self.cluster['servers']:
                ts = util.get_timestamp_of_pgs(s)
                new_timestamp_list.append(ts)

            # Compare old timestamps and new timestamps
            for i in range(3):
                self.assertNotEqual(
                    old_timestamp_list[i], new_timestamp_list[i],
                    'Timestamp is not changed. %d->%d' %
                    (old_timestamp_list[i], new_timestamp_list[i]))

            # Check Consistency
            for load_gen_id, load_gen in self.load_gen_list.items():
                self.assertTrue(load_gen.isConsistent(),
                                'Data inconsistency after role_change')

        # Loop (smr: 2 copy)
        self.__del_server(self.cluster['servers'][0])
        servers = [self.cluster['servers'][1], self.cluster['servers'][2]]

        normal_state = False
        for i in xrange(20):
            normal_state = util.check_cluster(self.cluster['cluster_name'],
                                              self.leader_cm['ip'],
                                              self.leader_cm['cm_port'],
                                              check_quorum=True)
            if normal_state:
                break
            time.sleep(0.5)
        self.assertTrue(normal_state, "Unstable cluster state")

        for i in range(30):
            print ''
            util.log("(2 copy) Loop:%d, target pgs:%d" % (i, target))

            s = util.get_server_by_role(servers, 'slave')
            target = s['id']

            # Get old timestamp
            util.log_server_state(self.cluster)
            old_timestamp_list = []
            for s in servers:
                ts = util.get_timestamp_of_pgs(s)
                old_timestamp_list.append(ts)

            # Role change
            master = util.role_change(self.leader_cm,
                                      self.cluster['cluster_name'], target)
            self.assertNotEqual(master, -1, 'role_change error.')
            while target == master:
                target = target % 2 + 1
            util.log('Change role success.')

            # Wait until role change finished
            for s in servers:
                max_try_cnt = 20
                ok = False
                for try_cnt in range(max_try_cnt):
                    pong = util.pingpong(s['ip'], s['redis_port'])
                    if pong != None and pong == '+PONG\r\n':
                        ok = True
                        break
                    time.sleep(0.1)
                self.assertTrue(ok, 'redis state error.')

            # Get new timestamp
            util.log_server_state(self.cluster)
            new_timestamp_list = []
            for s in servers:
                ts = util.get_timestamp_of_pgs(s)
                new_timestamp_list.append(ts)

            # Compare old timestamps and new timestamps
            for i in range(2):
                self.assertNotEqual(
                    old_timestamp_list[i], new_timestamp_list[i],
                    'Timestamp is not changed. %d->%d' %
                    (old_timestamp_list[i], new_timestamp_list[i]))

            # Check Consistency
            for load_gen_id, load_gen in self.load_gen_list.items():
                self.assertTrue(load_gen.isConsistent(),
                                'Data inconsistency after role_change')

        # Go back to initial configuration
        self.assertTrue(
            util.install_pgs(self.cluster,
                             self.cluster['servers'][0],
                             self.leader_cm,
                             rm_ckpt=False), 'failed to recover pgs.')
Example #44
0
    def test_migration_with_expire_command(self):
        util.print_frame()

        util.log("start load_generator")
        load_gen_thrd_list = {}
        for i in range(1):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
            load_gen_thrd_list[i].start()

        time.sleep(5)  # generate load for 5 sec
        tps = 20000
        src_pg_id = 0
        dst_pg_id = 1
        leader_cm = self.cluster['servers'][0]
        src_master = util.get_server_by_role_and_pg(self.cluster['servers'],
                                                    'master', src_pg_id)
        dst_master = util.get_server_by_role_and_pg(self.cluster['servers'],
                                                    'master', dst_pg_id)

        smr = smr_mgmt.SMR(src_master['id'])
        ret = smr.connect(src_master['ip'], src_master['smr_mgmt_port'])
        if ret != 0:
            util.log('failed to connect to smr(source master)')
            return False

        src_redis = redis_mgmt.Redis(src_master['id'])
        ret = src_redis.connect(src_master['ip'], src_master['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')

        dst_redis = redis_mgmt.Redis(dst_master['id'])
        ret = dst_redis.connect(dst_master['ip'], dst_master['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')

        ts = time.time()
        self.setExpireKey(src_redis,
                          'beforeCheckpoint~beforeCheckpoint:expired', 10)
        self.setExpireKey(src_redis,
                          'beforeCheckpoint~beforeCheckpoint:persist', 20)
        self.setExpireS3Key(src_redis,
                            'S3:beforeCheckpoint~beforeCheckpoint:expired', 10)
        self.setExpireS3Key(src_redis,
                            'S3:beforeCheckpoint~beforeCheckpoint:persist', 20)

        self.setExpireS3Key(src_redis, 'S3:PermanentKey', 0)
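        # ttl 0 on 'S3:PermanentKey': judging by its name, this key is meant
        # to survive the whole migration; its TTL is read back via getS3TTL
        # near the end of the test.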

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(src_redis,
                              'beforeCheckpoint~beforeCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis,
                              'beforeCheckpoint~beforeCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(
            src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(
            src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")

        util.log(">>> migrate test with expire command start(%s), ts:%d" %
                 (time.asctime(), ts))

        ts = time.time()
        self.setExpireKey(src_redis,
                          'beforeCheckpoint~afterCheckpoint:expired', 10)
        self.setExpireKey(src_redis,
                          'beforeCheckpoint~afterCheckpoint:persist', 20)
        self.setExpireS3Key(src_redis,
                            'S3:beforeCheckpoint~afterCheckpoint:expired', 10)
        self.setExpireS3Key(src_redis,
                            'S3:beforeCheckpoint~afterCheckpoint:persist', 20)

        # notify dst_redis of migration start
        util.log(">>> notify dst_redis of migration start (%s)" %
                 time.asctime())

        cmd = 'migconf migstart %d-%d\r\n' % (0, 8191)
        dst_redis.write(cmd)
        res = dst_redis.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n')

        # remote partial checkpoint
        util.log(">>> start remote checkpoint and load (%s)" % time.asctime())
        cmd = "./cluster-util --getandplay %s %d %s %d %d-%d %d" % (
            src_master['ip'], src_master['redis_port'], dst_master['ip'],
            dst_master['redis_port'], 0, 8191, tps)
        p = util.exec_proc_async(util.cluster_util_dir(src_master['id']), cmd,
                                 True, None, subprocess.PIPE, None)

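        # cluster-util prints "Checkpoint Sequence Number: <seq>"; it is
        # captured below because the remote catchup phase replays the SMR
        # log starting from this sequence number.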
        ret = p.wait()
        for line in p.stdout:
            if line.find("Checkpoint Sequence Number:") != -1:
                util.log("seqnumber : " + line[line.rfind(":") + 1:])
                seq = int(line[line.rfind(":") + 1:])
            util.log(">>>" + str(line.rstrip()))

        self.assertEqual(0, ret)
        util.log(">>> end remote checkpoint and load (%s)" % time.asctime())

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(src_redis,
                              'beforeCheckpoint~afterCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis,
                              'beforeCheckpoint~afterCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(src_redis,
                                'S3:beforeCheckpoint~afterCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(src_redis,
                                'S3:beforeCheckpoint~afterCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")

        # bgsave for testing later about recovery during migration
        util.log(
            ">>> bgsave for testing later about recovery during migration (%s)"
            % time.asctime())
        cmd = 'bgsave\r\n'
        dst_redis.write(cmd)
        res = dst_redis.read_until('\r\n')
        self.assertEquals(res, '+Background saving started\r\n')

        ts = time.time()
        self.setExpireKey(src_redis, 'afterCheckpoint~afterCheckpoint:expired',
                          10)
        self.setExpireKey(src_redis, 'afterCheckpoint~afterCheckpoint:persist',
                          20)
        self.setExpireS3Key(src_redis,
                            'S3:afterCheckpoint~afterCheckpoint:expired', 10)
        self.setExpireS3Key(src_redis,
                            'S3:afterCheckpoint~afterCheckpoint:persist', 20)

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(src_redis,
                              'afterCheckpoint~afterCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis,
                              'afterCheckpoint~afterCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(src_redis,
                                'S3:afterCheckpoint~afterCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(src_redis,
                                'S3:afterCheckpoint~afterCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        self.setExpireKey(src_redis, 'afterCheckpoint~duringCatchup:expired',
                          10)
        self.setExpireKey(src_redis, 'afterCheckpoint~duringCatchup:persist',
                          100)
        self.setExpireS3Key(src_redis,
                            'S3:afterCheckpoint~duringCatchup:expired', 10)
        self.setExpireS3Key(src_redis,
                            'S3:afterCheckpoint~duringCatchup:persist', 100)

        # remote catchup (smr log migration)
        util.log(">>> start remote catchup (%s)" % time.asctime())

        dst_host = dst_master['ip']
        dst_smr_port = dst_master['smr_base_port']
        rle = '1 8192'
        num_part = 8192

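        # 'migrate start <host> <port> <seq> <tps> <num_part> <rle>' starts
        # replaying the SMR log from the checkpoint sequence captured above,
        # rate-limited to `tps`; the RLE '1 8192' presumably marks all 8192
        # partitions as belonging to the migration.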
        smr.write('migrate start %s %d %d %d %d %s\r\n' %
                  (dst_host, dst_smr_port, seq, tps, num_part, rle))
        response = smr.read_until('\r\n')
        if response[:3] != '+OK':
            util.log('failed to execute migrate start command, response:%s' %
                     response)
            return False

        while True:
            smr.write('migrate info\r\n')
            response = smr.read_until('\r\n')
            seqs = response.split()
            logseq = int(seqs[1].split(':')[1])
            mig = int(seqs[2].split(':')[1])
            util.log('migrate info: %s' % response)
            if (logseq - mig < 500000):
                util.log('Remote catchup almost done. try mig2pc')
                break
            time.sleep(1)

        util.log(">>> sleep until 90 sec pass")
        self.assertFalse(time.time() - ts >= 90)
        time.sleep(90 - (time.time() - ts))

        res = self.persistKey(src_redis,
                              'afterCheckpoint~duringCatchup:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis,
                              'afterCheckpoint~duringCatchup:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(src_redis,
                                'S3:afterCheckpoint~duringCatchup:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(src_redis,
                                'S3:afterCheckpoint~duringCatchup:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        self.setExpireKey(src_redis, 'duringCatchup~duringCatchup:expired', 10)
        self.setExpireKey(src_redis, 'duringCatchup~duringCatchup:persist', 20)
        self.setExpireS3Key(src_redis,
                            'S3:duringCatchup~duringCatchup:expired', 10)
        self.setExpireS3Key(src_redis,
                            'S3:duringCatchup~duringCatchup:persist', 20)

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(src_redis, 'duringCatchup~duringCatchup:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis, 'duringCatchup~duringCatchup:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(src_redis,
                                'S3:duringCatchup~duringCatchup:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(src_redis,
                                'S3:duringCatchup~duringCatchup:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        self.setExpireKey(src_redis, 'duringCatchup~afterMig2pc:expired', 10)
        self.setExpireKey(src_redis, 'duringCatchup~afterMig2pc:persist', 20)
        self.setExpireS3Key(src_redis, 'S3:duringCatchup~afterMig2pc:expired',
                            10)
        self.setExpireS3Key(src_redis, 'S3:duringCatchup~afterMig2pc:persist',
                            20)

        util.log(">>> remote catchup phase almost done (%s)" % time.asctime())

        # mig2pc
        util.log(">>> start mig2pc (%s)" % time.asctime())

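        # mig2pc presumably runs the two-phase commit that switches
        # ownership of slots 0-8191 from src_pg_id to dst_pg_id in the
        # cluster configuration; from here on, writes for those slots land
        # on the destination PG.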
        cmd = 'mig2pc %s %d %d %d %d' % (self.cluster['cluster_name'],
                                         src_pg_id, dst_pg_id, 0, 8191)
        result = util.cm_command(leader_cm['ip'], leader_cm['cm_port'], cmd)
        util.log('mig2pc result : ' + result)
        if not result.startswith('{"state":"success","msg":"+OK"}\r\n'):
            util.log('failed to execute mig2pc command, result:%s' % result)
            return False

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(dst_redis, 'duringCatchup~afterMig2pc:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(dst_redis, 'duringCatchup~afterMig2pc:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(dst_redis,
                                'S3:duringCatchup~afterMig2pc:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(dst_redis,
                                'S3:duringCatchup~afterMig2pc:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        self.setExpireKey(dst_redis, 'afterMig2pc~migrateEnd:expired', 10)
        self.setExpireKey(dst_redis, 'afterMig2pc~migrateEnd:persist', 20)
        self.setExpireS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:expired', 10)
        self.setExpireS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:persist', 20)

        # finish migration
        smr.write('migrate interrupt\r\n')
        response = smr.read_until('\r\n')
        util.log('migrate interrupt: %s' % response)
        smr.disconnect()

        # notify dst_redis of migration end
        util.log(">>> notify dst_redis of migration end (%s)" % time.asctime())

        cmd = 'migconf migend\r\n'
        dst_redis.write(cmd)
        res = dst_redis.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n')

        cmd = 'migconf clearstart %d-%d\r\n' % (0, 8191)
        src_redis.write(cmd)
        res = src_redis.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n')

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(dst_redis, 'afterMig2pc~migrateEnd:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(dst_redis, 'afterMig2pc~migrateEnd:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        self.assertTrue(
            self.isExist(dst_redis,
                         'beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(
            self.isExist(dst_redis,
                         'beforeCheckpoint~beforeCheckpoint:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:beforeCheckpoint~beforeCheckpoint:expired'))

        self.assertTrue(
            self.isExist(dst_redis,
                         'beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isExist(dst_redis,
                         'beforeCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:beforeCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:afterCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:afterCheckpoint~duringCatchup:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'duringCatchup~duringCatchup:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'duringCatchup~duringCatchup:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:duringCatchup~duringCatchup:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:duringCatchup~duringCatchup:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'duringCatchup~afterMig2pc:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'duringCatchup~afterMig2pc:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'afterMig2pc~migrateEnd:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'afterMig2pc~migrateEnd:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:expired'))

        # rangedel: delete the migrated slot range from the source
        util.log(">>> start rangedel (%s)" % time.asctime())
        cmd = "./cluster-util --rangedel %s %d %d-%d %d" % (
            src_master['ip'], src_master['redis_port'], 0, 8191, tps)
        p = util.exec_proc_async(util.cluster_util_dir(src_master['id']), cmd,
                                 True, None, subprocess.PIPE, None)
        ret = p.wait()

        for line in p.stdout:
            util.log(">>>" + str(line.rstrip()))

        cmd = 'migconf clearend\r\n'
        src_redis.write(cmd)
        res = src_redis.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n')

        time.sleep(5)  # generate load for 5 sec
        # check consistency of load_generator
        for i in range(len(load_gen_thrd_list)):
            load_gen_thrd_list[i].quit()
        for i in range(len(load_gen_thrd_list)):
            load_gen_thrd_list[i].join()
            self.assertTrue(load_gen_thrd_list[i].isConsistent(),
                            'Inconsistent after migration')

        # kill dst_redis and recover from bgsave
        util.log(">>> kill dst_redis and recover from bgsave (%s)" %
                 time.asctime())

        dst_redis.disconnect()
        ret = testbase.request_to_shutdown_redis(dst_master)
        self.assertEquals(ret, 0, 'failed to shutdown redis')
        ret = testbase.request_to_shutdown_smr(dst_master)
        self.assertEquals(ret, 0, 'failed to shutdown smr')
        time.sleep(5)

        ret = testbase.request_to_start_smr(dst_master)
        self.assertEqual(ret, 0,
                         'failed to start smr, server:%d' % dst_master['id'])

        ret = testbase.request_to_start_redis(dst_master)
        self.assertEqual(ret, 0,
                         'failed to start redis, server:%d' % dst_master['id'])

        ret = testbase.wait_until_finished_to_set_up_role(dst_master)
        self.assertEquals(
            ret, 0, 'failed to role change. server:%d' % (dst_master['id']))

        dst_redis = redis_mgmt.Redis(dst_master['id'])
        ret = dst_redis.connect(dst_master['ip'], dst_master['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')

        self.assertTrue(
            self.isExist(dst_redis,
                         'beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(
            self.isExist(dst_redis,
                         'beforeCheckpoint~beforeCheckpoint:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:beforeCheckpoint~beforeCheckpoint:expired'))

        self.assertTrue(
            self.isExist(dst_redis,
                         'beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isExist(dst_redis,
                         'beforeCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:beforeCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:afterCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:afterCheckpoint~duringCatchup:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'duringCatchup~duringCatchup:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'duringCatchup~duringCatchup:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:duringCatchup~duringCatchup:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:duringCatchup~duringCatchup:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'duringCatchup~afterMig2pc:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'duringCatchup~afterMig2pc:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'afterMig2pc~migrateEnd:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'afterMig2pc~migrateEnd:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:expired'))

        self.getS3TTL(dst_redis, 'S3:PermanentKey')

        # kill dst_slave redis and recover without dump file
        util.log(">>> kill dst_redis and recover without dump file (%s)" %
                 time.asctime())
        dst_slave = util.get_server_by_role_and_pg(self.cluster['servers'],
                                                   'slave', dst_pg_id)

        ret = testbase.request_to_shutdown_redis(dst_slave)
        self.assertEquals(ret, 0, 'failed to shutdown redis')
        ret = testbase.request_to_shutdown_smr(dst_slave)
        self.assertEquals(ret, 0, 'failed to shutdown smr')
        time.sleep(5)

        ret = testbase.request_to_start_smr(dst_slave)
        self.assertEqual(ret, 0,
                         'failed to start smr, server:%d' % dst_slave['id'])

        ret = testbase.request_to_start_redis(dst_slave)
        self.assertEqual(ret, 0,
                         'failed to start redis, server:%d' % dst_slave['id'])

        ret = testbase.wait_until_finished_to_set_up_role(dst_slave)
        self.assertEquals(
            ret, 0, 'failed to role change. server:%d' % (dst_slave['id']))

        dst_redis_slave = redis_mgmt.Redis(dst_slave['id'])
        ret = dst_redis_slave.connect(dst_slave['ip'], dst_slave['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')

        self.assertTrue(
            self.isExist(dst_redis_slave,
                         'beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(
            self.isExist(dst_redis_slave,
                         'beforeCheckpoint~beforeCheckpoint:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis_slave,
                           'S3:beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis_slave,
                           'S3:beforeCheckpoint~beforeCheckpoint:expired'))

        self.assertTrue(
            self.isExist(dst_redis_slave,
                         'beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isExist(dst_redis_slave,
                         'beforeCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis_slave,
                           'S3:beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis_slave,
                           'S3:beforeCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(
            self.isExist(dst_redis_slave,
                         'afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isExist(dst_redis_slave,
                         'afterCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis_slave,
                           'S3:afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis_slave,
                           'S3:afterCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(
            self.isExist(dst_redis_slave,
                         'afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(
            self.isExist(dst_redis_slave,
                         'afterCheckpoint~duringCatchup:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis_slave,
                           'S3:afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis_slave,
                           'S3:afterCheckpoint~duringCatchup:expired'))

        self.assertTrue(
            self.isExist(dst_redis_slave,
                         'duringCatchup~duringCatchup:persist'))
        self.assertFalse(
            self.isExist(dst_redis_slave,
                         'duringCatchup~duringCatchup:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis_slave,
                           'S3:duringCatchup~duringCatchup:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis_slave,
                           'S3:duringCatchup~duringCatchup:expired'))

        self.assertTrue(
            self.isExist(dst_redis_slave, 'duringCatchup~afterMig2pc:persist'))
        self.assertFalse(
            self.isExist(dst_redis_slave, 'duringCatchup~afterMig2pc:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis_slave,
                           'S3:duringCatchup~afterMig2pc:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis_slave,
                           'S3:duringCatchup~afterMig2pc:expired'))

        self.assertTrue(
            self.isExist(dst_redis_slave, 'afterMig2pc~migrateEnd:persist'))
        self.assertFalse(
            self.isExist(dst_redis_slave, 'afterMig2pc~migrateEnd:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis_slave,
                           'S3:afterMig2pc~migrateEnd:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis_slave,
                           'S3:afterMig2pc~migrateEnd:expired'))

        self.getS3TTL(dst_redis_slave, 'S3:PermanentKey')

        # Go back to initial configuration
        self.assertTrue(
            util.migration(self.cluster, dst_pg_id, src_pg_id, 0, 8191, 40000),
            'failed to rollback migration')
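
The long assertion block above leans on isExist and isS3Exist helpers that the snippet does not include. Below is a minimal sketch of what isExist could look like, assuming the same redis_mgmt.Redis write/read_until interface used throughout these examples; the S3 variant would follow the same pattern with the matching S3 object command, whose exact name is not shown here.

    # Minimal sketch of the isExist helper assumed above: it speaks raw
    # RESP over the same write/read_until interface used in these tests.
    # Error handling is omitted for brevity.
    def isExist(self, redis, key):
        redis.write('exists %s\r\n' % key)
        res = redis.read_until('\r\n')      # e.g. ':1\r\n' or ':0\r\n'
        return res == ':1\r\n'

    # isS3Exist would follow the same write/read_until pattern with the
    # corresponding S3 object command; its exact name is not shown here.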
Example #45
    def test_4_role_change_with_failover(self):
        util.print_frame()

        loop_cnt = 0
        while loop_cnt < 5:
            util.log('')
            util.log('Loop:%d' % loop_cnt)

            util.log("States (before role change)")
            util.log_server_state(self.cluster)

            target = random.choice(self.cluster['servers'])

            # bgsave
            ret = util.bgsave(target)
            self.assertTrue(ret, 'failed to bgsave. pgs:%d' % target['id'])

            # shutdown
            util.log('shutdown pgs%d(%s:%d)' %
                     (target['id'], target['ip'], target['smr_base_port']))
            ret = testbase.request_to_shutdown_smr(target)
            self.assertEqual(ret, 0, 'failed to shutdown smr')

            ret = testbase.request_to_shutdown_redis(target)
            self.assertEquals(ret, 0, 'failed to shutdown redis')

            r = ''
            expected = 'N'
            for fc_cnt in xrange(20):
                r = util.get_smr_role_of_cm(target, self.leader_cm)
                if r == expected:
                    break
                time.sleep(0.5)
            self.assertEquals(r, expected, 'failure detection error.')

            running_servers = []
            for s in self.cluster['servers']:
                if s != target:
                    running_servers.append(s)

            # Get old timestamp
            old_timestamps = {}
            for s in running_servers:
                ts = util.get_timestamp_of_pgs(s)
                old_timestamps[s['id']] = ts

            # Start load generator
            self.load_gen_list = {}
            util.log('start load generator')
            for i in range(self.max_load_generator):
                ip, port = util.get_rand_gateway(self.cluster)
                load_gen = load_generator.LoadGenerator(i, ip, port)
                load_gen.start()
                self.load_gen_list[i] = load_gen

            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')

            # Role change
            master_id = util.role_change(self.leader_cm,
                                         self.cluster['cluster_name'],
                                         s1['id'])
            self.assertNotEqual(master_id, -1, 'role_change failed')

            util.log("States (after role change)")
            util.log_server_state(self.cluster)

            # Check - get new timestamp
            new_timestamps = {}
            for s in running_servers:
                ts = util.get_timestamp_of_pgs(s)
                new_timestamps[s['id']] = ts

            # Check - compare old timestamps and new timestamps
            for s in running_servers:
                old_ts = old_timestamps[s['id']]
                new_ts = new_timestamps[s['id']]
                self.assertNotEqual(
                    old_ts, new_ts,
                    'Timestamp of a running server has not changed. %d->%d' %
                    (old_ts, new_ts))

            # Check quorum
            m = self.cluster['servers'][master_id]
            expected = 1
            ok = self.__check_quorum(m, expected)
            self.assertTrue(
                ok, 'unexpected quorum(after role change). expected:%s' %
                (expected))

            # recovery
            util.log('recovery pgs%d(%s:%d)' %
                     (target['id'], target['ip'], target['smr_base_port']))
            ret = testbase.request_to_start_smr(target)
            self.assertEqual(ret, 0, 'failed to start smr')
            util.log('start smr-replicator done')

            ret = testbase.request_to_start_redis(target, 60)
            self.assertEqual(ret, 0, 'failed to start redis')
            util.log('start redis-arc done')

            ret = testbase.wait_until_finished_to_set_up_role(target,
                                                              max_try=300)
            self.assertEquals(
                ret, 0, 'failed to role change. smr_id:%d' % (target['id']))

            util.log("States (after recovery)")
            util.log_server_state(self.cluster)

            # Check cluster state
            normal_state = False
            for i in xrange(20):
                normal_state = util.check_cluster(self.cluster['cluster_name'],
                                                  self.leader_cm['ip'],
                                                  self.leader_cm['cm_port'],
                                                  check_quorum=True)
                if normal_state:
                    break
                time.sleep(0.5)
            self.assertTrue(normal_state, "Unstable cluster state")

            # Check quorum
            expected = 2
            ok = self.__check_quorum(m, expected)
            self.assertTrue(
                ok,
                'unexpected quorum(after recovery). expected:%s' % (expected))

            # Check consistency
            util.log('stop load generator')
            for i in range(self.max_load_generator):
                self.load_gen_list[i].quit()
            for i in range(self.max_load_generator):
                self.load_gen_list[i].join()
                self.assertTrue(self.load_gen_list[i].isConsistent(),
                                'Inconsistent after role change')
                self.load_gen_list.pop(i, None)

            loop_cnt += 1

        return 0
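
Example #45 above calls a private __check_quorum helper that the snippet omits. A plausible sketch follows, assuming the smr_mgmt.SMR management connection seen in other examples and a 'getquorum' management command; the command name and reply format are assumptions, not the verified SMR protocol.

    # Hypothetical sketch of __check_quorum: poll the master's SMR
    # management port until the reported quorum matches the expectation.
    def __check_quorum(self, master, expected, max_try=20):
        smr = smr_mgmt.SMR(master['id'])
        if smr.connect(master['ip'], master['smr_mgmt_port']) != 0:
            return False
        for i in range(max_try):
            smr.write('getquorum\r\n')          # assumed command name
            reply = smr.read_until('\r\n', 1)
            if reply != None and reply.strip().isdigit() \
                    and int(reply.strip()) == expected:
                return True
            time.sleep(0.5)
        return False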
Example #46
    def test_moving_pgs(self):
        util.print_frame()

        # start load generator
        util.log("start load_generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            self.load_gen_thrd_list[i] = load_generator.LoadGenerator(
                i, ip, port)
            self.load_gen_thrd_list[i].start()

        util.log("started load_generator")

        servers = self.cluster['servers']

        gw_list = []
        for server in servers:
            gw = {}
            gw['mgmt'] = telnetlib.Telnet(server['ip'],
                                          server['gateway_port'] + 1)
            gw['normal'] = telnetlib.Telnet(server['ip'],
                                            server['gateway_port'])
            gw_list.append(gw)

        n = 0
        step = 0
        iter = 30
        while iter > 0:
            if n == 0 or random.randint(0, 1) == 0:
                step = random.randint(1, 10)
            else:
                step = -1 * random.randint(1, n)

            print "<<< ITER = %d, PG%d -> PG%d, PG%d -> PG%d >>>" % (
                iter, n * 2, (n + step) * 2, n * 2 + 1, (n + step) * 2 + 1)
            gw = gw_list[0]
            self.pgs_del_server(gw['mgmt'], servers[0], n)
            self.pgs_del_server(gw['mgmt'], servers[1], n)
            self.pgs_del_server(gw['mgmt'], servers[5], n)

            gw['mgmt'].write("pg_add %d\r\n" % ((n + step) * 2))
            gw['mgmt'].read_until("+OK\r\n")
            gw['mgmt'].write("pg_add %d\r\n" % ((n + step) * 2 + 1))
            gw['mgmt'].read_until("+OK\r\n")

            self.pgs_add_server(gw['mgmt'], servers[0], n + step)
            self.pgs_add_server(gw['mgmt'], servers[1], n + step)
            self.pgs_add_server(gw['mgmt'], servers[5], n + step)

            while True:
                gw['normal'].write("info gateway\r\n")
                ret = gw['normal'].read_until("\r\n", 1)
                if "-ERR" in ret:
                    continue
                ret = gw['normal'].read_until("\r\n\r\n", 1)
                #print ret
                if "gateway_disconnected_redis:0\r\n" in ret:
                    break

            gw['mgmt'].write("delay 0 4095\r\n")
            gw['mgmt'].read_until("+OK\r\n")
            gw['mgmt'].write("delay 4096 8191\r\n")
            gw['mgmt'].read_until("+OK\r\n")

            gw['mgmt'].write("redirect 0 4095 %d\r\n" % ((n + step) * 2))
            gw['mgmt'].read_until("+OK\r\n")
            gw['mgmt'].write("redirect 4096 8191 %d\r\n" %
                             ((n + step) * 2 + 1))
            gw['mgmt'].read_until("+OK\r\n")

            gw_list[0]['mgmt'].write("cluster_info\r\nping\r\n")
            print gw_list[0]['mgmt'].read_until("+PONG\r\n")

            self.pgs_del_server(gw['mgmt'], servers[2], n)
            self.pgs_del_server(gw['mgmt'], servers[3], n)
            self.pgs_del_server(gw['mgmt'], servers[4], n)

            self.pgs_add_server(gw['mgmt'], servers[2], n + step)
            self.pgs_add_server(gw['mgmt'], servers[3], n + step)
            self.pgs_add_server(gw['mgmt'], servers[4], n + step)

            while True:
                gw['normal'].write("info gateway\r\n")
                ret = gw['normal'].read_until("\r\n", 1)
                if "-ERR" in ret:
                    continue
                ret = gw['normal'].read_until("\r\n\r\n", 1)
                #print ret
                if "gateway_disconnected_redis:0\r\n" in ret:
                    break

            gw['mgmt'].write("pg_del %d\r\n" % (n * 2))
            gw['mgmt'].read_until("+OK\r\n")
            gw['mgmt'].write("pg_del %d\r\n" % (n * 2 + 1))
            gw['mgmt'].read_until("+OK\r\n")

            n += step

            gw_list[0]['mgmt'].write("cluster_info\r\nping\r\n")
            print gw_list[0]['mgmt'].read_until("+PONG\r\n")
            iter -= 1

        # check consistency of load_generator
        for i in range(len(self.load_gen_thrd_list)):
            self.load_gen_thrd_list[i].quit()
        for i in range(len(self.load_gen_thrd_list)):
            self.load_gen_thrd_list[i].join()
            self.assertTrue(self.load_gen_thrd_list[i].isConsistent(),
                            'Inconsistent after gateway_mgmt test')
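
test_moving_pgs drives the gateway through pgs_del_server and pgs_add_server helpers that are not part of the snippet. The sketch below mirrors the pg_add/pg_del telnet exchanges visible in the test body; the pgs_add argument layout is an assumption, not taken from the real gateway command spec.

    # Sketch of the pgs_add_server / pgs_del_server helpers, mirroring
    # the pg_add/pg_del telnet exchanges above.
    def pgs_add_server(self, conn, server, pg_num):
        conn.write("pgs_add %d %d %s %d\r\n" %
                   (server['id'], pg_num * 2, server['ip'],
                    server['redis_port']))
        conn.read_until("+OK\r\n")

    def pgs_del_server(self, conn, server, pg_num):
        # pg_num kept for signature parity; deletion only needs the id
        conn.write("pgs_del %d\r\n" % server['id'])
        conn.read_until("+OK\r\n")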
Example #47
    def test_rdb_backups(self):
        util.print_frame()
        bgsave_count = 50
        org_path = os.getcwd()
        os.chdir(util.redis_dir(0))
        server0 = self.cluster['servers'][0]
        redis0 = telnetlib.Telnet(server0['ip'], server0['redis_port'])

        util.log("Starting load generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            self.load_gen_thrd_list[i] = load_generator.LoadGenerator(
                i, ip, port)
            self.load_gen_thrd_list[i].start()

        util.log("Set the number of rdb backups = 24")
        redis0.write("config set number-of-rdb-backups 24\r\n")
        redis0.read_until("+OK\r\n")

        util.log("Clear old rdb backups\r\n")
        for f in os.listdir('.'):
            if (f.endswith('.rdb')):
                os.remove(f)

        util.log(
            "Bgsaving continuously and counting the number of rdb backups")
        for i in range(bgsave_count):
            # Save current time before Bgsaving
            redis0.write('time\r\n')
            redis0.read_until('\r\n', 1)
            redis0.read_until('\r\n', 1)
            ret = redis0.read_until('\r\n', 1)
            redis_server_time = int(ret.strip())
            redis0.read_until('\r\n', 1)
            redis0.read_until('\r\n', 1)

            time.sleep(1.1)

            redis0.write('time\r\n')
            redis0.read_until('\r\n', 1)
            redis0.read_until('\r\n', 1)
            ret = redis0.read_until('\r\n', 1)
            self.assertNotEqual(redis_server_time, int(ret.strip()))
            redis0.read_until('\r\n', 1)
            redis0.read_until('\r\n', 1)

            util.log("%d ~ %d" % (redis_server_time, int(ret.strip())))

            # Bgsave
            redis0.write("bgsave\r\n")
            ret = redis0.read_until('\r\n', 1)
            self.assertEqual('+Background saving started\r\n', ret)

            # Wait finishing bgsave
            while True:
                redis0.write('lastsave\r\n')
                ret = redis0.read_until('\r\n', 1)
                lastsave_time = int(ret[1:].strip())
                if lastsave_time > redis_server_time: break
                time.sleep(0.1)

            # Count the number of rdb backups
            rdb_list = [
                name for name in os.listdir('.') if os.path.isfile(name)
                and name.startswith('dump') and name.endswith('.rdb')
            ]

            util.log(rdb_list)
            util.log("Iteration:%d, rdb Backups:%d" % (i + 1, len(rdb_list)))

            self.assertTrue((i + 1 > 24 and len(rdb_list) == 25)
                            or len(rdb_list) == i + 1)
            self.assertTrue('dump.rdb' in rdb_list)

        util.log("\nSet the number of rdb backups = 5")
        redis0.write("config set number-of-rdb-backups 5\r\n")
        redis0.read_until("+OK\r\n")

        for i in range(3):
            # Save current time before Bgsaving
            redis0.write('time\r\n')
            redis0.read_until('\r\n', 1)
            redis0.read_until('\r\n', 1)
            ret = redis0.read_until('\r\n', 1)
            redis_server_time = int(ret.strip())
            redis0.read_until('\r\n', 1)
            redis0.read_until('\r\n', 1)

            time.sleep(1.1)

            # Bgsave
            redis0.write("bgsave\r\n")
            ret = redis0.read_until('\r\n', 1)
            self.assertEqual('+Background saving started\r\n', ret)

            # Wait finishing bgsave
            while True:
                redis0.write('lastsave\r\n')
                ret = redis0.read_until('\r\n', 1)
                lastsave_time = int(ret[1:].strip())
                if lastsave_time > redis_server_time: break
                time.sleep(0.1)

            # Count the number of rdb backups
            rdb_list = [
                name for name in os.listdir('.') if os.path.isfile(name)
                and name.startswith('dump') and name.endswith('.rdb')
            ]

            util.log(rdb_list)
            util.log("Iteration:%d, rdb Backups:%d" % (i + 1, len(rdb_list)))

            self.assertTrue(len(rdb_list) == 6)
            self.assertTrue('dump.rdb' in rdb_list)

        # check consistency of load_generator
        for i in range(len(self.load_gen_thrd_list)):
            self.load_gen_thrd_list[i].quit()
        for i in range(len(self.load_gen_thrd_list)):
            self.load_gen_thrd_list[i].join()
            self.assertTrue(self.load_gen_thrd_list[i].isConsistent(),
                            'Inconsistent after rdb backup test')
        os.chdir(org_path)
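
Each 'time' command in test_rdb_backups is parsed with five raw read_until calls. The helper below makes that RESP layout explicit; it is a sketch against the same telnetlib connection, not part of the original suite.

# Sketch: read one 'time' reply from a telnet connection to redis.
# RESP layout: *2\r\n $N\r\n <seconds>\r\n $N\r\n <microseconds>\r\n
def read_redis_time(tn):
    tn.write('time\r\n')
    tn.read_until('\r\n', 1)                  # *2  (array header)
    tn.read_until('\r\n', 1)                  # $N  (bulk length)
    seconds = int(tn.read_until('\r\n', 1).strip())
    tn.read_until('\r\n', 1)                  # $N  (bulk length)
    microseconds = int(tn.read_until('\r\n', 1).strip())
    return seconds, microseconds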
Example #48
    def test_two_slaves_hang( self ):
        util.print_frame()

        self.setup_test_cluster( self.cluster_3copy )

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # set values
        for i in range( 0, 10000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss( self.cluster )
        self.assertNotEqual( m, None, 'master is None.' )
        self.assertNotEqual( s1, None, 'slave1 is None.' )
        self.assertNotEqual( s2, None, 'slave2 is None.' )

        util.log( 'server state before hang' )
        util.log_server_state( self.cluster )

        # timestamp before hang
        ts_before1 = util.get_timestamp_of_pgs( s1 )
        self.assertNotEqual( ts_before1, -1, 'failed to get a timestamp of pgs(%d), ts_before:%d' % (s1['id'], ts_before1) )

        ts_before2 = util.get_timestamp_of_pgs( s2 )
        self.assertNotEqual( ts_before2, -1, 'failed to get a timestamp of pgs(%d), ts_before:%d' % (s2['id'], ts_before2) )

        # hang
        smr1 = smr_mgmt.SMR( s1['id'] )
        ret = smr1.connect( s1['ip'], s1['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to slave1. %s:%d' % (s1['ip'], s1['smr_mgmt_port']) )

        smr2 = smr_mgmt.SMR( s2['id'] )
        ret = smr2.connect( s2['ip'], s2['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to slave2. %s:%d' % (s2['ip'], s2['smr_mgmt_port']) )

        smr1.write( 'fi delay sleep 1 8000\r\n' )
        reply = smr1.read_until( '\r\n', 1 )
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual( 0, 1, 'make sure that smr has compiled with gcov option.' )

        smr2.write( 'fi delay sleep 1 8000\r\n' )
        time.sleep( 7 )

        # wait for rejoin as a slave
        success = False
        for i in range( 20 ):
            role = util.get_role_of_server( s1 )
            if role == c.ROLE_SLAVE:
                ts_after = util.get_timestamp_of_pgs( s1 )
                if ts_after != -1 and ts_before1 == ts_after:
                    success = True
                    break
            time.sleep( 1 )
        self.assertEqual( success, True, 'failed to rejoin as a slave. %s:%d' % (s1['ip'], s1['smr_mgmt_port']) )

        success = False
        for i in range( 20 ):
            role = util.get_role_of_server( s2 )
            if role == c.ROLE_SLAVE:
                ts_after = util.get_timestamp_of_pgs( s2 )
                if ts_after != -1 and ts_before2 == ts_after:
                    success = True
                    break
            time.sleep( 1 )
        self.assertEqual( success, True, 'failed to rejoin as a slave. %s:%d' % (s2['ip'], s2['smr_mgmt_port']) )

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        redis1 = redis_mgmt.Redis( s1['id'] )
        ret = redis1.connect( s1['ip'], s1['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']) )

        redis2 = redis_mgmt.Redis( s2['id'] )
        ret = redis2.connect( s2['ip'], s2['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']) )

        # set new values
        for i in range( 10000, 20000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write( cmd )
            res = redis1.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res) )

        # check new values
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write( cmd )
            redis2.read_until( '\r\n' )
            res = redis2.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis2. %s != %d' % (res, i) )

        # check consistency
        self.assertEqual(util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port), True, 'role consistency fail')

        return 0
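
The two rejoin loops in test_two_slaves_hang share one poll-until-slave pattern. Generalized below, using only the util calls already present in the example:

# Sketch: wait until a pgs rejoins as a slave with an unchanged
# timestamp (i.e. it recovered from a hang without a restart).
def wait_for_slave_rejoin(server, ts_before, max_try=20):
    for i in range(max_try):
        if util.get_role_of_server(server) == c.ROLE_SLAVE:
            ts_after = util.get_timestamp_of_pgs(server)
            if ts_after != -1 and ts_after == ts_before:
                return True
        time.sleep(1)
    return False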
Example #49
    def test_random_migrate(self):
        util.print_frame()

        # start load generator
        load_gen_thrd_list = {}
        util.log("start load_generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
            load_gen_thrd_list[i].start()

        ret = util.migration(self.cluster, 0, 1, 4096, 8191, 40000)
        self.assertEqual(True, ret, 'Migration Fail')

        leader_cm = self.cluster['servers'][0]
        cluster_name = self.cluster['cluster_name']
        mapping = [-1] * 8192

        count = 50
        while count > 0:
            # get PN -> PG map
            cmd = 'cluster_info %s' % cluster_name
            result = util.cm_command(leader_cm['ip'], leader_cm['cm_port'], cmd)
            ret = json.loads(result)
            rle = ret['data']['cluster_info']['PN_PG_Map']
            print "PN_PG_MAP = %s" % rle
            sp = rle.split()
            index = 0
            for i in range(len(sp)/2):
                for j in range(int(sp[i*2+1])):
                    mapping[index] = int(sp[i*2])
                    index += 1

            slot = random.randint(0, 8191)
            src_pgid = mapping[slot]
            dst_pgid = (src_pgid+1) % 2
            slot_end = slot
            while random.randint(0,5) <= 4:
                if slot_end < 8191 and mapping[slot_end+1] == src_pgid:
                    slot_end += 1
                else:
                    break

            print "SLOT=%d, SRC_PGID=%d, DST_PGID=%d" % (slot, src_pgid, dst_pgid)
            ret = util.migration(self.cluster, src_pgid, dst_pgid, slot, slot_end, 40000)
            self.assertEqual(True, ret, 'Migration Fail')

            ok = True
            for j in range(len(load_gen_thrd_list)):
                if load_gen_thrd_list[j].isConsistent() == False:
                    ok = False
                    break
            if not ok:
                break

            count -= 1

        # check consistency of load_generator
        for i in range(len(load_gen_thrd_list)):
            load_gen_thrd_list[i].quit()
        for i in range(len(load_gen_thrd_list)):
            load_gen_thrd_list[i].join()
            self.assertTrue(load_gen_thrd_list[i].isConsistent(), 'Inconsistent after migration')

        # Go back to initial configuration
        cinfo = util.cluster_info(leader_cm['ip'], leader_cm['cm_port'], cluster_name)
        for slot in util.get_slots(cinfo['cluster_info']['PN_PG_Map'], 1):
            self.assertTrue(util.migration(self.cluster, 1, 0, slot['begin'], slot['end'], 40000),
                    'failed to rollback migration')
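
test_random_migrate decodes the PN_PG_Map run-length encoding inline (pairs of '<pgid> <count>'). The same logic, extracted into a standalone function:

# Decode the PN_PG_Map run-length encoding parsed inline above:
# "0 4096 1 4096" means slots 0..4095 -> PG0 and slots 4096..8191 -> PG1.
def decode_pn_pg_map(rle, num_slots=8192):
    mapping = [-1] * num_slots
    sp = rle.split()
    index = 0
    for i in range(len(sp) // 2):
        pgid = int(sp[i * 2])
        count = int(sp[i * 2 + 1])
        for j in range(count):
            mapping[index] = pgid
            index += 1
    return mapping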
Example #50
    def master_hang( self ):
        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # set values
        for i in range( 0, 10000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        # get master, slave1, slave2
        if len(self.cluster['servers']) == 3:
            m, s1, s2 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )
            self.assertNotEqual( s2, None, 'slave2 is None.' )
        else:
            m, s1 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )

        util.log( 'server state before hang' )
        util.log_server_state( self.cluster )

        # hang
        smr = smr_mgmt.SMR( m['id'] )
        ret = smr.connect( m['ip'], m['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']) )
        smr.write( 'fi delay sleep 1 10000\r\n' )
        reply = smr.read_until( '\r\n', 1 )
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual( 0, 1, 'make sure that smr has compiled with gcov option.' )

        time.sleep( 5 )

        # wait for forced master election
        success = False
        for i in range( 20 ):
            role = util.get_role_of_server( s1 )
            if role == c.ROLE_MASTER:
                success = True
                break

            if len(self.cluster['servers']) == 3:
                role = util.get_role_of_server( s2 )
                if role == c.ROLE_MASTER:
                    success = True
                    break
            time.sleep( 1 )

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        self.assertEqual( success, True, 'failed to forced master election' )

        redis1 = redis_mgmt.Redis( s1['id'] )
        ret = redis1.connect( s1['ip'], s1['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']) )

        # set new values
        for i in range( 10000, 20000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write( cmd )
            res = redis1.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res) )

        if len(self.cluster['servers']) == 3:
            redis2 = redis_mgmt.Redis( s2['id'] )
            ret = redis2.connect( s2['ip'], s2['redis_port'] )
            self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']) )

            # check new values
            for i in range( 10000, 20000 ):
                cmd = 'get %s%d\r\n' % (self.key_base, i)
                redis2.write( cmd )
                redis2.read_until( '\r\n' )
                res = redis2.read_until( '\r\n' )
                self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis2. %s != %d' % (res, i) )

        # check if the hanging server recovered and joined as a slave
        time.sleep( 7 )
        role = util.get_role_of_server( m )
        self.assertEqual( role, c.ROLE_SLAVE, 'failed to join as a slave' )

        redis0 = redis_mgmt.Redis( m['id'] )
        ret = redis0.connect( m['ip'], m['redis_port'] )
        self.assertEquals( ret, 0, 'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']) )

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        # check new values
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write( cmd )
            redis0.read_until( '\r\n' )
            res = redis0.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis0. %s != %d' % (res[:-2], i) )

        # check consistency
        self.assertEqual(util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port), True, 'role consistency fail')

        return 0
Example #51
    def master_and_slave_hang( self ):
        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # set values
        for i in range( 0, 10000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        # get master, slave1, slave2
        if len(self.cluster['servers']) == 3:
            m, s1, s2 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )
            self.assertNotEqual( s2, None, 'slave2 is None.' )
        else:
            m, s1 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )

        util.log( 'server state before hang' )
        util.log_server_state( self.cluster )

        # hang
        smr_master = smr_mgmt.SMR( m['id'] )
        ret = smr_master.connect( m['ip'], m['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']) )
        smr_slave = smr_mgmt.SMR( s1['id'] )
        ret = smr_slave.connect( s1['ip'], s1['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (s1['ip'], s1['smr_mgmt_port']) )

        smr_master.write( 'fi delay sleep 1 10000\r\n' )
        reply = smr_master.read_until( '\r\n', 1 )
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual( 0, 1, 'make sure that smr has compiled with gcov option.' )

        smr_slave.write( 'fi delay sleep 1 10000\r\n' )

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        time.sleep( 5 )

        if len(self.cluster['servers']) == 3:
            # check forced master election: pgs2 must become and remain master
            success = True
            for i in range( 15 ):
                state = []
                util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'], state)
                s2_state = filter(lambda s: s['pgs_id'] == s2['id'], state)[0]
                role = s2_state['active_role']
                if role != 'M':
                    success = False
                    break
                time.sleep( 1 )

            util.log( '' )
            util.log( 'It expects that pgs2 is a master. PG.COPY: 3, PG.Q: 2' )
            util.log( '' )
            util.log_server_state( self.cluster )

            self.assertEqual( success, True, 'failed to check copy-quorum' )

            ok = False
            for i in xrange(10):
                ok = util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'])
                if ok:
                    break
            self.assertTrue( ok, 'Cluster state is not normal!' )

            redis2 = redis_mgmt.Redis( s2['id'] )
            ret = redis2.connect( s2['ip'], s2['redis_port'] )
            self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']) )

            # set new values
            for i in range( 10000, 20000 ):
                cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
                redis2.write( cmd )
                res = redis2.read_until( '\r\n' )
                self.assertEqual( res, '+OK\r\n', 'failed to set values to redis2. cmd:%s, res:%s' % (cmd[:-2], res) )

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        redis0 = redis_mgmt.Redis( m['id'] )
        ret = redis0.connect( m['ip'], m['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis0(%s:%d).' % (m['ip'], m['redis_port']) )

        redis1 = redis_mgmt.Redis( s1['id'] )
        ret = redis1.connect( s1['ip'], s1['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis1(%s:%d).' % (s1['ip'], s1['redis_port']) )

        if len(self.cluster['servers']) != 3:
            # set new values
            for i in range( 10000, 20000 ):
                cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
                redis0.write( cmd )
                res = redis0.read_until( '\r\n' )
                self.assertEqual( res, '+OK\r\n', 'failed to set values to redis0. cmd:%s, res:%s' % (cmd[:-2], res) )

        # check new values (m)
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write( cmd )
            redis0.read_until( '\r\n' )
            res = redis0.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (m['id'], res, i) )

        # check new values (s1)
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis1.write( cmd )
            redis1.read_until( '\r\n' )
            res = redis1.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (s1['id'], res[:-2], i) )

        # check consistency
        self.assertEqual(util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port), True, 'role consistency fail')

        return 0
Example #52
    def start_load_generator(self, num):
        for i in range(num):
            ip, port = util.get_rand_gateway(self.cluster)
            self.load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
            self.load_gen_thrd_list[i].start()
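
Example #52 shows only the start half; its counterpart, used at the end of most tests here, quits every generator, joins it, and asserts consistency. A sketch of that matching helper:

    # Sketch of the matching stop helper: quit all generators first so
    # they wind down concurrently, then join each one and verify
    # consistency, as done inline throughout these examples.
    def stop_load_generator(self, num):
        for i in range(num):
            self.load_gen_thrd_list[i].quit()
        for i in range(num):
            self.load_gen_thrd_list[i].join()
            self.assertTrue(self.load_gen_thrd_list[i].isConsistent(),
                            'Inconsistent load_generator %d' % i)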
Example #53
    def test_all_pgs_hang( self ):
        util.print_frame()

        self.setup_test_cluster( self.cluster_3copy )

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # set values
        for i in range( 0, 10000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss( self.cluster )
        self.assertNotEqual( m, None, 'master is None.' )
        self.assertNotEqual( s1, None, 'slave1 is None.' )
        self.assertNotEqual( s2, None, 'slave2 is None.' )

        util.log( 'server state before hang' )
        util.log_server_state( self.cluster )

        # hang
        smr_master = smr_mgmt.SMR( m['id'] )
        ret = smr_master.connect( m['ip'], m['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']) )
        smr_slave1 = smr_mgmt.SMR( s1['id'] )
        ret = smr_slave1.connect( s1['ip'], s1['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (s1['ip'], s1['smr_mgmt_port']) )
        smr_slave2 = smr_mgmt.SMR( s2['id'] )
        ret = smr_slave2.connect( s2['ip'], s2['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (s2['ip'], s2['smr_mgmt_port']) )

        m_ts = util.get_timestamp_of_pgs( m )
        s1_ts = util.get_timestamp_of_pgs( s1 )
        s2_ts = util.get_timestamp_of_pgs( s2 )

        smr_master.write( 'fi delay sleep 1 8000\r\n' )
        reply = smr_master.read_until( '\r\n', 1 )
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual( 0, 1, 'make sure that smr has compiled with gcov option.' )

        smr_slave1.write( 'fi delay sleep 1 8000\r\n' )
        smr_slave2.write( 'fi delay sleep 1 8000\r\n' )

        time.sleep( 10 )

        # check consistency
        ok = False
        for try_cnt in xrange(20):
            ok = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port)
            if ok:
                break
            time.sleep(0.5)
        self.assertTrue(ok, 'Unstable cluster state')

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        redis0 = redis_mgmt.Redis( m['id'] )
        ret = redis0.connect( m['ip'], m['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']) )

        # set values
        for i in range( 10000, 20000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis0.write( cmd )
            res = redis0.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        redis1 = redis_mgmt.Redis( s1['id'] )
        ret = redis1.connect( s1['ip'], s1['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']) )

        redis2 = redis_mgmt.Redis( s2['id'] )
        ret = redis2.connect( s2['ip'], s2['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']) )

        # check new values (m)
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write( cmd )
            redis0.read_until( '\r\n' )
            res = redis0.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (m['id'], res, i) )

        # check new values (s1)
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis1.write( cmd )
            redis1.read_until( '\r\n' )
            res = redis1.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (s1['id'], res[:-2], i) )

        # check new values (s2)
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write( cmd )
            redis2.read_until( '\r\n' )
            res = redis2.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (s2['id'], res[:-2], i) )

        # check consistency
        ok = False
        for try_cnt in range(0, 10):
            ok = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port)
            print ok
            if ok:
                break
            time.sleep(1)
        self.assertEqual(ok, True, 'role consistency fail')

        return 0
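
The retry loop around util.check_cluster recurs in several examples (here and in Example #45). Extracted into one small wrapper:

# Sketch: retry util.check_cluster until the cluster settles or the
# attempts run out, as done inline in several examples above.
def wait_until_cluster_stable(cluster_name, mgmt_ip, mgmt_port,
                              max_try=20, interval=0.5):
    for i in range(max_try):
        if util.check_cluster(cluster_name, mgmt_ip, mgmt_port):
            return True
        time.sleep(interval)
    return False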
Example #54
    def test_4_role_change_with_failover(self):
        util.print_frame()

        loop_cnt = 0
        while loop_cnt < 5:
            util.log('')
            util.log('Loop:%d' % loop_cnt)

            util.log("States (before role change)")
            util.log_server_state(self.cluster)

            target = random.choice(self.cluster['servers'])

            # bgsave
            ret = util.bgsave(target)
            self.assertTrue(ret, 'failed to bgsave. pgs:%d' % target['id'])

            # shutdown
            util.log('shutdown pgs%d(%s:%d)' % (target['id'], target['ip'], target['smr_base_port']))
            ret = testbase.request_to_shutdown_smr( target )
            self.assertEqual( ret, 0, 'failed to shutdown smr' )

            ret = testbase.request_to_shutdown_redis( target )
            self.assertEquals( ret, 0, 'failed to shutdown redis' )

            running_servers = []
            for s in self.cluster['servers']:
                if s != target:
                    running_servers.append(s)

            # Get old timestamp
            old_timestamps = {}
            for s in running_servers:
                ts = util.get_timestamp_of_pgs(s)
                old_timestamps[s['id']] = ts

            # Start load generator
            self.load_gen_list = {}
            util.log('start load generator')
            for i in range(self.max_load_generator):
                ip, port = util.get_rand_gateway(self.cluster)
                load_gen = load_generator.LoadGenerator(i, ip, port)
                load_gen.start()
                self.load_gen_list[i] = load_gen

            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')

            # Role change
            master_id = util.role_change(self.leader_cm, self.cluster['cluster_name'], s1['id'])
            self.assertNotEqual(master_id, -1, 'role_change failed')

            util.log("States (after role change)")
            util.log_server_state(self.cluster)

            # Check - get new timestamp
            new_timestamps = {}
            for s in running_servers:
                ts = util.get_timestamp_of_pgs( s )
                new_timestamps[s['id']] = ts

            # Check - compare old timestamps and new timestamps
            for s in running_servers:
                old_ts = old_timestamps[s['id']]
                new_ts = new_timestamps[s['id']]
                self.assertNotEqual(old_ts, new_ts, 'Timestamp of a running server has not changed. %d->%d' % (old_ts, new_ts))

            # Check quorum
            m = self.cluster['servers'][master_id]
            expected = 1
            ok = self.__check_quorum(m, expected)
            self.assertTrue(ok, 'unexpected quorum(after role change). expected:%s' % (expected))

            # recovery
            util.log('recovery pgs%d(%s:%d)' % (target['id'], target['ip'], target['smr_base_port']))
            ret = testbase.request_to_start_smr( target )
            self.assertEqual( ret, 0, 'failed to start smr' )
            util.log('start smr-replicator done')

            ret = testbase.request_to_start_redis( target, 60 )
            self.assertEqual( ret, 0, 'failed to start redis' )
            util.log('start redis-arc done')

            ret = testbase.wait_until_finished_to_set_up_role( target, max_try=300)
            self.assertEquals( ret, 0, 'failed to role change. smr_id:%d' % (target['id']) )

            util.log("States (after recovery)")
            util.log_server_state(self.cluster)

            # Check quorum
            expected = 1
            ok = self.__check_quorum(m, expected)
            self.assertTrue(ok, 'unexpected quorum(after recovery). expected:%s' % (expected))

            # Check consistency
            util.log('stop load generator')
            for i in range(self.max_load_generator):
                self.load_gen_list[i].quit()
            for i in range(self.max_load_generator):
                self.load_gen_list[i].join()
                self.assertTrue(self.load_gen_list[i].isConsistent(), 'Inconsistent after role change')
                self.load_gen_list.pop(i, None)

            loop_cnt += 1

        return 0
Example #55
    def slave_failover_while_hang( self ):
        util.print_frame()

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # set values
        for i in range( 0, 10000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        # get master, slave1, slave2
        if len(self.cluster['servers']) == 3:
            m, s1, s2 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )
            self.assertNotEqual( s2, None, 'slave2 is None.' )
        else:
            m, s1 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )

        util.log( 'server state before hang' )
        util.log_server_state( self.cluster )

        self.failover_while_hang( s1 )

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        redis1 = redis_mgmt.Redis( s1['id'] )
        ret = redis1.connect( s1['ip'], s1['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']) )

        # set new values
        for i in range( 10000, 20000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write( cmd )
            res = redis1.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res) )

        if len(self.cluster['servers']) == 3:
            redis2 = redis_mgmt.Redis( s2['id'] )
            ret = redis2.connect( s2['ip'], s2['redis_port'] )
            self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']) )

            # check new values
            for i in range( 10000, 20000 ):
                cmd = 'get %s%d\r\n' % (self.key_base, i)
                redis2.write( cmd )
                redis2.read_until( '\r\n' )
                res = redis2.read_until( '\r\n' )
                self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis2. %s != %d' % (res, i) )
            util.log( 'succeeded : check values with set/get operations with pgs%d and pgs%d.' % (s1['id'], s2['id']) )

        redis0 = redis_mgmt.Redis( m['id'] )
        ret = redis0.connect( m['ip'], m['redis_port'] )
        self.assertEquals( ret, 0, 'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']) )

        # check new values
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write( cmd )
            redis0.read_until( '\r\n' )
            res = redis0.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis0. %s != %d' % (res[:-2], i) )

        # check consistency
        self.assertEqual(util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port), True, 'role consistency fail')
        return 0
Example #56
    def test_5_transfer_pgs_to_another_machine(self):
        util.print_frame()

        self.load_gen_list = {}

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # increase master generation number
        util.log('failover in order to increase master generation number.')
        max = 0
        for i in range(5):
            key_base = 'key'
            for j in range(max, max+10000):
                cmd = 'set %s%d %d\r\n' % (key_base, j, j)
                gw.write( cmd )
                res = gw.read_until( '\r\n' )
                self.assertEquals( res, '+OK\r\n' )
            max = max + 10000

            m = util.get_server_by_role(self.cluster['servers'], 'master')
            util.log('failover pgs%d' %  m['id'])
            ret = util.failover(m, self.leader_cm)
            self.assertTrue(ret, 'failed to failover pgs%d' % m['id'])

        # start load generator
        util.log("start load_generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            self.load_gen_list[i] = load_generator.LoadGenerator(i, ip, port)
            self.load_gen_list[i].start()

        time.sleep(5) # generate load for 5 sec
        util.log("started load_generator")

        m, s1, s2 = util.get_mss(self.cluster)
        servers = [m, s1, s2]

        # bgsave
        for s in servers:
            ret = util.bgsave(s)
            self.assertTrue(ret, 'failed to bgsave. pgs%d' % s['id'])

        new_servers = [config.server4, config.server5]

        # add new slaves
        for s in new_servers:
            util.log('delete pgs%d`s check point.' % s['id'])
            util.del_dumprdb(s['id'])

            ret = util.cluster_util_getdump(s['id'], m['ip'], m['redis_port'], 'dump.rdb', 0, 8191)
            self.assertEqual(True, ret,
                'failed : util.cluster_util_getdump returns false, src=%s:%d dest_pgsid=%d' % (
                m['ip'], m['redis_port'], s['id']))

            ret = util.pgs_add(self.cluster, s, self.leader_cm, 0, rm_ckpt=False)
            self.assertEqual(True, ret, 'failed : util.pgs_add returns false, pgsid=%d' % s['id'])
            util.log('succeeded : add a new slave, pgsid=%d' % s['id'])

            # check consistency
            ok = True
            for j in range(self.max_load_generator):
                if self.load_gen_list[j].isConsistent() == False:
                    ok = False
                    break
            if not ok:
                break

        for server_to_del in servers:
            for s in servers:
                util.pingpong( s['ip'], s['smr_mgmt_port'] )
            for s in new_servers:
                util.pingpong( s['ip'], s['smr_mgmt_port'] )
            self.__del_server(server_to_del)
            util.log('succeeded : delete pgs%d' % server_to_del['id'])

        new_m = util.get_server_by_role(new_servers, 'master')
        new_s = util.get_server_by_role(new_servers, 'slave')
        self.assertNotEqual( new_m, None, 'master is None.' )
        self.assertNotEqual( new_s, None, 'slave is None.' )

        for s in new_servers:
            util.pingpong( s['ip'], s['smr_mgmt_port'] )

        time.sleep(5) # generate load for 5 sec
        # check consistency of load_generator
        for i in range(self.max_load_generator):
            self.load_gen_list[i].quit()
        for i in range(self.max_load_generator):
            self.load_gen_list[i].join()
            self.assertTrue(self.load_gen_list[i].isConsistent(), 'Inconsistent after migration')
            self.load_gen_list.pop(i, None)
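
test_5_transfer_pgs_to_another_machine health-checks each SMR with util.pingpong. The real implementation is not shown; below is a hypothetical sketch consistent with the '+PONG\r\n' replies checked elsewhere in these examples.

# Hypothetical sketch of a ping/pong health check against an SMR
# management (or redis) port; returns the raw reply or None on failure.
def pingpong(ip, port, timeout=3):
    try:
        tn = telnetlib.Telnet(ip, port, timeout)
        tn.write('ping\r\n')
        reply = tn.read_until('\r\n', timeout)   # expect '+PONG\r\n'
        tn.close()
        return reply
    except Exception:
        return None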
Example #57
    def test_1_role_change(self):
        util.print_frame()

        self.load_gen_list = {}

        # Start load generator
        util.log("Start load_generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen = load_generator.LoadGenerator(i, ip, port)
            load_gen.start()
            self.load_gen_list[i] = load_gen

        # Loop (smr: 3 copy)
        target_server = util.get_server_by_role(self.cluster['servers'], 'slave')
        self.assertNotEquals(target_server, None, 'Get slave fail.')
        target = target_server['id']
        for i in range(30):
            print ''
            util.log("(3 copy) Loop:%d, target pgs:%d" % (i, target))

            # Get old timestamp
            util.log_server_state( self.cluster )
            old_timestamp_list = []
            for s in self.cluster['servers']:
                ts = util.get_timestamp_of_pgs( s )
                old_timestamp_list.append(ts)

            # Role change
            master = util.role_change(self.leader_cm, self.cluster['cluster_name'], target)
            self.assertNotEqual(master, -1, 'role_change error.')
            while target == master:
                target = (target + 1) % 3
            util.log('Change role success.')

            # Wait until role change finished
            for s in self.cluster['servers']:
                max_try_cnt = 20
                ok = False
                for try_cnt in range(max_try_cnt):
                    pong = util.pingpong(s['ip'], s['redis_port'])
                    if pong == '+PONG\r\n':
                        ok = True
                        break
                    time.sleep(0.1)
                self.assertTrue(ok, 'redis state error.')

            # Get new timestamp
            util.log_server_state( self.cluster )
            new_timestamp_list = []
            for s in self.cluster['servers']:
                ts = util.get_timestamp_of_pgs( s )
                new_timestamp_list.append(ts)

            # Compare old timestamps and new timestamps
            for idx in range(3):
                self.assertNotEqual(old_timestamp_list[idx], new_timestamp_list[idx],
                    'timestamp did not change after role change. %d->%d' % (old_timestamp_list[idx], new_timestamp_list[idx]))

            # Check consistency
            for load_gen_id, load_gen in self.load_gen_list.items():
                self.assertTrue(load_gen.isConsistent(), 'Data inconsistency after role_change')

        # Loop (smr: 2 copy)
        self.__del_server(self.cluster['servers'][0])
        servers = [self.cluster['servers'][1], self.cluster['servers'][2]]
        s = util.get_server_by_role(servers, 'slave')
        target = s['id']
        for i in range(30):
            print ''
            util.log("(2 copy) Loop:%d, target pgs:%d" % (i, target))

            # Get old timestamp
            util.log_server_state( self.cluster )
            old_timestamp_list = []
            for s in servers:
                ts = util.get_timestamp_of_pgs( s )
                old_timestamp_list.append(ts)

            # Role change
            master = util.role_change(self.leader_cm, self.cluster['cluster_name'], target)
            self.assertNotEqual(master, -1, 'role_change error.')
            while target == master:
                target = target % 2 + 1
            util.log('Change role success.')

            # Wait until role change finished
            for s in servers:
                max_try_cnt = 20
                ok = False
                for try_cnt in range(max_try_cnt):
                    pong = util.pingpong(s['ip'], s['redis_port'])
                    if pong == '+PONG\r\n':
                        ok = True
                        break
                    time.sleep(0.1)
                self.assertTrue(ok, 'redis state error.')

            # Get new timestamp
            util.log_server_state( self.cluster )
            new_timestamp_list = []
            for s in servers:
                ts = util.get_timestamp_of_pgs( s )
                new_timestamp_list.append(ts)

            # Compare old timestamps and new timestamps
            for idx in range(2):
                self.assertNotEqual(old_timestamp_list[idx], new_timestamp_list[idx],
                    'timestamp did not change after role change. %d->%d' % (old_timestamp_list[idx], new_timestamp_list[idx]))

            # Check consistency
            for load_gen_id, load_gen in self.load_gen_list.items():
                self.assertTrue(load_gen.isConsistent(), 'Data inconsistency after role_change')
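
The "wait until role change finished" polling loop above is written out
twice, once for the 3-copy and once for the 2-copy case. A small helper
would remove the duplication; this is a sketch assuming util.pingpong
returns the raw reply string, as the loops above treat it:

    def wait_for_pong(server, max_try_cnt=20, interval=0.1):
        # Poll the redis port until it answers +PONG or the budget runs out.
        for _ in range(max_try_cnt):
            if util.pingpong(server['ip'], server['redis_port']) == '+PONG\r\n':
                return True
            time.sleep(interval)
        return False

    # usage inside the loop body:
    #     self.assertTrue(wait_for_pong(s), 'redis state error.')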
Example #58
    def pgs_add_and_del(self, upgrade_server, type):
        util.print_frame()

        util.log('[start] add and del pgs%d. type:%s' %
                 (upgrade_server['id'], type))
        util.log_server_state(self.cluster)

        # start load generator
        load_gen_list = {}
        for i in range(len(self.cluster['servers'])):
            server = self.cluster['servers'][i]
            load_gen = load_generator.LoadGenerator(server['id'], server['ip'],
                                                    server['gateway_port'])
            load_gen.start()
            load_gen_list[i] = load_gen

        # detach pgs from cluster
        cmd = 'pgs_leave %s %d\r\n' % (upgrade_server['cluster_name'],
                                       upgrade_server['id'])
        ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'],
                              cmd)
        jobj = json.loads(ret)
        self.assertEqual(
            jobj['msg'], '+OK',
            'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
        util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))

        # set new values
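        # (these writes land while the pgs is detached; after pgs_join the
        #  rejoined replica is expected to catch them up from the log, which
        #  the get loop below verifies)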
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway('0')
        gw.connect(ip, port)
        for i in range(0, 50):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values to gw(%s:%d). cmd:%s, res:%s' %
                (ip, port, cmd[:-2], res[:-2]))

        # attach pgs from cluster
        cmd = 'pgs_join %s %d\r\n' % (upgrade_server['cluster_name'],
                                      upgrade_server['id'])
        ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'],
                              cmd)
        jobj = json.loads(ret)
        self.assertEqual(jobj['msg'], '+OK',
                         'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
        util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
        time.sleep(3) # give the rejoined pgs a moment to settle

        # check new values
        redis = redis_mgmt.Redis(upgrade_server['id'])
        ret = redis.connect(upgrade_server['ip'], upgrade_server['redis_port'])
        self.assertEqual(
            ret, 0, 'failed : connect to redis%d(%s:%d)' %
            (upgrade_server['id'], upgrade_server['ip'],
             upgrade_server['redis_port']))

        for i in range(0, 50):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis.write(cmd)
            redis.read_until('\r\n')
            res = redis.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis%d. %s != %d' %
                (upgrade_server['id'], res, i))
        util.log('succeeded : check values with get operations on pgs%d.' %
                 (upgrade_server['id']))

        # shutdown load generators
        for i in range(len(load_gen_list)):
            load_gen_list[i].quit()
            load_gen_list[i].join()

        util.log_server_state(self.cluster)

        return 0
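
A hedged usage sketch: the actual test entry points are not shown in this
excerpt, but they would presumably call pgs_add_and_del once per role and
component type, e.g.:

    # hypothetical caller; real test names and type values may differ
    def test_upgrade_master_pgs(self):
        util.print_frame()
        upgrade_server = util.get_server_by_role(self.cluster['servers'], 'master')
        self.pgs_add_and_del(upgrade_server, 'smr')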
Example #59
    def role_change_with_hanging_pgs(self, hanging_servers, running_servers, target_id, master):
        util.log('hanging_servers:%s' % hanging_servers)
        util.log('running_servers:%s' % running_servers)
        util.log('target_id:%s' % target_id)

        # Initial data
        util.put_some_data(self.cluster, 3, 10)

        util.log("States (before role change)")
        util.log_server_state(self.cluster)

        # Get old timestamp
        old_timestamps = {}
        for s in self.cluster['servers']:
            ts = util.get_timestamp_of_pgs(s)
            old_timestamps[s['id']] = ts

        # hang
        for s in hanging_servers:
            smr = smr_mgmt.SMR(s['id'])
            ret = smr.connect(s['ip'], s['smr_mgmt_port'])
            self.assertEqual(ret, 0, 'failed to connect to smr. %s:%d' % (s['ip'], s['smr_mgmt_port']))
            util.log("PGS '%d' hang" % s['id'])

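            # 'fi delay sleep 1 13000' is presumably smr's fault-injection
            # command: it injects a 13000 ms sleep so the pgs appears hung.
            # Builds without fault injection reply '-ERR not supported',
            # which the check below turns into a test failure.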
            smr.write('fi delay sleep 1 13000\r\n')
            reply = smr.read_until('\r\n', 1)
            if reply is not None and reply.find('-ERR not supported') != -1:
                self.fail('smr must be compiled with the gcov option; this build does not support fault injection.')
            smr.disconnect()

        # Role change
        master_id = util.role_change(self.leader_cm, self.cluster['cluster_name'], target_id)
        self.assertEqual(master_id, -1, 'expected role_change to fail, but it succeeded')

        # Check rollback - check quorum
        if master not in hanging_servers:
            expected = 1
            ok = self.__check_quorum(master, expected)
            self.assertTrue(ok, 'quorum check failed after rollback. expected:%s' % (expected))

        # Check rollback - get new timestamp
        new_timestamps_in_hang = {}
        for s in running_servers:
            ts = util.get_timestamp_of_pgs( s )
            new_timestamps_in_hang[s['id']] = ts

        # Check rollback - compare old timestamps and new timestamps
        for s in running_servers:
            old_ts = old_timestamps[s['id']]
            new_ts = new_timestamps_in_hang[s['id']]
            self.assertEqual(old_ts, new_ts, 'Timestamp of a running server has changed. %d->%d' % (old_ts, new_ts))

        time.sleep(16) # wait out the injected 13-second delay before checking final state
        util.log("States (after role change)")
        util.log_server_state( self.cluster )

        self.load_gen_list = {}

        # Start load generator
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen = load_generator.LoadGenerator(i, ip, port)
            load_gen.start()
            self.load_gen_list[i] = load_gen

        # Check quorum
        if master in hanging_servers:
            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')
            self.assertNotEqual(s2, None, 'slave2 is None.')

            expected = 1
            ok = self.__check_quorum(m, expected)
            self.assertTrue(ok, 'quorum check failed after rollback. expected:%s' % (expected))

        # Get new timestamp
        new_timestamps = {}
        for s in self.cluster['servers']:
            ts = util.get_timestamp_of_pgs( s )
            new_timestamps[s['id']] = ts

        # Compare old timestamps and new timestamps
        for s in self.cluster['servers']:
            old_ts = old_timestamps[s['id']]
            new_ts = new_timestamps[s['id']]
            if master in hanging_servers and len(running_servers) != 0:
                self.assertNotEqual(old_ts, new_ts, 'Timestamp of a hanging server has not changed. %d->%d' % (old_ts, new_ts))
            else:
                self.assertEqual(old_ts, new_ts, 'Timestamp of a running server has changed. %d->%d' % (old_ts, new_ts))

        # Check consistency
        for i in range(self.max_load_generator):
            self.load_gen_list[i].quit()
        for i in range(self.max_load_generator):
            self.load_gen_list[i].join()
            self.assertTrue(self.load_gen_list[i].isConsistent(), 'Inconsistent after role change')
            self.load_gen_list.pop(i, None)
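
__check_quorum is referenced above but not shown in this excerpt. A
hypothetical sketch, assuming the smr management port answers a
'getquorum' command with the current quorum value:

    def __check_quorum(self, master, expected):
        # Ask the master's management port for its quorum and compare.
        smr = smr_mgmt.SMR(master['id'])
        if smr.connect(master['ip'], master['smr_mgmt_port']) != 0:
            return False
        smr.write('getquorum\r\n')  # assumed admin command
        reply = smr.read_until('\r\n', 3)
        smr.disconnect()
        try:
            return int(reply.strip()) == expected
        except (AttributeError, ValueError):
            return False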
Example #60
    def failure_recovery(self, role, wait_count=10, redis_only=False):
        time.sleep(2)

        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # set value
        key = 'new_key_haha'
        cmd = 'set %s 12345\r\n' % (key)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n')

        # shutdown
        server = util.get_server_by_role(self.cluster['servers'], role)

        if not redis_only:
            ret = testbase.request_to_shutdown_smr(server)
            self.assertEqual(ret, 0, 'failed to shutdown smr')

        ret = testbase.request_to_shutdown_redis(server)
        self.assertEquals(ret, 0, 'failed to shutdown redis')

        # check state F
        max_try = 20
        expected = 'F'
        for i in range(0, max_try):
            state = util.get_smr_state(server, self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        self.assertEquals(
            expected, state, 'server%d - state:%s, expected:%s' %
            (server['id'], state, expected))

        # set value
        check_value = '54321'
        cmd = 'set %s %s\r\n' % (key, check_value)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n')
        gw.disconnect()

        # recovery
        if not redis_only:
            ret = testbase.request_to_start_smr(server)
            self.assertEqual(ret, 0, 'failed to start smr')

        ret = testbase.request_to_start_redis(server)
        self.assertEqual(ret, 0, 'failed to start redis')

        ret = testbase.wait_until_finished_to_set_up_role(server, wait_count)
        self.assertEquals(ret, 0,
                          'failed to role change. smr_id:%d' % (server['id']))

        redis = redis_mgmt.Redis(server['id'])
        ret = redis.connect(server['ip'], server['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')

        # check state N
        max_try = 20
        expected = 'N'
        for i in range(0, max_try):
            state = util.get_smr_state(server, self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        role = util.get_role_of_server(server)
        self.assertEquals(
            expected, state, 'server%d - state:%s, expected:%s, role:%s' %
            (server['id'], state, expected, role))

        # check value
        cmd = 'get %s\r\n' % (key)
        redis.write(cmd)
        redis.read_until('\r\n')
        response = redis.read_until('\r\n')
        self.assertEqual(response, '%s\r\n' % (check_value),
                         'inconsistent %s, %s' % (response, check_value))
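
As with pgs_add_and_del above, failure_recovery reads like a shared body
for thin per-role tests; a hypothetical sketch of such callers:

    # hypothetical callers; actual test names may differ
    def test_failure_recovery_of_master(self):
        util.print_frame()
        self.failure_recovery('master')

    def test_failure_recovery_of_slave_redis_only(self):
        util.print_frame()
        self.failure_recovery('slave', redis_only=True)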