Пример #1
0
    def test_fd_leak(self):
        util.print_frame()

        server = util.get_server_by_role_and_pg(self.cluster['servers'], 'master', 0)
        redis = redis_mgmt.Redis(server['id'])
        ret = redis.connect(server['ip'], server['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')
        smr = smr_mgmt.SMR(server['id'])
        ret = smr.connect(server['ip'], server['smr_mgmt_port'])
        self.assertEquals(ret, 0, 'failed to connect to smr')

        redis.write('info server\r\n')
        res = redis.read_until('process_id:')
        res = redis.read_until('\r\n')
        redis.write('quit\r\n')

        pid = copy.copy(res[:-2])
        num1 = self.numOpenFds(pid)
        print "Initial : Open Fds: %s" % self.numOpenFds(pid)

        smr.write('fi delay sleep 1 1000000\r\n')
        smr.read_until('\r\n')

        for i in range(5):
            ret = redis.connect(server['ip'], server['redis_port'])
            self.assertEquals(ret, 0, 'failed to connect to redis')
            redis.write('ping\r\n')
            res = redis.read_until('\r\n', 1)
            print "Try Ping : Open Fds: %s" % self.numOpenFds(pid)
            redis.disconnect()
            print "Disconnect : Open Fds: %s" % self.numOpenFds(pid)

            ret = redis.connect(server['ip'], server['redis_port'])
            self.assertEquals(ret, 0, 'failed to connect to redis')
            redis.write('*1\r\nasdf\r\n')
            time.sleep(1)
            res = redis.read_until('\r\n', 1)
            print "Protocol Error : Open Fds: %s" % self.numOpenFds(pid)
            redis.disconnect()
            print "Disconnect : Open Fds: %s" % self.numOpenFds(pid)

        print "End : Open Fds: %s" % self.numOpenFds(pid)

        num2 = self.numOpenFds(pid)
        self.assertEquals(num1, num2)

        # Go back to initial configuration
        self.assertTrue(util.shutdown_pgs(server, self.cluster['servers'][0]),
                'recover pgs fail. (shutdown_pgs)')
        self.assertTrue(util.recover_pgs(server, self.cluster['servers'][0]),
                'recover pgs fail. (shutdown_pgs)')
Пример #2
0
    def test_migration_with_expire_command(self):
        util.print_frame()

        util.log("start load_generator")
        load_gen_thrd_list = {}
        for i in range(1):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
            load_gen_thrd_list[i].start()

        time.sleep(5) # generate load for 5 sec
        tps = 20000
        src_pg_id = 0
        dst_pg_id = 1
        leader_cm = self.cluster['servers'][0]
        src_master = util.get_server_by_role_and_pg(self.cluster['servers'], 'master', src_pg_id)
        dst_master = util.get_server_by_role_and_pg(self.cluster['servers'], 'master', dst_pg_id)

        smr = smr_mgmt.SMR(src_master['id'])
        ret = smr.connect(src_master['ip'], src_master['smr_mgmt_port'])
        if ret != 0:
            util.log('failed to connect to smr(source master)')
            return False

        src_redis = redis_mgmt.Redis(src_master['id'])
        ret = src_redis.connect(src_master['ip'], src_master['redis_port'] )
        self.assertEquals( ret, 0, 'failed to connect to redis' )

        dst_redis = redis_mgmt.Redis(dst_master['id'])
        ret = dst_redis.connect(dst_master['ip'], dst_master['redis_port'] )
        self.assertEquals( ret, 0, 'failed to connect to redis' )

        ts = time.time()
        self.setExpireKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:expired', 10)
        self.setExpireKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:persist', 20)
        self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired', 10)
        self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist', 20)

        self.setExpireS3Key(src_redis, 'S3:PermanentKey', 0)

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")

        util.log(">>> migrate test with expire command start(%s), ts:%d" % (time.asctime(), ts))

        ts = time.time()
        self.setExpireKey(src_redis, 'beforeCheckpoint~afterCheckpoint:expired', 10)
        self.setExpireKey(src_redis, 'beforeCheckpoint~afterCheckpoint:persist', 20)
        self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired', 10)
        self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist', 20)

        # notify dst_redis of migration start
        util.log(">>> notify dst_redis of migration start (%s)" % time.asctime())

        cmd = 'migconf migstart %d-%d\r\n' % (0, 8191)
        dst_redis.write(cmd)
        res = dst_redis.read_until('\r\n')
        self.assertEquals( res, '+OK\r\n' )

        # remote partial checkpoint
        util.log(">>> start remote checkpoint and load (%s)" % time.asctime())
        cmd = "./cluster-util --getandplay %s %d %s %d %d-%d %d" % (
                    src_master['ip'], src_master['redis_port'],
                    dst_master['ip'], dst_master['redis_port'],
                    0, 8191, tps)
        p = util.exec_proc_async(util.cluster_util_dir(src_master['id']), cmd, True, None, subprocess.PIPE, None)

        ret = p.wait()
        for line in p.stdout:
            if line.find("Checkpoint Sequence Number:") != -1:
                util.log("seqnumber : " + line[line.rfind(":")+1:])
                seq = int(line[line.rfind(":")+1:])
            util.log(">>>" + str(line.rstrip()))

        self.assertEqual(0, ret)
        util.log(">>> end remote checkpoint and load (%s)" % time.asctime())

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(src_redis, 'beforeCheckpoint~afterCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis, 'beforeCheckpoint~afterCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")

        # bgsave for testing later about recovery during migration
        util.log(">>> bgsave for testing later about recovery during migration (%s)" % time.asctime())
        cmd = 'bgsave\r\n'
        dst_redis.write(cmd)
        res = dst_redis.read_until('\r\n')
        self.assertEquals( res, '+Background saving started\r\n' )

        ts = time.time()
        self.setExpireKey(src_redis, 'afterCheckpoint~afterCheckpoint:expired', 10)
        self.setExpireKey(src_redis, 'afterCheckpoint~afterCheckpoint:persist', 20)
        self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:expired', 10)
        self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:persist', 20)

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(src_redis, 'afterCheckpoint~afterCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis, 'afterCheckpoint~afterCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        self.setExpireKey(src_redis, 'afterCheckpoint~duringCatchup:expired', 10)
        self.setExpireKey(src_redis, 'afterCheckpoint~duringCatchup:persist', 100)
        self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:expired', 10)
        self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:persist', 100)

        # remote catchup (smr log migration)
        util.log(">>> start remote catchup (%s)" % time.asctime())

        dst_host = dst_master['ip']
        dst_smr_port = dst_master['smr_base_port']
        rle = '1 8192'
        num_part = 8192

        smr.write('migrate start %s %d %d %d %d %s\r\n' % (dst_host, dst_smr_port,
                                                     seq, tps, num_part, rle))
        response = smr.read_until('\r\n')
        if response[:3] != '+OK':
            util.log('failed to execute migrate start command, response:%s' % response)
            return False

        while True:
            smr.write('migrate info\r\n')
            response = smr.read_until('\r\n')
            seqs = response.split()
            logseq = int(seqs[1].split(':')[1])
            mig = int(seqs[2].split(':')[1])
            util.log('migrate info: %s' % response)
            if (logseq-mig < 500000):
                util.log('Remote catchup almost done. try mig2pc')
                break
            time.sleep(1)

        util.log(">>> sleep until 90 sec pass")
        self.assertFalse(time.time() - ts >= 90)
        time.sleep(90 - (time.time() - ts))

        res = self.persistKey(src_redis, 'afterCheckpoint~duringCatchup:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis, 'afterCheckpoint~duringCatchup:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        self.setExpireKey(src_redis, 'duringCatchup~duringCatchup:expired', 10)
        self.setExpireKey(src_redis, 'duringCatchup~duringCatchup:persist', 20)
        self.setExpireS3Key(src_redis, 'S3:duringCatchup~duringCatchup:expired', 10)
        self.setExpireS3Key(src_redis, 'S3:duringCatchup~duringCatchup:persist', 20)

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(src_redis, 'duringCatchup~duringCatchup:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis, 'duringCatchup~duringCatchup:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(src_redis, 'S3:duringCatchup~duringCatchup:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(src_redis, 'S3:duringCatchup~duringCatchup:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        self.setExpireKey(src_redis, 'duringCatchup~afterMig2pc:expired', 10)
        self.setExpireKey(src_redis, 'duringCatchup~afterMig2pc:persist', 20)
        self.setExpireS3Key(src_redis, 'S3:duringCatchup~afterMig2pc:expired', 10)
        self.setExpireS3Key(src_redis, 'S3:duringCatchup~afterMig2pc:persist', 20)

        util.log(">>> remote catchup phase almost done (%s)" % time.asctime())

        # mig2pc
        util.log(">>> start mig2pc (%s)" % time.asctime())

        cmd = 'mig2pc %s %d %d %d %d' % (self.cluster['cluster_name'], src_pg_id, dst_pg_id,
                                         0, 8191)
        result = util.cm_command(leader_cm['ip'], leader_cm['cm_port'], cmd)
        util.log('mig2pc result : ' + result)
        if not result.startswith('{"state":"success","msg":"+OK"}\r\n'):
            util.log('failed to execute mig2pc command, result:%s' % result)
            return False

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(dst_redis, 'duringCatchup~afterMig2pc:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(dst_redis, 'duringCatchup~afterMig2pc:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(dst_redis, 'S3:duringCatchup~afterMig2pc:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(dst_redis, 'S3:duringCatchup~afterMig2pc:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        self.setExpireKey(dst_redis, 'afterMig2pc~migrateEnd:expired', 10)
        self.setExpireKey(dst_redis, 'afterMig2pc~migrateEnd:persist', 20)
        self.setExpireS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:expired', 10)
        self.setExpireS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:persist', 20)

        # finish migration
        smr.write('migrate interrupt\r\n')
        response = smr.read_until('\r\n')
        util.log('migrate interrupt: %s' % response)
        smr.disconnect()

        # notify dst_redis of migration end
        util.log(">>> notify dst_redis of migration end (%s)" % time.asctime())

        cmd = 'migconf migend\r\n'
        dst_redis.write(cmd)
        res = dst_redis.read_until('\r\n')
        self.assertEquals( res, '+OK\r\n' )

        cmd = 'migconf clearstart %d-%d\r\n' % (0, 8191)
        src_redis.write(cmd)
        res = src_redis.read_until('\r\n')
        self.assertEquals( res, '+OK\r\n' )

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(dst_redis, 'afterMig2pc~migrateEnd:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(dst_redis, 'afterMig2pc~migrateEnd:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        self.assertTrue(self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired'))

        self.assertTrue(self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:expired'))

        self.assertTrue(self.isExist(dst_redis, 'duringCatchup~duringCatchup:persist'))
        self.assertFalse(self.isExist(dst_redis, 'duringCatchup~duringCatchup:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:expired'))

        self.assertTrue(self.isExist(dst_redis, 'duringCatchup~afterMig2pc:persist'))
        self.assertFalse(self.isExist(dst_redis, 'duringCatchup~afterMig2pc:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:expired'))

        self.assertTrue(self.isExist(dst_redis, 'afterMig2pc~migrateEnd:persist'))
        self.assertFalse(self.isExist(dst_redis, 'afterMig2pc~migrateEnd:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:expired'))

        # remote partial checkpoint
        util.log(">>> start rangedel (%s)" % time.asctime())
        cmd = "./cluster-util --rangedel %s %d %d-%d %d" % (
                    src_master['ip'], src_master['redis_port'],
                    0, 8191, tps)
        p = util.exec_proc_async(util.cluster_util_dir(src_master['id']), cmd, True, None, subprocess.PIPE, None)
        ret = p.wait()

        for line in p.stdout:
            util.log(">>>" + str(line.rstrip()))

        cmd = 'migconf clearend\r\n'
        src_redis.write(cmd)
        res = src_redis.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n')

        time.sleep(5) # generate load for 5 sec
        # check consistency of load_generator
        for i in range(len(load_gen_thrd_list)):
            load_gen_thrd_list[i].quit()
        for i in range(len(load_gen_thrd_list)):
            load_gen_thrd_list[i].join()
            self.assertTrue(load_gen_thrd_list[i].isConsistent(), 'Inconsistent after migration')

        # kill dst_redis and recover from bgsave
        util.log(">>> kill dst_redis and recover from bgsave (%s)" % time.asctime())

        dst_redis.disconnect()
        ret = testbase.request_to_shutdown_redis(dst_master)
        self.assertEquals( ret, 0, 'failed to shutdown redis' )
        ret = testbase.request_to_shutdown_smr(dst_master)
        self.assertEquals(ret, 0, 'failed to shutdown smr')
        time.sleep(5)

        testbase.request_to_start_smr(dst_master)
        self.assertEqual( ret, 0, 'failed to start smr, server:%d' % dst_master['id'] )

        ret = testbase.request_to_start_redis(dst_master)
        self.assertEqual( ret, 0, 'failed to start redis, server:%d' % dst_master['id']  )

        ret = testbase.wait_until_finished_to_set_up_role(dst_master)
        self.assertEquals( ret, 0, 'failed to role change. server:%d' % (dst_master['id']) )

        dst_redis = redis_mgmt.Redis(dst_master['id'])
        ret = dst_redis.connect(dst_master['ip'], dst_master['redis_port'] )
        self.assertEquals( ret, 0, 'failed to connect to redis' )

        self.assertTrue(self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired'))

        self.assertTrue(self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:expired'))

        self.assertTrue(self.isExist(dst_redis, 'duringCatchup~duringCatchup:persist'))
        self.assertFalse(self.isExist(dst_redis, 'duringCatchup~duringCatchup:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:expired'))

        self.assertTrue(self.isExist(dst_redis, 'duringCatchup~afterMig2pc:persist'))
        self.assertFalse(self.isExist(dst_redis, 'duringCatchup~afterMig2pc:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:expired'))

        self.assertTrue(self.isExist(dst_redis, 'afterMig2pc~migrateEnd:persist'))
        self.assertFalse(self.isExist(dst_redis, 'afterMig2pc~migrateEnd:expired'))
        self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:persist'))
        self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:expired'))

        self.getS3TTL(dst_redis, 'S3:PermanentKey')

        # kill dst_slave redis and recover without dump file
        util.log(">>> kill dst_redis and recover without dump file (%s)" % time.asctime())
        dst_slave = util.get_server_by_role_and_pg(self.cluster['servers'], 'slave', dst_pg_id)

        ret = testbase.request_to_shutdown_redis(dst_slave)
        self.assertEquals( ret, 0, 'failed to shutdown redis' )
        ret = testbase.request_to_shutdown_smr(dst_slave)
        self.assertEquals(ret, 0, 'failed to shutdown smr')
        time.sleep(5)

        testbase.request_to_start_smr(dst_slave)
        self.assertEqual( ret, 0, 'failed to start smr, server:%d' % dst_slave['id'] )

        ret = testbase.request_to_start_redis(dst_slave)
        self.assertEqual( ret, 0, 'failed to start redis, server:%d' % dst_slave['id']  )

        ret = testbase.wait_until_finished_to_set_up_role(dst_slave)
        self.assertEquals( ret, 0, 'failed to role change. server:%d' % (dst_slave['id']) )

        dst_redis_slave = redis_mgmt.Redis(dst_slave['id'])
        ret = dst_redis_slave.connect(dst_slave['ip'], dst_slave['redis_port'] )
        self.assertEquals( ret, 0, 'failed to connect to redis' )

        self.assertTrue(self.isExist(dst_redis_slave, 'beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(self.isExist(dst_redis_slave, 'beforeCheckpoint~beforeCheckpoint:expired'))
        self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~beforeCheckpoint:expired'))

        self.assertTrue(self.isExist(dst_redis_slave, 'beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(self.isExist(dst_redis_slave, 'beforeCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(self.isExist(dst_redis_slave, 'afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(self.isExist(dst_redis_slave, 'afterCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(self.isExist(dst_redis_slave, 'afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(self.isExist(dst_redis_slave, 'afterCheckpoint~duringCatchup:expired'))
        self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~duringCatchup:expired'))

        self.assertTrue(self.isExist(dst_redis_slave, 'duringCatchup~duringCatchup:persist'))
        self.assertFalse(self.isExist(dst_redis_slave, 'duringCatchup~duringCatchup:expired'))
        self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~duringCatchup:persist'))
        self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~duringCatchup:expired'))

        self.assertTrue(self.isExist(dst_redis_slave, 'duringCatchup~afterMig2pc:persist'))
        self.assertFalse(self.isExist(dst_redis_slave, 'duringCatchup~afterMig2pc:expired'))
        self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~afterMig2pc:persist'))
        self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~afterMig2pc:expired'))

        self.assertTrue(self.isExist(dst_redis_slave, 'afterMig2pc~migrateEnd:persist'))
        self.assertFalse(self.isExist(dst_redis_slave, 'afterMig2pc~migrateEnd:expired'))
        self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:afterMig2pc~migrateEnd:persist'))
        self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:afterMig2pc~migrateEnd:expired'))

        self.getS3TTL(dst_redis_slave, 'S3:PermanentKey')

        # Go back to initial configuration
        self.assertTrue(util.migration(self.cluster, dst_pg_id, src_pg_id, 0, 8191, 40000),
                'failed to rollback migration')
Пример #3
0
    def test_5_mgmt_is_isolated_with_lconn(self):
        util.print_frame()

        util.iptables_print_list()

        cluster = filter(lambda x: x['cluster_name'] == 'network_isolation_cluster_1', config.clusters)[0]
        util.log(util.json_to_str(cluster))

        self.leader_cm = cluster['servers'][0]

        # MGMT
        mgmt_ip = cluster['servers'][0]['real_ip']
        mgmt_port = cluster['servers'][0]['cm_port']

        # Create cluster
        conf_checker = default_cluster.initialize_starting_up_smr_before_redis( cluster )
        self.assertIsNotNone(conf_checker, 'failed to initialize cluster')

        # Print initial state of cluster
        util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
        initial_state = []
        self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

        # Set SMR option (slave_idle_timeout)
        util.log('\n\n\n ### Set SMR option ###')
        for s in cluster['servers']:
            t = telnet.Telnet('SMR%d' % s['id'])
            self.assertEqual(t.connect(s['ip'], s['smr_mgmt_port']), 0,
                    'Failed to connect to smr. ADDR=%s:%d' % (s['ip'], s['smr_mgmt_port']))
            cmd = 'confset slave_idle_timeout_msec 18000'
            util.log('[%s:%d] >> %s' % (s['ip'], s['smr_mgmt_port'], cmd))
            t.write('confset slave_idle_timeout_msec 18000\r\n')
            reply = t.read_until('\r\n').strip()
            util.log('[%s:%d] << %s' % (s['ip'], s['smr_mgmt_port'], reply))
            self.assertEqual(reply, '+OK', 'Failed to set slave_idle_timeout, REPLY=%s' % reply)

        # Network isolation test
        for loop_cnt in range(3):
            # Get master
            master = util.get_server_by_role_and_pg( cluster['servers'], 'master', 0 )

            first_slave = None
            for s in cluster['servers']:
                if s == master:
                    continue

                # Skip non-virtual host
                if s.has_key('real_ip') == False:
                    continue

                if first_slave == None:
                    first_slave = s

                # 'role lconn'
                util.log( 'role lconn pgs%d while hanging.' % s['id'] )

                ret = util.role_lconn_addr( s['real_ip'], s['smr_mgmt_port']  )
                self.assertEqual( ret, '+OK\r\n', 'role lconn failed. reply="%s"' % (ret[:-2]) )
                util.log( 'succeeded : cmd="role lconn", reply="%s"' % (ret[:-2]) )
                time.sleep(0.5)

            # Block network
            util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % loop_cnt)
            self.assertTrue(util.iptables_drop('A', '127.0.0.100', first_slave['smr_mgmt_port']), 'add a bloking role to iptables fail.')

            for i in range(6):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1)

            # Check cluster state
            ok = False
            for i in range(10):
                isolated_states = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
                time.sleep(1)

                state_transition_done = True
                for s in isolated_states:
                    if s['pgs_id'] != first_slave['id']:
                        continue

                    if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                        state_transition_done = False

                if state_transition_done :
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'Fail, state transition')

            # Unblock network
            self.assertTrue(util.iptables_drop('D', '127.0.0.100', first_slave['smr_mgmt_port']), 'delete a bloking role to iptables fail.')

            # Check cluster state
            ok = False
            for i in range(7):
                final_state = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state, check_quorum=True)

                state_consistency = True
                for s in final_state:
                    if s['pgs_id'] == 1:
                        continue

                    if is_pgs_normal(s) == False:
                        state_consistency = False

                if state_consistency:
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'Fail, state consistency')

            ok = False
            for i in xrange(5):
                ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, check_quorum=True)
                if ok:
                    break
                else:
                    time.sleep(1)
            self.assertTrue(ok, 'failed to check cluster state')

        # Check state
        self.assertNotEqual(initial_state, None, 'initial_state is None')
        self.assertNotEqual(final_state, None, 'final_state is None')

        initial_state = sorted(initial_state, key=lambda x: int(x['pgs_id']))
        final_state = sorted(final_state, key=lambda x: int(x['pgs_id']))
        for i in range(len(final_state)):
            msg = 'ts (%d)%d -> (%d)%d' % (initial_state[i]['pgs_id'], initial_state[i]['active_ts'], final_state[i]['pgs_id'], final_state[i]['active_ts'])
            util.log(msg)

        self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, check_quorum=True), 'failed to check cluster state')

        self.assertTrue(conf_checker.final_check())

        # Shutdown cluster
        default_cluster.finalize(cluster)
Пример #4
0
    def test_4_mgmt_is_isolated_with_red_failover(self):
        util.print_frame()

        util.iptables_print_list()

        cluster = filter(lambda x: x['cluster_name'] == 'network_isolation_cluster_1', config.clusters)[0]
        util.log(util.json_to_str(cluster))

        self.leader_cm = cluster['servers'][0]

        # MGMT
        mgmt_ip = cluster['servers'][0]['real_ip']
        mgmt_port = cluster['servers'][0]['cm_port']

        # Create cluster
        conf_checker = default_cluster.initialize_starting_up_smr_before_redis( cluster )
        self.assertIsNotNone(conf_checker, 'failed to initialize cluster')

        util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port)

        # Master must be the first pgs, cluster['servers'][0].
        to_be_master = cluster['servers'][0]
        m = util.get_server_by_role_and_pg(cluster['servers'], 'master', to_be_master['pg_id'])
        master_id = -1
        if m['id'] != to_be_master['id']:
            try_cnt = 0
            while master_id != to_be_master['id'] and try_cnt < 20:
                master_id = util.role_change(cluster['servers'][0], cluster['cluster_name'], to_be_master['id'])
                try_cnt += 1
                time.sleep(1)
            self.assertEquals(master_id, to_be_master['id'], 'change %d to a master fail' % to_be_master['id'])

        # Print initial state of cluster
        util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
        initial_state = []
        self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

        # Set SMR option (slave_idle_timeout)
        util.log('\n\n\n ### Set SMR option ###')
        for s in cluster['servers']:
            t = telnet.Telnet('SMR%d' % s['id'])
            self.assertEqual(t.connect(s['ip'], s['smr_mgmt_port']), 0,
                    'Failed to connect to smr. ADDR=%s:%d' % (s['ip'], s['smr_mgmt_port']))
            cmd = 'confset slave_idle_timeout_msec 18000'
            util.log('[%s:%d] >> %s' % (s['ip'], s['smr_mgmt_port'], cmd))
            t.write('confset slave_idle_timeout_msec 18000\r\n')
            reply = t.read_until('\r\n').strip()
            util.log('[%s:%d] << %s' % (s['ip'], s['smr_mgmt_port'], reply))
            self.assertEqual(reply, '+OK', 'Failed to set slave_idle_timeout, REPLY=%s' % reply)

        # Network isolation test
        for loop_cnt in range(3):
            # Block network
            util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % loop_cnt)
            for s in cluster['servers']:
                self.assertTrue(util.iptables_drop('A', '127.0.0.100', s['smr_mgmt_port']), 'add a bloking role to iptables fail.')

            for i in range(4):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1)

            # Check cluster state
            ok = False
            for i in range(7):
                isolated_states = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
                time.sleep(1)

                state_transition_done = True
                for s in isolated_states:
                    if s['ip'] != '127.0.0.100':
                        continue

                    if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                        state_transition_done = False

                if state_transition_done :
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'Fail, state transition')

            pgs_list = util.get_pgs_info_list(mgmt_ip, mgmt_port, cluster)
            reds = filter(lambda x: x['color'] == 'RED', pgs_list)

            # Shutdown
            server = cluster['servers'][random.choice(reds)['pgs_id']]
            util.log( 'shutdown pgs%d while hanging.' % server['id'] )
            ret = testbase.request_to_shutdown_smr( server )
            self.assertEqual( ret, 0, 'failed to shutdown smr. id:%d' % server['id'] )
            ret = testbase.request_to_shutdown_redis( server )
            self.assertEqual( ret, 0, 'failed to shutdown redis. id:%d' % server['id'] )

            # Check state F
            max_try = 20
            expected = 'F'
            for i in range( 0, max_try):
                util.log('MGMT_IP:%s, MGMT_PORT:%d' % (mgmt_ip, mgmt_port))
                state = util._get_smr_state( server['id'], cluster['cluster_name'], mgmt_ip, mgmt_port )
                if expected == state:
                    break;
                time.sleep( 1 )
            self.assertEqual( expected , state,
                               'server%d - state:%s, expected:%s' % (server['id'], state, expected) )
            util.log( 'succeeded : pgs%d state changed to F.' % server['id'] )

            # Unblock network
            for s in cluster['servers']:
                self.assertTrue(util.iptables_drop('D', '127.0.0.100', s['smr_mgmt_port']), 'delete a bloking role to iptables fail.')

            # Check cluster state
            ok = False
            for i in range(10):
                final_state = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state, check_quorum=True)

                state_consistency = True
                for s in final_state:
                    if s['pgs_id'] == server['id']:
                        continue

                    if is_pgs_normal(s) == False:
                        state_consistency = False

                if state_consistency:
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'Fail, state consistency')

            # Recovery
            util.log( 'restart pgs%d.' % server['id'] )
            ret = testbase.request_to_start_smr( server )
            self.assertEqual( ret, 0, 'failed to start smr. id:%d' % server['id'] )

            ret = testbase.request_to_start_redis( server )
            self.assertEqual( ret, 0, 'failed to start redis. id:%d' % server['id'] )

            wait_count = 20
            ret = testbase.wait_until_finished_to_set_up_role( server, wait_count )
            self.assertEqual( ret, 0, 'failed to role change. smr_id:%d' % (server['id']) )

            redis = redis_mgmt.Redis( server['id'] )
            ret = redis.connect( server['ip'], server['redis_port'] )
            self.assertEqual( ret, 0, 'failed to connect to redis' )

            ok = False
            for i in xrange(5):
                ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, check_quorum=True)
                if ok:
                    break
                else:
                    time.sleep(1)
            self.assertTrue(ok, 'failed to check cluster state')

            # Reset SMR option (slave_idle_timeout)
            t = telnet.Telnet('SMR%d' % server['id'])
            self.assertEqual(t.connect(server['ip'], server['smr_mgmt_port']), 0,
                    'Failed to connect to smr. ADDR=%s:%d' % (server['ip'], server['smr_mgmt_port']))
            cmd = 'confset slave_idle_timeout_msec 18000'
            util.log('[%s:%d] >> %s' % (server['ip'], server['smr_mgmt_port'], cmd))
            t.write('confset slave_idle_timeout_msec 18000\r\n')
            reply = t.read_until('\r\n').strip()
            util.log('[%s:%d] << %s' % (server['ip'], server['smr_mgmt_port'], reply))
            self.assertEqual(reply, '+OK', 'Failed to set slave_idle_timeout, REPLY=%s' % reply)

        # Check state
        self.assertNotEqual(initial_state, None, 'initial_state is None')
        self.assertNotEqual(final_state, None, 'final_state is None')

        initial_state = sorted(initial_state, key=lambda x: int(x['pgs_id']))
        final_state = sorted(final_state, key=lambda x: int(x['pgs_id']))
        for i in range(len(final_state)):
            msg = 'ts (%d)%d -> (%d)%d' % (initial_state[i]['pgs_id'], initial_state[i]['active_ts'], final_state[i]['pgs_id'], final_state[i]['active_ts'])
            util.log(msg)
            if initial_state[i]['pgs_id'] == 1:
                self.assertNotEqual(initial_state[i]['active_ts'], final_state[i]['active_ts'], msg)

        self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, check_quorum=True), 'failed to check cluster state')

        self.assertTrue(conf_checker.final_check())

        # Shutdown cluster
        default_cluster.finalize(cluster)
Пример #5
0
    def test_3_some_pgs_is_isolated_2copy(self):
        util.print_frame()

        out = util.sudo('iptables -L')
        util.log('====================================================================')
        util.log('out : %s' % out)
        util.log('out.return_code : %d' % out.return_code)
        util.log('out.stderr : %s' % out.stderr)
        util.log('out.succeeded : %s' % out.succeeded)

        # Add forwarding role (127.0.0.100 -> 127.0.0.1)
        self.assertTrue(util.iptables_redirect('A', '127.0.0.100', '127.0.0.1'), 'add a forwarding role to iptables fail.')

        cluster = filter(lambda x: x['cluster_name'] == 'network_isolation_cluster_1_2copy', config.clusters)[0]
        util.log(util.json_to_str(cluster))

        # MGMT
        mgmt_ip = cluster['servers'][0]['ip']
        mgmt_port = cluster['servers'][0]['cm_port']

        # Create cluster
        conf_checker = default_cluster.initialize_starting_up_smr_before_redis( cluster )
        self.assertIsNotNone(conf_checker, 'failed to initialize cluster')

        # Place master on real ip address
        for pg_id in [0, 1]:
            m = util.get_server_by_role_and_pg(cluster['servers'], 'master', pg_id)
            s = util.get_server_by_role_and_pg(cluster['servers'], 'slave', pg_id)
            if m.has_key('ip') and m.has_key('real_ip'):
                if m['ip'] != m['real_ip']:
                    ret = util.role_change(cluster['servers'][0], cluster['cluster_name'], s['id'])
                    self.assertNotEquals(ret, -1, 'change %d to a master fail' % s['id'])

        # Print initial state of cluster
        util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
        initial_state = []
        self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

        # Network isolation test
        for cnt in range(3):
            # Block network
            util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % cnt)
            self.assertTrue(util.iptables_drop('A', '127.0.0.100'), 'add a bloking role to iptables fail.')

            for i in range(4):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1)

            # Check cluster state
            ok = False
            for i in range(7):
                isolated_states = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
                time.sleep(1)

                state_transition_done = True
                for s in isolated_states:
                    if s['pgs_id'] == 0 or s['pgs_id'] == 1:
                        continue

                    if s['active_role'] != 'M' or s['mgmt_role'] != 'M':
                        state_transition_done = False

                    if s['quorum'] != 0:
                        state_transition_done = False

                if state_transition_done:
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'Fail, state transition')

            # Unblock network
            util.log('\n\n\n ### UNBLOCK NETWORK, %d ### ' % cnt)
            self.assertTrue(util.iptables_drop('D', '127.0.0.100'), 'delete a bloking role to iptables fail.')

            # Check cluster state
            ok = False
            for i in range(7):
                final_state = []
                if util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state, check_quorum=True) == False:
                    time.sleep(1)
                    continue

                state_consistency = True
                for s in final_state:
                    if s['pgs_id'] == 1:
                        continue

                    if is_pgs_normal(s) == False:
                        state_consistency = False

                if state_consistency:
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'Fail, state consistency')

        # Check state
        self.assertNotEqual(initial_state, None, 'initial_state is None')
        self.assertNotEqual(final_state, None, 'final_state is None')

        # Delete forwarding role (127.0.0.100 -> 127.0.0.1)
        self.assertTrue(util.iptables_redirect('D', '127.0.0.100', '127.0.0.1'), 'delete a forwarding role to iptables fail.')

        self.assertTrue(conf_checker.final_check())

        # Shutdown cluster
        default_cluster.finalize(cluster)
Пример #6
0
    def test_2_some_pgs_is_isolated(self):
        util.print_frame()

        util.iptables_print_list()

        # Add forwarding role (127.0.0.100 -> 127.0.0.1)
        self.assertTrue(util.iptables_redirect('A', '127.0.0.100', '127.0.0.1'), 'add a forwarding role to iptables fail.')

        cluster = filter(lambda x: x['cluster_name'] == 'network_isolation_cluster_2', config.clusters)[0]
        util.log(util.json_to_str(cluster))

        # MGMT
        mgmt_ip = cluster['servers'][0]['real_ip']
        mgmt_port = cluster['servers'][0]['cm_port']

        # Create cluster
        conf_checker = default_cluster.initialize_starting_up_smr_before_redis( cluster )
        self.assertIsNotNone(conf_checker, 'failed to initialize cluster')

        # Place master on virtual ip address in order to cause master election.
        pg_id = 0
        m = util.get_server_by_role_and_pg(cluster['servers'], 'master', pg_id)
        s = util.get_server_by_role_and_pg(cluster['servers'], 'slave', pg_id)
        if m.has_key('ip') == True and m.has_key('real_ip') == False:
            ret = util.role_change(cluster['servers'][0], cluster['cluster_name'], s['id'])
            self.assertNotEquals(ret, -1, 'change %d to a master fail' % s['id'])

        # Print initial state of cluster
        util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
        initial_state = []
        self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

        # Network isolation test
        for cnt in range(3):
            # Block network
            util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % cnt)
            self.assertTrue(util.iptables_drop('A', '127.0.0.100'), 'add a bloking role to iptables fail.')

            for i in range(4):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1)

            # Check cluster state
            ok = False
            for i in range(7):
                isolated_states = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
                time.sleep(1)

                state_transition_done = True
                for s in isolated_states:
                    if s['pgs_id'] == 1:
                        continue

                    if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                        state_transition_done = False

                if state_transition_done :
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'Fail, state transition')

            # Unblock network
            util.log('\n\n\n ### UNBLOCK NETWORK, %d ### ' % cnt)
            self.assertTrue(util.iptables_drop('D', '127.0.0.100'), 'delete a bloking role to iptables fail.')

            # Check cluster state
            ok = False
            for i in range(7):
                final_state = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state, check_quorum=True)

                state_consistency = True
                for s in final_state:
                    if s['pgs_id'] == 1:
                        continue

                    if is_pgs_normal(s) == False:
                        state_consistency = False

                if state_consistency:
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'Fail, state consistency')

        # Check state
        self.assertNotEqual(initial_state, None, 'initial_state is None')
        self.assertNotEqual(final_state, None, 'final_state is None')

        initial_state = sorted(initial_state, key=lambda x: int(x['pgs_id']))
        final_state = sorted(final_state, key=lambda x: int(x['pgs_id']))
        for i in range(len(final_state)):
            msg = 'ts (%d)%d -> (%d)%d' % (initial_state[i]['pgs_id'], initial_state[i]['active_ts'], final_state[i]['pgs_id'], final_state[i]['active_ts'])
            util.log(msg)
            self.assertNotEqual(initial_state[i]['active_ts'], final_state[i]['active_ts'], msg)

        # Delete forwarding role (127.0.0.100 -> 127.0.0.1)
        self.assertTrue(util.iptables_redirect('D', '127.0.0.100', '127.0.0.1'), 'delete a forwarding role to iptables fail')

        self.assertTrue(conf_checker.final_check())

        # Shutdown cluster
        default_cluster.finalize(cluster)
Пример #7
0
    def test_migration_with_expire_command(self):
        util.print_frame()

        util.log("start load_generator")
        load_gen_thrd_list = {}
        for i in range(1):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
            load_gen_thrd_list[i].start()

        time.sleep(5)  # generate load for 5 sec
        tps = 20000
        src_pg_id = 0
        dst_pg_id = 1
        leader_cm = self.cluster['servers'][0]
        src_master = util.get_server_by_role_and_pg(self.cluster['servers'],
                                                    'master', src_pg_id)
        dst_master = util.get_server_by_role_and_pg(self.cluster['servers'],
                                                    'master', dst_pg_id)

        smr = smr_mgmt.SMR(src_master['id'])
        ret = smr.connect(src_master['ip'], src_master['smr_mgmt_port'])
        if ret != 0:
            util.log('failed to connect to smr(source master)')
            return False

        src_redis = redis_mgmt.Redis(src_master['id'])
        ret = src_redis.connect(src_master['ip'], src_master['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')

        dst_redis = redis_mgmt.Redis(dst_master['id'])
        ret = dst_redis.connect(dst_master['ip'], dst_master['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')

        ts = time.time()
        self.setExpireKey(src_redis,
                          'beforeCheckpoint~beforeCheckpoint:expired', 10)
        self.setExpireKey(src_redis,
                          'beforeCheckpoint~beforeCheckpoint:persist', 20)
        self.setExpireS3Key(src_redis,
                            'S3:beforeCheckpoint~beforeCheckpoint:expired', 10)
        self.setExpireS3Key(src_redis,
                            'S3:beforeCheckpoint~beforeCheckpoint:persist', 20)

        self.setExpireS3Key(src_redis, 'S3:PermanentKey', 0)

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(src_redis,
                              'beforeCheckpoint~beforeCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis,
                              'beforeCheckpoint~beforeCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(
            src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(
            src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")

        util.log(">>> migrate test with expire command start(%s), ts:%d" %
                 (time.asctime(), ts))

        ts = time.time()
        self.setExpireKey(src_redis,
                          'beforeCheckpoint~afterCheckpoint:expired', 10)
        self.setExpireKey(src_redis,
                          'beforeCheckpoint~afterCheckpoint:persist', 20)
        self.setExpireS3Key(src_redis,
                            'S3:beforeCheckpoint~afterCheckpoint:expired', 10)
        self.setExpireS3Key(src_redis,
                            'S3:beforeCheckpoint~afterCheckpoint:persist', 20)

        # notify dst_redis of migration start
        util.log(">>> notify dst_redis of migration start (%s)" %
                 time.asctime())

        cmd = 'migconf migstart %d-%d\r\n' % (0, 8191)
        dst_redis.write(cmd)
        res = dst_redis.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n')

        # remote partial checkpoint
        util.log(">>> start remote checkpoint and load (%s)" % time.asctime())
        cmd = "./cluster-util --getandplay %s %d %s %d %d-%d %d" % (
            src_master['ip'], src_master['redis_port'], dst_master['ip'],
            dst_master['redis_port'], 0, 8191, tps)
        p = util.exec_proc_async(util.cluster_util_dir(src_master['id']), cmd,
                                 True, None, subprocess.PIPE, None)

        ret = p.wait()
        for line in p.stdout:
            if line.find("Checkpoint Sequence Number:") != -1:
                util.log("seqnumber : " + line[line.rfind(":") + 1:])
                seq = int(line[line.rfind(":") + 1:])
            util.log(">>>" + str(line.rstrip()))

        self.assertEqual(0, ret)
        util.log(">>> end remote checkpoint and load (%s)" % time.asctime())

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(src_redis,
                              'beforeCheckpoint~afterCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis,
                              'beforeCheckpoint~afterCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(src_redis,
                                'S3:beforeCheckpoint~afterCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(src_redis,
                                'S3:beforeCheckpoint~afterCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")

        # bgsave for testing later about recovery during migration
        util.log(
            ">>> bgsave for testing later about recovery during migration (%s)"
            % time.asctime())
        cmd = 'bgsave\r\n'
        dst_redis.write(cmd)
        res = dst_redis.read_until('\r\n')
        self.assertEquals(res, '+Background saving started\r\n')

        ts = time.time()
        self.setExpireKey(src_redis, 'afterCheckpoint~afterCheckpoint:expired',
                          10)
        self.setExpireKey(src_redis, 'afterCheckpoint~afterCheckpoint:persist',
                          20)
        self.setExpireS3Key(src_redis,
                            'S3:afterCheckpoint~afterCheckpoint:expired', 10)
        self.setExpireS3Key(src_redis,
                            'S3:afterCheckpoint~afterCheckpoint:persist', 20)

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(src_redis,
                              'afterCheckpoint~afterCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis,
                              'afterCheckpoint~afterCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(src_redis,
                                'S3:afterCheckpoint~afterCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(src_redis,
                                'S3:afterCheckpoint~afterCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        self.setExpireKey(src_redis, 'afterCheckpoint~duringCatchup:expired',
                          10)
        self.setExpireKey(src_redis, 'afterCheckpoint~duringCatchup:persist',
                          100)
        self.setExpireS3Key(src_redis,
                            'S3:afterCheckpoint~duringCatchup:expired', 10)
        self.setExpireS3Key(src_redis,
                            'S3:afterCheckpoint~duringCatchup:persist', 100)

        # remote catchup (smr log migration)
        util.log(">>> start remote catchup (%s)" % time.asctime())

        dst_host = dst_master['ip']
        dst_smr_port = dst_master['smr_base_port']
        rle = '1 8192'
        num_part = 8192

        smr.write('migrate start %s %d %d %d %d %s\r\n' %
                  (dst_host, dst_smr_port, seq, tps, num_part, rle))
        response = smr.read_until('\r\n')
        if response[:3] != '+OK':
            util.log('failed to execute migrate start command, response:%s' %
                     response)
            return False

        while True:
            smr.write('migrate info\r\n')
            response = smr.read_until('\r\n')
            seqs = response.split()
            logseq = int(seqs[1].split(':')[1])
            mig = int(seqs[2].split(':')[1])
            util.log('migrate info: %s' % response)
            if (logseq - mig < 500000):
                util.log('Remote catchup almost done. try mig2pc')
                break
            time.sleep(1)

        util.log(">>> sleep until 90 sec pass")
        self.assertFalse(time.time() - ts >= 90)
        time.sleep(90 - (time.time() - ts))

        res = self.persistKey(src_redis,
                              'afterCheckpoint~duringCatchup:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis,
                              'afterCheckpoint~duringCatchup:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(src_redis,
                                'S3:afterCheckpoint~duringCatchup:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(src_redis,
                                'S3:afterCheckpoint~duringCatchup:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        self.setExpireKey(src_redis, 'duringCatchup~duringCatchup:expired', 10)
        self.setExpireKey(src_redis, 'duringCatchup~duringCatchup:persist', 20)
        self.setExpireS3Key(src_redis,
                            'S3:duringCatchup~duringCatchup:expired', 10)
        self.setExpireS3Key(src_redis,
                            'S3:duringCatchup~duringCatchup:persist', 20)

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(src_redis, 'duringCatchup~duringCatchup:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis, 'duringCatchup~duringCatchup:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(src_redis,
                                'S3:duringCatchup~duringCatchup:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(src_redis,
                                'S3:duringCatchup~duringCatchup:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        self.setExpireKey(src_redis, 'duringCatchup~afterMig2pc:expired', 10)
        self.setExpireKey(src_redis, 'duringCatchup~afterMig2pc:persist', 20)
        self.setExpireS3Key(src_redis, 'S3:duringCatchup~afterMig2pc:expired',
                            10)
        self.setExpireS3Key(src_redis, 'S3:duringCatchup~afterMig2pc:persist',
                            20)

        util.log(">>> remote catchup phase almost done (%s)" % time.asctime())

        # mig2pc
        util.log(">>> start mig2pc (%s)" % time.asctime())

        cmd = 'mig2pc %s %d %d %d %d' % (self.cluster['cluster_name'],
                                         src_pg_id, dst_pg_id, 0, 8191)
        result = util.cm_command(leader_cm['ip'], leader_cm['cm_port'], cmd)
        util.log('mig2pc result : ' + result)
        if not result.startswith('{"state":"success","msg":"+OK"}\r\n'):
            util.log('failed to execute mig2pc command, result:%s' % result)
            return False

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(dst_redis, 'duringCatchup~afterMig2pc:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(dst_redis, 'duringCatchup~afterMig2pc:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(dst_redis,
                                'S3:duringCatchup~afterMig2pc:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(dst_redis,
                                'S3:duringCatchup~afterMig2pc:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        self.setExpireKey(dst_redis, 'afterMig2pc~migrateEnd:expired', 10)
        self.setExpireKey(dst_redis, 'afterMig2pc~migrateEnd:persist', 20)
        self.setExpireS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:expired', 10)
        self.setExpireS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:persist', 20)

        # finish migration
        smr.write('migrate interrupt\r\n')
        response = smr.read_until('\r\n')
        util.log('migrate interrupt: %s' % response)
        smr.disconnect()

        # notify dst_redis of migration end
        util.log(">>> notify dst_redis of migration end (%s)" % time.asctime())

        cmd = 'migconf migend\r\n'
        dst_redis.write(cmd)
        res = dst_redis.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n')

        cmd = 'migconf clearstart %d-%d\r\n' % (0, 8191)
        src_redis.write(cmd)
        res = src_redis.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n')

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(dst_redis, 'afterMig2pc~migrateEnd:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(dst_redis, 'afterMig2pc~migrateEnd:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        self.assertTrue(
            self.isExist(dst_redis,
                         'beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(
            self.isExist(dst_redis,
                         'beforeCheckpoint~beforeCheckpoint:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:beforeCheckpoint~beforeCheckpoint:expired'))

        self.assertTrue(
            self.isExist(dst_redis,
                         'beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isExist(dst_redis,
                         'beforeCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:beforeCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:afterCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:afterCheckpoint~duringCatchup:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'duringCatchup~duringCatchup:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'duringCatchup~duringCatchup:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:duringCatchup~duringCatchup:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:duringCatchup~duringCatchup:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'duringCatchup~afterMig2pc:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'duringCatchup~afterMig2pc:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'afterMig2pc~migrateEnd:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'afterMig2pc~migrateEnd:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:expired'))

        # remote partial checkpoint
        util.log(">>> start rangedel (%s)" % time.asctime())
        cmd = "./cluster-util --rangedel %s %d %d-%d %d" % (
            src_master['ip'], src_master['redis_port'], 0, 8191, tps)
        p = util.exec_proc_async(util.cluster_util_dir(src_master['id']), cmd,
                                 True, None, subprocess.PIPE, None)
        ret = p.wait()

        for line in p.stdout:
            util.log(">>>" + str(line.rstrip()))

        cmd = 'migconf clearend\r\n'
        src_redis.write(cmd)
        res = src_redis.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n')

        time.sleep(5)  # generate load for 5 sec
        # check consistency of load_generator
        for i in range(len(load_gen_thrd_list)):
            load_gen_thrd_list[i].quit()
        for i in range(len(load_gen_thrd_list)):
            load_gen_thrd_list[i].join()
            self.assertTrue(load_gen_thrd_list[i].isConsistent(),
                            'Inconsistent after migration')

        # kill dst_redis and recover from bgsave
        util.log(">>> kill dst_redis and recover from bgsave (%s)" %
                 time.asctime())

        dst_redis.disconnect()
        ret = testbase.request_to_shutdown_redis(dst_master)
        self.assertEquals(ret, 0, 'failed to shutdown redis')
        ret = testbase.request_to_shutdown_smr(dst_master)
        self.assertEquals(ret, 0, 'failed to shutdown smr')
        time.sleep(5)

        testbase.request_to_start_smr(dst_master)
        self.assertEqual(ret, 0,
                         'failed to start smr, server:%d' % dst_master['id'])

        ret = testbase.request_to_start_redis(dst_master)
        self.assertEqual(ret, 0,
                         'failed to start redis, server:%d' % dst_master['id'])

        ret = testbase.wait_until_finished_to_set_up_role(dst_master)
        self.assertEquals(
            ret, 0, 'failed to role change. server:%d' % (dst_master['id']))

        dst_redis = redis_mgmt.Redis(dst_master['id'])
        ret = dst_redis.connect(dst_master['ip'], dst_master['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')

        self.assertTrue(
            self.isExist(dst_redis,
                         'beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(
            self.isExist(dst_redis,
                         'beforeCheckpoint~beforeCheckpoint:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:beforeCheckpoint~beforeCheckpoint:expired'))

        self.assertTrue(
            self.isExist(dst_redis,
                         'beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isExist(dst_redis,
                         'beforeCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:beforeCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:afterCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:afterCheckpoint~duringCatchup:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'duringCatchup~duringCatchup:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'duringCatchup~duringCatchup:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:duringCatchup~duringCatchup:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:duringCatchup~duringCatchup:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'duringCatchup~afterMig2pc:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'duringCatchup~afterMig2pc:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'afterMig2pc~migrateEnd:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'afterMig2pc~migrateEnd:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:expired'))

        self.getS3TTL(dst_redis, 'S3:PermanentKey')

        # kill dst_slave redis and recover without dump file
        util.log(">>> kill dst_redis and recover without dump file (%s)" %
                 time.asctime())
        dst_slave = util.get_server_by_role_and_pg(self.cluster['servers'],
                                                   'slave', dst_pg_id)

        ret = testbase.request_to_shutdown_redis(dst_slave)
        self.assertEquals(ret, 0, 'failed to shutdown redis')
        ret = testbase.request_to_shutdown_smr(dst_slave)
        self.assertEquals(ret, 0, 'failed to shutdown smr')
        time.sleep(5)

        testbase.request_to_start_smr(dst_slave)
        self.assertEqual(ret, 0,
                         'failed to start smr, server:%d' % dst_slave['id'])

        ret = testbase.request_to_start_redis(dst_slave)
        self.assertEqual(ret, 0,
                         'failed to start redis, server:%d' % dst_slave['id'])

        ret = testbase.wait_until_finished_to_set_up_role(dst_slave)
        self.assertEquals(
            ret, 0, 'failed to role change. server:%d' % (dst_slave['id']))

        dst_redis_slave = redis_mgmt.Redis(dst_slave['id'])
        ret = dst_redis_slave.connect(dst_slave['ip'], dst_slave['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')

        self.assertTrue(
            self.isExist(dst_redis_slave,
                         'beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(
            self.isExist(dst_redis_slave,
                         'beforeCheckpoint~beforeCheckpoint:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis_slave,
                           'S3:beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis_slave,
                           'S3:beforeCheckpoint~beforeCheckpoint:expired'))

        self.assertTrue(
            self.isExist(dst_redis_slave,
                         'beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isExist(dst_redis_slave,
                         'beforeCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis_slave,
                           'S3:beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis_slave,
                           'S3:beforeCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(
            self.isExist(dst_redis_slave,
                         'afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isExist(dst_redis_slave,
                         'afterCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis_slave,
                           'S3:afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis_slave,
                           'S3:afterCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(
            self.isExist(dst_redis_slave,
                         'afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(
            self.isExist(dst_redis_slave,
                         'afterCheckpoint~duringCatchup:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis_slave,
                           'S3:afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis_slave,
                           'S3:afterCheckpoint~duringCatchup:expired'))

        self.assertTrue(
            self.isExist(dst_redis_slave,
                         'duringCatchup~duringCatchup:persist'))
        self.assertFalse(
            self.isExist(dst_redis_slave,
                         'duringCatchup~duringCatchup:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis_slave,
                           'S3:duringCatchup~duringCatchup:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis_slave,
                           'S3:duringCatchup~duringCatchup:expired'))

        self.assertTrue(
            self.isExist(dst_redis_slave, 'duringCatchup~afterMig2pc:persist'))
        self.assertFalse(
            self.isExist(dst_redis_slave, 'duringCatchup~afterMig2pc:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis_slave,
                           'S3:duringCatchup~afterMig2pc:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis_slave,
                           'S3:duringCatchup~afterMig2pc:expired'))

        self.assertTrue(
            self.isExist(dst_redis_slave, 'afterMig2pc~migrateEnd:persist'))
        self.assertFalse(
            self.isExist(dst_redis_slave, 'afterMig2pc~migrateEnd:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis_slave,
                           'S3:afterMig2pc~migrateEnd:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis_slave,
                           'S3:afterMig2pc~migrateEnd:expired'))

        self.getS3TTL(dst_redis_slave, 'S3:PermanentKey')

        # Go back to initial configuration
        self.assertTrue(
            util.migration(self.cluster, dst_pg_id, src_pg_id, 0, 8191, 40000),
            'failed to rollback migration')
Пример #8
0
    def test_2_some_pgs_is_isolated(self):
        util.print_frame()

        out = util.sudo('iptables -L')
        util.log('====================================================================')
        util.log('out : %s' % out)
        util.log('out.return_code : %d' % out.return_code)
        util.log('out.stderr : %s' % out.stderr)
        util.log('out.succeeded : %s' % out.succeeded)

        # Add forwarding role (127.0.0.100 -> 127.0.0.1)
        out = util.sudo('iptables -t nat -A OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'add a forwarding role to iptables fail. output:%s' % out)

        out = util.sudo('iptables -t nat -A PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'add a forwarding role to iptables fail. output:%s' % out)

        cluster = filter(lambda x: x['cluster_name'] == 'network_isolation_cluster_2', config.clusters)[0]
        util.log(util.json_to_str(cluster))

        # MGMT
        mgmt_ip = cluster['servers'][0]['real_ip']
        mgmt_port = cluster['servers'][0]['cm_port']

        # Create cluster
        ret = default_cluster.initialize_starting_up_smr_before_redis( cluster )
        self.assertEqual(0, ret, 'failed to TestMaintenance.initialize')

        # Place master on virtual ip address in order to cause master election.
        pg_id = 0
        m = util.get_server_by_role_and_pg(cluster['servers'], 'master', pg_id)
        s = util.get_server_by_role_and_pg(cluster['servers'], 'slave', pg_id)
        if m.has_key('ip') == True and m.has_key('real_ip') == False:
            ret = util.role_change(cluster['servers'][0], cluster['cluster_name'], s['id'])
            self.assertNotEquals(ret, -1, 'change %d to a master fail' % s['id'])

        # Print initial state of cluster
        util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
        initial_state = []
        self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

        # Network isolation test
        for cnt in range(3):
            # Block network
            util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % cnt)
            out = util.sudo('iptables -A OUTPUT -d 127.0.0.100 -j DROP')
            self.assertTrue(out.succeeded, 'add a bloking role to iptables fail. output:%s' % out)

            for i in range(4):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1)

            # Check cluster state
            ok = False
            for i in range(7):
                isolated_states = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
                time.sleep(1)

                state_transition_done = True
                for s in isolated_states:
                    if s['pgs_id'] == 1:
                        continue

                    if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                        state_transition_done = False

                if state_transition_done :
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'Fail, state transition')

            # Unblock network
            util.log('\n\n\n ### UNBLOCK NETWORK, %d ### ' % cnt)
            out = util.sudo('iptables -D OUTPUT -d 127.0.0.100 -j DROP')
            self.assertTrue(out.succeeded, 'delete a bloking role to iptables fail. output:%s' % out)

            # Check cluster state
            ok = False
            for i in range(7):
                final_state = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state, check_quorum=True)

                state_consistency = True
                for s in final_state:
                    if s['pgs_id'] == 1:
                        continue

                    if s['active_role'] != s['mgmt_role']:
                        state_consistency = False

                if state_consistency:
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'Fail, state consistency')

        # Check state
        self.assertNotEqual(initial_state, None, 'initial_state is None')
        self.assertNotEqual(final_state, None, 'final_state is None')

        initial_state = sorted(initial_state, key=lambda x: int(x['pgs_id']))
        final_state = sorted(final_state, key=lambda x: int(x['pgs_id']))
        for i in range(len(final_state)):
            msg = 'ts (%d)%d -> (%d)%d' % (initial_state[i]['pgs_id'], initial_state[i]['active_ts'], final_state[i]['pgs_id'], final_state[i]['active_ts'])
            util.log(msg)
            self.assertNotEqual(initial_state[i]['active_ts'], final_state[i]['active_ts'], msg)

        # Shutdown cluster
        ret = default_cluster.finalize( cluster )
        self.assertEqual(ret, 0, 'failed to TestMaintenance.finalize')

        # Delete forwarding role (127.0.0.100 -> 127.0.0.1)
        out = util.sudo('iptables -t nat -D OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'delete a forwarding role to iptables fail. output:%s' % out)

        out = util.sudo('iptables -t nat -D PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'delete a forwarding role to iptables fail. output:%s' % out)