def delete_all_buckets_or_assert(servers, test_case):
    log = logger.Logger.get_logger()
    log.info('deleting existing buckets on {0}'.format(servers))
    for serverInfo in servers:
        rest = RestConnection(serverInfo)
        buckets = []
        try:
            buckets = rest.get_buckets()
        except:
            log.info('15 seconds sleep before calling get_buckets again...')
            time.sleep(15)
            buckets = rest.get_buckets()
        for bucket in buckets:
            status = rest.delete_bucket(bucket.name)
            if not status:
                try:
                    log.info(StatsCommon.get_stats([serverInfo], bucket.name, "timings"))
                except:
                    log.error("Unable to get timings for bucket")
            log.info('deleted bucket : {0} from {1}'.format(bucket.name, serverInfo.ip))
            msg = 'bucket "{0}" was not deleted even after waiting for two minutes'.format(bucket.name)
            if test_case:
                if not BucketOperationHelper.wait_for_bucket_deletion(bucket.name, rest, 200):
                    try:
                        log.info(StatsCommon.get_stats([serverInfo], bucket.name, "timings"))
                    except:
                        log.error("Unable to get timings for bucket")
                    test_case.fail(msg)
def delete_bucket_or_assert(serverInfo, bucket='default', test_case=None):
    log = logger.Logger.get_logger()
    log.info('deleting existing bucket {0} on {1}'.format(bucket, serverInfo))
    rest = RestConnection(serverInfo)
    if RestHelper(rest).bucket_exists(bucket):
        status = rest.delete_bucket(bucket)
        if not status:
            try:
                BucketOperationHelper.print_dataStorage_content([serverInfo])
                log.info(StatsCommon.get_stats([serverInfo], bucket, "timings"))
            except:
                log.error("Unable to get timings for bucket")
        log.info('deleted bucket : {0} from {1}'.format(bucket, serverInfo.ip))
    msg = 'bucket "{0}" was not deleted even after waiting for two minutes'.format(bucket)
    if test_case:
        if not BucketOperationHelper.wait_for_bucket_deletion(bucket, rest, 200):
            try:
                # qualify the helper call; the original referenced
                # print_dataStorage_content without the class prefix,
                # which would raise a NameError here
                BucketOperationHelper.print_dataStorage_content([serverInfo])
                log.info(StatsCommon.get_stats([serverInfo], bucket, "timings"))
            except:
                log.error("Unable to get timings for bucket")
            test_case.fail(msg)
def delete_all_buckets_or_assert(servers, test_case):
    log = logger.Logger.get_logger()
    for serverInfo in servers:
        rest = RestConnection(serverInfo)
        buckets = []
        try:
            buckets = rest.get_buckets()
        except Exception as e:
            log.error(e)
            log.error('15 seconds sleep before calling get_buckets again...')
            time.sleep(15)
            buckets = rest.get_buckets()
        log.info('deleting existing buckets {0} on {1}'.format([b.name for b in buckets], serverInfo.ip))
        for bucket in buckets:
            log.info("remove bucket {0} ...".format(bucket.name))
            try:
                status = rest.delete_bucket(bucket.name)
            except ServerUnavailableException as e:
                log.error(e)
                log.error('5 seconds sleep before calling delete_bucket again...')
                time.sleep(5)
                status = rest.delete_bucket(bucket.name)
            if not status:
                try:
                    BucketOperationHelper.print_dataStorage_content(servers)
                    log.info(StatsCommon.get_stats([serverInfo], bucket.name, "timings"))
                except:
                    log.error("Unable to get timings for bucket")
            log.info('deleted bucket : {0} from {1}'.format(bucket.name, serverInfo.ip))
            msg = 'bucket "{0}" was not deleted even after waiting for two minutes'.format(bucket.name)
            if test_case:
                if not BucketOperationHelper.wait_for_bucket_deletion(bucket.name, rest, 200):
                    try:
                        BucketOperationHelper.print_dataStorage_content(servers)
                        log.info(StatsCommon.get_stats([serverInfo], bucket.name, "timings"))
                    except:
                        log.error("Unable to get timings for bucket")
                    test_case.fail(msg)
def _verify_checkpoint_id(self, param, stat_key, m_stats):
    timeout = 60 if (self.num_items * .001) < 60 else self.num_items * .001

    # verify checkpoint id increases on master node
    chk_pnt = int(m_stats[m_stats.keys()[0]])
    tasks = []
    tasks.append(self.cluster.async_wait_for_stats([self.master], self.bucket, param,
                                                   stat_key, '>', chk_pnt))
    for task in tasks:
        try:
            task.result(timeout)
        except TimeoutError:
            self.fail("New checkpoint not created")
    time.sleep(timeout / 10)

    # verify master and all replicas are in sync with checkpoint ids
    m_stats = StatsCommon.get_stats([self.master], self.bucket, param, stat_key)
    chk_pnt = int(m_stats[m_stats.keys()[0]])
    tasks = []
    for server in self.servers:
        tasks.append(self.cluster.async_wait_for_stats([server], self.bucket, param,
                                                       stat_key, '==', chk_pnt))
    for task in tasks:
        try:
            task.result(timeout)
        except TimeoutError:
            self.fail("Master and all replicas are NOT in sync with checkpoint ids")
def checkpoint_server_down(self):
    """Load N items. Shut down server R2. Then restart R2 and
    verify backfill happens on R1 and R2."""

    param = 'checkpoint'
    stat_key = 'vb_0:open_checkpoint_id'
    rest = RestConnection(self.master)

    self._set_checkpoint_size(self.servers[:self.num_servers], self.bucket, self.checkpoint_size)
    generate_load_one = BlobGenerator('nosql', 'nosql-', self.value_size, end=self.num_items)
    self._load_all_buckets(self.master, generate_load_one, "create", 0, 1, 0, True,
                           batch_size=self.checkpoint_size, pause_secs=5, timeout_secs=180)
    self._wait_for_stats_all_buckets(self.servers[:self.num_servers])

    prev_backfill_timestamp_R1 = self._get_backfill_timestamp(self.replica1, self.replica2)
    prev_backfill_timestamp_R2 = self._get_backfill_timestamp(self.replica2, self.replica3)
    m_stats = StatsCommon.get_stats([self.master], self.bucket, param, stat_key)

    self._stop_server(self.replica2)
    time.sleep(5)
    data_load_thread = Thread(target=self._load_data_use_workloadgen, name="load_data",
                              args=(self.master,))
    data_load_thread.start()
    data_load_thread.join()
    self._start_server(self.replica2)
    time.sleep(5)

    self._verify_checkpoint_id(param, stat_key, m_stats)
    self._verify_backfill_happen(self.replica1, self.replica2, prev_backfill_timestamp_R1, True)
    self._verify_backfill_happen(self.replica2, self.replica3, prev_backfill_timestamp_R2, True)
def checkpoint_create_time(self):
    """Load data, but let the timeout create a new checkpoint on all replicas"""

    param = 'checkpoint'
    stat_key = 'vb_0:open_checkpoint_id'

    self._set_checkpoint_timeout(self.servers[:self.num_servers], self.bucket, str(self.timeout))
    generate_load = BlobGenerator('nosql', 'nosql-', self.value_size, end=self.num_items)
    self._load_all_buckets(self.master, generate_load, "create", 0, 1, 0, True,
                           batch_size=self.checkpoint_size, pause_secs=5, timeout_secs=180)
    self._wait_for_stats_all_buckets(self.servers[:self.num_servers])
    chk_stats = StatsCommon.get_stats([self.master], self.bucket, param, stat_key)
    self.log.info("Sleeping for {0} seconds".format(self.timeout + 5))
    time.sleep(self.timeout + 5)
    self._verify_checkpoint_id(param, stat_key, chk_stats)
    self._verify_stats_all_buckets(self.servers[:self.num_servers])
def checkpoint_collapse(self):
    """With 3 replicas, stop replication on R2, let Master and R1 close checkpoint.
    Run load until a new checkpoint is created on Master and R1.
    Wait till checkpoints merge on R1. Restart replication of R2.
    Checkpoint should advance to the latest on R2."""

    param = 'checkpoint'
    stat_key = 'vb_0:last_closed_checkpoint_id'
    stat_chk_itms = 'vb_0:num_checkpoint_items'

    self._set_checkpoint_size(self.servers[:self.num_servers], self.bucket, str(self.checkpoint_size))
    self._stop_replication(self.replica2, self.bucket)

    generate_load = BlobGenerator('nosql', 'nosql-', self.value_size, end=self.num_items)
    data_load_thread = Thread(target=self._load_all_buckets,
                              name="load_data",
                              args=(self.master, generate_load, "create", 0, 1, 0, True,
                                    self.checkpoint_size, 5, 180))
    data_load_thread.start()
    m_stats = StatsCommon.get_stats([self.master], self.bucket, param, stat_key)

    tasks = []
    chk_pnt = int(m_stats[m_stats.keys()[0]]) + 2
    tasks.append(self.cluster.async_wait_for_stats([self.master], self.bucket, param,
                                                   stat_key, '>=', chk_pnt))
    tasks.append(self.cluster.async_wait_for_stats([self.replica1], self.bucket, param,
                                                   stat_key, '>=', chk_pnt))
    tasks.append(self.cluster.async_wait_for_stats([self.replica1], self.bucket, param,
                                                   stat_chk_itms, '>=', self.num_items))
    data_load_thread.join()
    for task in tasks:
        try:
            task.result(60)
        except TimeoutError:
            self.fail("Checkpoint not collapsed")

    tasks = []
    self._start_replication(self.replica2, self.bucket)
    tasks.append(self.cluster.async_wait_for_stats([self.replica1], self.bucket, param,
                                                   stat_chk_itms, '<', self.num_items))
    for task in tasks:
        try:
            task.result(60)
        except TimeoutError:
            self.fail("Checkpoints not replicated to replica2")

    self._verify_checkpoint_id(param, stat_key, m_stats)
    self._verify_stats_all_buckets(self.servers[:self.num_servers])
def checkpoint_create_time(self):
    param = 'checkpoint'
    timeout = 60
    stat_key = 'vb_0:open_checkpoint_id'

    master = self._get_server_by_state(self.servers[:self.num_servers], self.bucket, ACTIVE)
    self._set_checkpoint_timeout(self.servers[:self.num_servers], self.bucket, str(timeout))
    chk_stats = StatsCommon.get_stats(self.servers[:self.num_servers], self.bucket, param, stat_key)

    load_thread = self.generate_load(master, self.bucket, 1)
    load_thread.join()
    log.info("Sleeping for {0} seconds".format(timeout))
    time.sleep(timeout)

    tasks = []
    for server, value in chk_stats.items():
        tasks.append(self.cluster.async_wait_for_stats([server], self.bucket, param,
                                                       stat_key, '>', value))
    for task in tasks:
        try:
            task.result(60)
        except TimeoutError:
            self.fail("New checkpoint not created")
    self._set_checkpoint_timeout(self.servers[:self.num_servers], self.bucket, str(600))
def checkpoint_create_items(self):
    """Load data until a new checkpoint is created on all replicas"""

    param = 'checkpoint'
    stat_key = 'vb_0:open_checkpoint_id'

    self._set_checkpoint_size(self.servers[:self.num_servers], self.bucket, str(self.checkpoint_size))
    chk_stats = StatsCommon.get_stats([self.master], self.bucket, param, stat_key)
    generate_load = BlobGenerator('nosql', 'nosql-', self.value_size, end=self.num_items)
    self._load_all_buckets(self.master, generate_load, "create", 0, 1, 0, True,
                           batch_size=self.checkpoint_size, pause_secs=5, timeout_secs=180)
    self._wait_for_stats_all_buckets(self.servers[:self.num_servers])
    self._verify_checkpoint_id(param, stat_key, chk_stats)
    self._verify_stats_all_buckets(self.servers[:self.num_servers])
def _warmup_check_without_access_log(self):
    if not self.without_access_log:
        return True

    warmed_up = {}
    stats_all_buckets = {}
    for bucket in self.buckets:
        stats_all_buckets[bucket.name] = StatsCommon()
        warmed_up[bucket.name] = {}
        for server in self.servers:
            warmed_up[bucket.name][server] = False

    for bucket in self.buckets:
        for server in self.servers:
            start = time.time()
            while time.time() - start < self.timeout and not warmed_up[bucket.name][server]:
                if stats_all_buckets[bucket.name].get_stats([server], bucket, 'warmup', 'ep_warmup_key_count')[server] >= \
                   stats_all_buckets[bucket.name].get_stats([server], bucket, 'warmup', 'ep_warmup_min_item_threshold')[server] or \
                   stats_all_buckets[bucket.name].get_stats([server], bucket, '', 'mem_used')[server] >= \
                   stats_all_buckets[bucket.name].get_stats([server], bucket, 'warmup', 'ep_warmup_min_memory_threshold')[server] or \
                   stats_all_buckets[bucket.name].get_stats([server], bucket, '', 'mem_used')[server] >= \
                   stats_all_buckets[bucket.name].get_stats([server], bucket, '', 'ep_mem_low_wat')[server]:
                    warmed_up[bucket.name][server] = True
                else:
                    self.log.info("curr_items is %s and ep_warmup_min_item_threshold is %s"
                                  % (stats_all_buckets[bucket.name].get_stats([server], bucket, '', 'curr_items')[server],
                                     stats_all_buckets[bucket.name].get_stats([server], bucket, 'warmup', 'ep_warmup_min_item_threshold')[server]))
                    self.log.info("vb_active_perc_mem_resident is %s and ep_warmup_min_memory_threshold is %s"
                                  % (stats_all_buckets[bucket.name].get_stats([server], bucket, '', 'vb_active_perc_mem_resident')[server],
                                     stats_all_buckets[bucket.name].get_stats([server], bucket, 'warmup', 'ep_warmup_min_memory_threshold')[server]))
                    self.log.info("mem_used is %s and ep_mem_low_wat is %s"
                                  % (stats_all_buckets[bucket.name].get_stats([server], bucket, '', 'mem_used')[server],
                                     stats_all_buckets[bucket.name].get_stats([server], bucket, '', 'ep_mem_low_wat')[server]))
                time.sleep(10)

    for bucket in self.buckets:
        for server in self.servers:
            if not warmed_up[bucket.name][server]:
                return False
    return True
def _get_backfill_timestamp(self, server, replica_server):
    param = 'tap'
    stat_key = 'eq_tapq:replication_ns_1@%s:backfill_start_timestamp' % (replica_server.ip)
    m_stats = StatsCommon.get_stats([server], self.bucket, param, stat_key)
    self.log.info("eq_tapq:replication_ns_1@%s:backfill_start_timestamp: %s"
                  % (replica_server.ip, m_stats[m_stats.keys()[0]]))
    return int(m_stats[m_stats.keys()[0]])
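# Hedged sketch (not part of the original source): the checkpoint tests in this section
# call a _verify_backfill_happen(server, replica_server, prev_timestamp[, backfill_expected])
# helper that is not shown here. Built on _get_backfill_timestamp above, it could look
# roughly like the following; the body is an assumption inferred from the call sites,
# not the project's actual implementation.
def _verify_backfill_happen(self, server, replica_server, prev_backfill_timestamp,
                            backfill_expected=False):
    curr_backfill_timestamp = self._get_backfill_timestamp(server, replica_server)
    if backfill_expected:
        # a later backfill_start_timestamp means a new backfill was triggered
        self.assertTrue(curr_backfill_timestamp > prev_backfill_timestamp,
                        "Expected a backfill, but backfill_start_timestamp did not advance")
    else:
        self.assertEqual(curr_backfill_timestamp, prev_backfill_timestamp,
                         "A backfill happened although none was expected")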
def delete_bucket_or_assert(serverInfo, bucket='default', test_case=None):
    log = logger.Logger.get_logger()
    log.info('deleting existing bucket {0} on {1}'.format(bucket, serverInfo))
    rest = RestConnection(serverInfo)
    if RestHelper(rest).bucket_exists(bucket):
        status = rest.delete_bucket(bucket)
        if not status:
            try:
                log.info(StatsCommon.get_stats([serverInfo], bucket, "timings"))
            except:
                log.error("Unable to get timings for bucket")
        log.info('deleted bucket : {0} from {1}'.format(bucket, serverInfo.ip))
    msg = 'bucket "{0}" was not deleted even after waiting for two minutes'.format(bucket)
    if test_case:
        if not BucketOperationHelper.wait_for_bucket_deletion(bucket, rest, 200):
            try:
                log.info(StatsCommon.get_stats([serverInfo], bucket, "timings"))
            except:
                log.error("Unable to get timings for bucket")
            test_case.fail(msg)
def checkpoint_collapse(self):
    param = 'checkpoint'
    chk_size = 5000
    num_items = 25000
    stat_key = 'vb_0:last_closed_checkpoint_id'
    stat_chk_itms = 'vb_0:num_checkpoint_items'

    master = self._get_server_by_state(self.servers[:self.num_servers], self.bucket, ACTIVE)
    slave1 = self._get_server_by_state(self.servers[:self.num_servers], self.bucket, REPLICA1)
    slave2 = self._get_server_by_state(self.servers[:self.num_servers], self.bucket, REPLICA2)
    self._set_checkpoint_size(self.servers[:self.num_servers], self.bucket, str(chk_size))
    m_stats = StatsCommon.get_stats([master], self.bucket, param, stat_key)

    self._stop_replication(slave2, self.bucket)
    load_thread = self.generate_load(master, self.bucket, num_items)
    load_thread.join()

    tasks = []
    chk_pnt = str(int(m_stats[m_stats.keys()[0]]) + (num_items / chk_size))
    tasks.append(self.cluster.async_wait_for_stats([master], self.bucket, param,
                                                   stat_key, '==', chk_pnt))
    tasks.append(self.cluster.async_wait_for_stats([slave1], self.bucket, param,
                                                   stat_key, '==', chk_pnt))
    tasks.append(self.cluster.async_wait_for_stats([slave1], self.bucket, param,
                                                   stat_chk_itms, '>=', str(num_items)))
    for task in tasks:
        try:
            task.result(60)
        except TimeoutError:
            self.fail("Checkpoint not collapsed")

    tasks = []
    self._start_replication(slave2, self.bucket)
    chk_pnt = str(int(m_stats[m_stats.keys()[0]]) + (num_items / chk_size))
    tasks.append(self.cluster.async_wait_for_stats([slave2], self.bucket, param,
                                                   stat_key, '==', chk_pnt))
    tasks.append(self.cluster.async_wait_for_stats([slave1], self.bucket, param,
                                                   stat_chk_itms, '<', num_items))
    for task in tasks:
        try:
            task.result(60)
        except TimeoutError:
            self.fail("Checkpoints not replicated to secondary slave")
def checkpoint_replication_pause(self):
    """With 3 replicas load data. pause replication to R2. Let checkpoints close
    on Master and R1. Restart replication of R2 and R3, backfill should not be
    seen on R1 and R2."""

    param = 'checkpoint'
    stat_key = 'vb_0:last_closed_checkpoint_id'

    self._set_checkpoint_size(self.servers[:self.num_servers], self.bucket, str(self.checkpoint_size))
    time.sleep(5)
    prev_backfill_timestamp_R1 = self._get_backfill_timestamp(self.replica1, self.replica2)
    prev_backfill_timestamp_R2 = self._get_backfill_timestamp(self.replica2, self.replica3)

    generate_load = BlobGenerator('nosql', 'nosql-', self.value_size, end=self.num_items)
    data_load_thread = Thread(target=self._load_all_buckets,
                              name="load_data",
                              args=(self.master, generate_load, "create", 0, 1, 0, True,
                                    self.checkpoint_size, 5, 180))
    data_load_thread.start()
    self._stop_replication(self.replica2, self.bucket)

    m_stats = StatsCommon.get_stats([self.master], self.bucket, param, stat_key)
    chk_pnt = int(m_stats[m_stats.keys()[0]]) + 2
    tasks = []
    tasks.append(self.cluster.async_wait_for_stats([self.master], self.bucket, param,
                                                   stat_key, '>=', chk_pnt))
    tasks.append(self.cluster.async_wait_for_stats([self.replica1], self.bucket, param,
                                                   stat_key, '>=', chk_pnt))
    for task in tasks:
        try:
            task.result(60)
        except TimeoutError:
            self.fail("Checkpoint not closed")

    data_load_thread.join()
    self._start_replication(self.replica2, self.bucket)

    self._verify_checkpoint_id(param, stat_key, m_stats)
    self._verify_stats_all_buckets(self.servers[:self.num_servers])
    self._verify_backfill_happen(self.replica1, self.replica2, prev_backfill_timestamp_R1)
    self._verify_backfill_happen(self.replica2, self.replica3, prev_backfill_timestamp_R2)
def test_verify_mb8825(self):
    # Setting up replication clusters.
    src_cluster_name, dest_cluster_name = "remote-dest-src", "remote-src-dest"
    self.__setup_replication_clusters(self.src_master, self.dest_master,
                                      src_cluster_name, dest_cluster_name)

    # Step-3 Load 10k items (sets=80, deletes=20) on source cluster.
    self._load_all_buckets(self.src_master, self.gen_create, "create", 0)

    # Step-4 XDCR Source -> Remote
    self._replicate_clusters(self.src_master, dest_cluster_name)
    self.merge_buckets(self.src_master, self.dest_master, bidirection=False)

    # Step-5 Wait for replication to finish 50% at destination node
    expected_items = self.gen_create.end * 0.5
    dest_master_buckets = self._get_cluster_buckets(self.dest_master)
    tasks = []
    for bucket in dest_master_buckets:
        tasks.append(self.cluster.async_wait_for_stats([self.dest_master], bucket, '',
                                                       'curr_items', '>=', expected_items))
    for task in tasks:
        task.result(self._timeout * 5)

    # Perform 20% delete on source cluster.
    tasks = []
    self.gen_delete = BlobGenerator('loadOne', 'loadOne-', self._value_size, start=0,
                                    end=int(self._num_items * float(self._percent_delete) / 100))
    tasks.extend(self._async_load_all_buckets(self.src_master, self.gen_delete, "delete", 0))

    # Step-6 XDCR Remote -> Source
    self._replicate_clusters(self.dest_master, src_cluster_name)
    self.merge_buckets(self.dest_master, self.src_master, bidirection=False)

    # Wait for delete tasks to be finished
    for task in tasks:
        task.result()

    # Step-7 Wait for all the items to be replicated
    # Step-8 Compare the source and destination cluster items - item count, meta data, data content.
    self.sleep(self._timeout * 5)
    self.verify_results(verify_src=True)

    # Verify that no deletions were performed at the source node:
    src_master_buckets = self._get_cluster_buckets(self.src_master)
    for bucket in src_master_buckets:
        src_stat_ep_num_ops_del_meta = StatsCommon.get_stats([self.src_master], bucket, '', 'ep_num_ops_del_meta')
        src_stat_ep_num_ops_set_meta = StatsCommon.get_stats([self.src_master], bucket, '', 'ep_num_ops_set_meta')
        self.assertNotEqual(src_stat_ep_num_ops_set_meta, 0,
                            "Number of set operations [%s] at bucket %s is 0, expected non-zero"
                            % (src_stat_ep_num_ops_set_meta, bucket))
        self.assertNotEqual(src_stat_ep_num_ops_del_meta, 0,
                            "Number of delete operations [%s] at bucket %s is 0, expected non-zero"
                            % (src_stat_ep_num_ops_del_meta, bucket))
        if self.rep_type == "xmem":
            src_stat_ep_num_ops_del_meta_res_fail = StatsCommon.get_stats([self.src_master], bucket, '', 'ep_num_ops_del_meta_res_fail')
            src_stat_ep_num_ops_set_meta_res_fail = StatsCommon.get_stats([self.src_master], bucket, '', 'ep_num_ops_set_meta_res_fail')
            dest_stat_ep_num_ops_del_meta = StatsCommon.get_stats([self.dest_master], bucket, '', 'ep_num_ops_del_meta')
            self.assertNotEqual(src_stat_ep_num_ops_del_meta_res_fail, dest_stat_ep_num_ops_del_meta,
                                "Number of failed delete operations [%s] at bucket %s was expected to differ from %s"
                                % (src_stat_ep_num_ops_del_meta_res_fail, bucket, dest_stat_ep_num_ops_del_meta))
            self.assertTrue(src_stat_ep_num_ops_set_meta_res_fail > 0,
                            "Number of failed set operations [%s] at bucket %s was expected to be greater than 0"
                            % (src_stat_ep_num_ops_set_meta_res_fail, bucket))
        elif self.rep_type == "capi":
            src_stat_ep_num_ops_get_meta = StatsCommon.get_stats([self.src_master], bucket, '', 'ep_num_ops_get_meta')
            self.assertTrue(src_stat_ep_num_ops_get_meta > 0,
                            "Number of get operations [%s] at bucket %s was expected to be greater than 0"
                            % (src_stat_ep_num_ops_get_meta, bucket))
def checkpoint_replication_pause_failover(self):
    """Load N items. Stop replication R3. Load N' more items.
    Failover R2. When replication to R3 is restarted, verify backfill
    doesn't happen on R1."""

    param = 'checkpoint'
    stat_key = 'vb_0:open_checkpoint_id'

    rest = RestConnection(self.master)
    nodes = rest.node_statuses()
    failover_node = None
    for node in nodes:
        if node.id.find(self.replica2.ip) >= 0:
            failover_node = node

    self._set_checkpoint_size(self.servers[:self.num_servers], self.bucket, self.checkpoint_size)
    generate_load_one = BlobGenerator('nosql', 'nosql-', self.value_size, end=self.num_items)
    self._load_all_buckets(self.master, generate_load_one, "create", 0, 1, 0, True,
                           batch_size=self.checkpoint_size, pause_secs=5, timeout_secs=180)
    self._wait_for_stats_all_buckets(self.servers[:self.num_servers])
    prev_backfill_timestamp_R1 = self._get_backfill_timestamp(self.replica1, self.replica2)

    m_stats = StatsCommon.get_stats([self.master], self.bucket, param, stat_key)
    self._stop_replication(self.replica3, self.bucket)

    generate_load_two = BlobGenerator('sqlite', 'sqlite-', self.value_size, end=self.num_items)
    data_load_thread = Thread(target=self._load_all_buckets,
                              name="load_data",
                              args=(self.master, generate_load_two, "create", 0, 1, 0, True,
                                    self.checkpoint_size, 5, 180))
    data_load_thread.start()

    failed_over = rest.fail_over(failover_node.id)
    if not failed_over:
        self.log.info("unable to failover the node the first time. try again in 60 seconds..")
        # try again in 60 seconds
        time.sleep(75)
        failed_over = rest.fail_over(failover_node.id)
    self.assertTrue(failed_over, "unable to failover node {0}".format(self.replica2.ip))
    self.log.info("failed over node : {0}".format(failover_node.id))

    data_load_thread.join()
    self._start_replication(self.replica3, self.bucket)
    self.servers = [self.master, self.replica1, self.replica3]
    self.num_servers = len(self.servers)

    self._verify_checkpoint_id(param, stat_key, m_stats)
    self._verify_stats_all_buckets(self.servers[:self.num_servers])
    self._verify_backfill_happen(self.replica1, self.replica2, prev_backfill_timestamp_R1)
    self.cluster.rebalance([self.master, self.replica1, self.replica2, self.replica3],
                           [], [self.replica2])
    self.cluster.rebalance([self.master, self.replica1, self.replica3],
                           [self.replica2], [])
def checkpoint_deduplication(self):
    """Disable replication of R1. Load N items to master, then mutate some of them.
    Restart replication of R1, only N items should be in stats.
    In this test, we can only load number of items <= checkpoint_size to observe deduplication"""

    param = 'checkpoint'
    stat_key = 'vb_0:num_open_checkpoint_items'
    stat_key_id = 'vb_0:open_checkpoint_id'

    self._set_checkpoint_size(self.servers[:self.num_servers], self.bucket, self.checkpoint_size)
    self._stop_replication(self.replica1, self.bucket)

    generate_load = BlobGenerator('nosql', 'nosql-', self.value_size, end=self.num_items)
    generate_update = BlobGenerator('nosql', 'sql-', self.value_size, end=self.num_items)
    self._load_all_buckets(self.master, generate_load, "create", 0, 1, 0, True,
                           batch_size=self.checkpoint_size, pause_secs=5, timeout_secs=180)
    self._wait_for_stats_all_buckets([self.master, self.replica2, self.replica3])
    m_stats = StatsCommon.get_stats([self.master], self.bucket, param, stat_key_id)

    data_load_thread = Thread(target=self._load_all_buckets,
                              name="load_data",
                              args=(self.master, generate_update, "update", 0, 1, 0, True,
                                    self.checkpoint_size, 5, 180))
    data_load_thread.start()
    self._start_replication(self.replica1, self.bucket)
    data_load_thread.join()

    chk_pnt = int(m_stats[m_stats.keys()[0]])
    timeout = 60 if (self.num_items * .001) < 60 else self.num_items * .001
    time.sleep(timeout)

    tasks = []
    tasks.append(self.cluster.async_wait_for_stats([self.master], self.bucket, param,
                                                   stat_key, '==', self.num_items))
    tasks.append(self.cluster.async_wait_for_stats([self.replica1], self.bucket, param,
                                                   stat_key, '==', self.num_items))
    tasks.append(self.cluster.async_wait_for_stats([self.master], self.bucket, param,
                                                   stat_key_id, '==', chk_pnt))
    tasks.append(self.cluster.async_wait_for_stats([self.replica1], self.bucket, param,
                                                   stat_key_id, '==', chk_pnt))
    for task in tasks:
        try:
            task.result(60)
        except TimeoutError:
            self.fail("Items weren't deduplicated")

    self._verify_stats_all_buckets(self.servers[:self.num_servers])
def checkpoint_create_items(self):
    param = 'checkpoint'
    stat_key = 'vb_0:open_checkpoint_id'
    num_items = 6000

    master = self._get_server_by_state(self.servers[:self.num_servers], self.bucket, ACTIVE)
    self._set_checkpoint_size(self.servers[:self.num_servers], self.bucket, '5000')
    chk_stats = StatsCommon.get_stats(self.servers[:self.num_servers], self.bucket, param, stat_key)

    load_thread = self.generate_load(master, self.bucket, num_items)
    load_thread.join()

    tasks = []
    for server, value in chk_stats.items():
        tasks.append(self.cluster.async_wait_for_stats([server], self.bucket, param,
                                                       stat_key, '>', value))
    for task in tasks:
        try:
            timeout = 30 if (num_items * .001) < 30 else num_items * .001
            task.result(timeout)
        except TimeoutError:
            self.fail("New checkpoint not created")
def _create_access_log(self):
    stats_all_buckets = {}
    for bucket in self.buckets:
        stats_all_buckets[bucket.name] = StatsCommon()

    for bucket in self.buckets:
        for server in self.servers:
            scanner_runs = stats_all_buckets[bucket.name].get_stats([server], bucket, '',
                                                                    'ep_num_access_scanner_runs')[server]
            self.log.info("current access scanner run for %s in bucket %s is %s times"
                          % (server.ip, bucket.name, scanner_runs))
            self.log.info("setting access scanner time %s minutes for %s in bucket %s"
                          % (self.access_log_time, server.ip, bucket.name))
            ClusterOperationHelper.flushctl_set(server, "alog_sleep_time",
                                                self.access_log_time, bucket.name)
            if not self._wait_for_access_run(self.access_log_time, scanner_runs, server,
                                             bucket, stats_all_buckets[bucket.name]):
                self.fail("Not able to create access log within %s minutes" % self.access_log_time)
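# Hedged sketch (not from the original source): _create_access_log above relies on a
# _wait_for_access_run helper that is not shown in this section. One plausible shape,
# assuming it simply polls ep_num_access_scanner_runs until the scanner has run at
# least once more than the count captured before flushctl_set was issued. The
# signature is taken from the call site; the body, buffer and polling interval are
# assumptions.
def _wait_for_access_run(self, access_log_time, old_scanner_runs, server, bucket, bucket_stats):
    # give the scanner its configured sleep time (in minutes) plus a small buffer
    end_time = time.time() + access_log_time * 60 + 60
    while time.time() < end_time:
        curr_runs = bucket_stats.get_stats([server], bucket, '', 'ep_num_access_scanner_runs')[server]
        if int(curr_runs) > int(old_scanner_runs):
            return True
        time.sleep(5)
    return False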
def test_items_append(self):
    self.desired_item_size = self.input.param("desired_item_size", 2048)
    self.append_size = self.input.param("append_size", 1024)
    self.fixed_append_size = self.input.param("fixed_append_size", True)
    self.append_ratio = self.input.param("append_ratio", 0.5)
    self._load_all_buckets(self.master, self.gen_create, "create", 0,
                           batch_size=10000, pause_secs=5, timeout_secs=100)

    for bucket in self.buckets:
        verify_dict = {}
        vkeys, dkeys = bucket.kvs[1].key_set()

        key_count = len(vkeys)
        app_ratio = self.append_ratio * key_count
        selected_keys = []
        i = 0
        for key in vkeys:
            i += 1
            if i >= app_ratio:
                break
            selected_keys.append(key)

        awareness = VBucketAwareMemcached(RestConnection(self.master), bucket.name)
        if self.kv_verify:
            for key in selected_keys:
                value = awareness.memcached(key).get(key)[2]
                verify_dict[key] = value

        self.log.info("Bucket: {0}".format(bucket.name))
        self.log.info("Appending to have items whose initial size was " +
                      "{0} to equal or cross a size of {1}".format(self.value_size, self.desired_item_size))
        self.log.info("Item-appending of {0} items starting ..".format(len(selected_keys) + 1))

        index = 3
        while self.value_size < self.desired_item_size:
            str_len = self.append_size
            if not self.fixed_append_size:
                str_len = int(math.pow(2, index))
            for key in selected_keys:
                random_string = self.random_str_generator(str_len)
                awareness.memcached(key).append(key, random_string)
                if self.kv_verify:
                    verify_dict[key] = verify_dict[key] + random_string
            self.value_size += str_len
            index += 1

        self.log.info("The appending of {0} items ended".format(len(selected_keys) + 1))

        msg = "Bucket:{0}".format(bucket.name)
        self.log.info("VERIFICATION <" + msg + ">: Phase 0 - Check the gap between " +
                      "mem_used by the bucket and total_allocated_bytes")
        stats = StatsCommon()
        mem_used_stats = stats.get_stats(self.servers, bucket, 'memory', 'mem_used')
        total_allocated_bytes_stats = stats.get_stats(self.servers, bucket, 'memory', 'total_allocated_bytes')
        total_fragmentation_bytes_stats = stats.get_stats(self.servers, bucket, 'memory', 'total_fragmentation_bytes')

        for server in self.servers:
            self.log.info("In {0} bucket {1}, total_fragmentation_bytes + total_allocated_bytes = {2}"
                          .format(server.ip, bucket.name,
                                  (int(total_fragmentation_bytes_stats[server]) + int(total_allocated_bytes_stats[server]))))
            self.log.info("In {0} bucket {1}, mem_used = {2}".format(server.ip, bucket.name, mem_used_stats[server]))
            self.log.info("In {0} bucket {1}, the ratio between actual memory used by memcached and mem_used is {2}"
                          .format(server.ip, bucket.name,
                                  float(int(total_fragmentation_bytes_stats[server]) + int(total_allocated_bytes_stats[server])) / float(mem_used_stats[server])))

        self.log.info("VERIFICATION <" + msg + ">: Phase1 - Check if any of the " +
                      "selected keys have value less than the desired value size")
        for key in selected_keys:
            value = awareness.memcached(key).get(key)[2]
            if len(value) < self.desired_item_size:
                self.fail("Failed to append enough to make the value size surpass the " +
                          "desired size for key {0}".format(key))

        if self.kv_verify:
            self.log.info("VERIFICATION <" + msg + ">: Phase2 - Check if the content " +
                          "after the appends matches what's expected")
            for k in verify_dict:
                # look up each key through its own vbucket-aware connection
                # (the original used the stale loop variable `key` here)
                if awareness.memcached(k).get(k)[2] != verify_dict[k]:
                    self.fail("Content at key {0}: not what's expected.".format(k))
        self.log.info("VERIFICATION <" + msg + ">: Successful")
def _load_dgm(self):
    generate_load = BlobGenerator('nosql', 'nosql-', self.value_size, end=self.num_items)
    self._load_all_buckets(self.master, generate_load, "create", 0, 1, 0, True,
                           batch_size=20000, pause_secs=5, timeout_secs=180)
    self.load_gen_list.append(generate_load)

    stats_all_buckets = {}
    for bucket in self.buckets:
        stats_all_buckets[bucket.name] = StatsCommon()

    for bucket in self.buckets:
        threshold_reached = False
        while not threshold_reached:
            for server in self.servers:
                active_resident = stats_all_buckets[bucket.name].get_stats([server], bucket, '',
                                                                           'vb_active_perc_mem_resident')[server]
                if int(active_resident) > self.active_resident_threshold:
                    self.log.info("resident ratio is %s greater than %s for %s in bucket %s. Continue loading to the cluster"
                                  % (active_resident, self.active_resident_threshold, server.ip, bucket.name))
                    random_key = key_generator()
                    generate_load = BlobGenerator(random_key, '%s-' % random_key, self.value_size, end=self.num_items)
                    self._load_all_buckets(self.master, generate_load, "create", 0, 1, 0, True,
                                           batch_size=20000, pause_secs=5, timeout_secs=180)
                    self.load_gen_list.append(generate_load)
                else:
                    threshold_reached = True
                    self.log.info("DGM state achieved for %s in bucket %s!" % (server.ip, bucket.name))
                    break

    if (self.doc_ops is not None):
        if ("update" in self.doc_ops):
            for gen in self.load_gen_list[:int(len(self.load_gen_list) * 0.5)]:
                self._load_all_buckets(self.master, gen, "update", 0, 1, 0, True,
                                       batch_size=20000, pause_secs=5, timeout_secs=180)
        if ("delete" in self.doc_ops):
            for gen in self.load_gen_list[int(len(self.load_gen_list) * 0.5):]:
                self._load_all_buckets(self.master, gen, "delete", 0, 1, 0, True,
                                       batch_size=20000, pause_secs=5, timeout_secs=180)
        if ("expire" in self.doc_ops):
            for gen in self.load_gen_list[:int(len(self.load_gen_list) * 0.8)]:
                self._load_all_buckets(self.master, gen, "update", self.expire_time, 1, 0, True,
                                       batch_size=20000, pause_secs=5, timeout_secs=180)
            time.sleep(self.expire_time * 2)
            for server in self.servers:
                shell = RemoteMachineShellConnection(server)
                for bucket in self.buckets:
                    shell.execute_cbepctl(bucket, "", "set flush_param", "exp_pager_stime", 5)
                shell.disconnect()
            time.sleep(30)
def _warmup_check(self):
    warmed_up = {}
    stats_all_buckets = {}
    for bucket in self.buckets:
        stats_all_buckets[bucket.name] = StatsCommon()
        warmed_up[bucket.name] = {}
        for server in self.servers:
            warmed_up[bucket.name][server] = False

    for bucket in self.buckets:
        for server in self.servers:
            start = time.time()
            warmup_complete = False

            while not warmup_complete:
                try:
                    if stats_all_buckets[bucket.name].get_stats([server], bucket, 'warmup', 'ep_warmup_thread')[server] == "complete":
                        self.log.info("warmup completed for %s in bucket %s" % (server.ip, bucket.name))
                        warmup_complete = True
                    elif stats_all_buckets[bucket.name].get_stats([server], bucket, 'warmup', 'ep_warmup_thread')[server] == "running":
                        self.log.info("warming up is still running for %s in bucket %s....curr_items_tot : %s"
                                      % (server.ip, bucket.name,
                                         stats_all_buckets[bucket.name].get_stats([server], bucket, '', 'curr_items_tot')[server]))
                    warmup_time = int(stats_all_buckets[bucket.name].get_stats([server], bucket, 'warmup', 'ep_warmup_time')[server])
                    if warmup_time is not None:
                        self.log.info("ep_warmup_time is %s for %s in bucket %s" % (warmup_time, server.ip, bucket.name))
                except Exception as e:
                    self.log.error("Could not get warmup_time stats from server %s:%s, exception %s"
                                   % (server.ip, server.port, e))

            start = time.time()
            while time.time() - start < self.timeout and not warmed_up[bucket.name][server]:
                if stats_all_buckets[bucket.name].get_stats([server], bucket, '', 'curr_items_tot')[server] == \
                   self.pre_warmup_stats[bucket.name]["%s:%s" % (server.ip, server.port)]["curr_items_tot"]:
                    if stats_all_buckets[bucket.name].get_stats([server], bucket, '', 'curr_items')[server] == \
                       self.pre_warmup_stats[bucket.name]["%s:%s" % (server.ip, server.port)]["curr_items"]:
                        if self._warmup_check_without_access_log():
                            warmed_up[bucket.name][server] = True
                            self._stats_report(server, bucket, stats_all_buckets[bucket.name])
                    else:
                        self.log.info("curr_items is %s not equal to %s"
                                      % (stats_all_buckets[bucket.name].get_stats([server], bucket, '', 'curr_items')[server],
                                         self.pre_warmup_stats[bucket.name]["%s:%s" % (server.ip, server.port)]["curr_items"]))
                else:
                    self.log.info("curr_items_tot is %s not equal to %s"
                                  % (stats_all_buckets[bucket.name].get_stats([server], bucket, '', 'curr_items_tot')[server],
                                     self.pre_warmup_stats[bucket.name]["%s:%s" % (server.ip, server.port)]["curr_items_tot"]))
                time.sleep(10)

    for bucket in self.buckets:
        for server in self.servers:
            if not warmed_up[bucket.name][server]:
                return False
    return True
def _common_test_body_failed_swap_rebalance(self):
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    initial_servers = self.servers[:num_initial_servers]

    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)

    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    RebalanceHelper.rebalance_in(initial_servers, len(initial_servers) - 1)

    self.log.info("DATA LOAD PHASE")
    loaders = SwapRebalanceBase.start_load_phase(self, master)

    # Wait till load phase is over
    SwapRebalanceBase.stop_load(loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")

    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.swap_orchestrator:
        status, content = ClusterHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        # When swapping all the nodes
        if self.num_swap is len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content

    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))

    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))

    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]

    self.log.info("DATA ACCESS PHASE")
    loaders = SwapRebalanceBase.start_access_phase(self, master)

    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)

    # Rebalance is failed at 20%, 40% and 60% completion
    for i in [1, 2, 3]:
        expected_progress = 20 * i
        self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(expected_progress))
        RestHelper(rest).rebalance_reached(expected_progress)
        bucket = rest.get_buckets()[0].name
        pid = StatsCommon.get_stats([master], bucket, "", "pid")[master]
        command = "os:cmd(\"kill -9 {0} \")".format(pid)
        self.log.info(command)
        killed = rest.diag_eval(command)
        self.log.info("killed {0}:{1}?? {2} ".format(master.ip, master.port, killed))
        BaseTestCase._wait_warmup_completed(self, [master], bucket, wait_time=600)
        time.sleep(5)
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=optNodesIds)

    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}".format(toBeEjectedNodes))

    # Stop loaders
    SwapRebalanceBase.stop_load(loaders)
    self.log.info("DONE DATA ACCESS PHASE")
    #for bucket in rest.get_buckets():
    #    SwapRebalanceBase.verify_data(new_swap_servers[0], bucket_data[bucket.name].get('inserted_keys'),\
    #        bucket.name, self)
    #    RebalanceHelper.wait_for_persistence(master, bucket.name)

    self.log.info("VERIFICATION PHASE")
    SwapRebalanceBase.items_verification(master, self)
def stat(self, key):
    stats = StatsCommon.get_stats([self.master], 'default', "", key)
    val = stats.values()[0]
    if val.isdigit():
        val = int(val)
    return val
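# Illustrative usage of the stat() helper above (a sketch, not original code): poll a
# numeric stat until it reaches an expected value. The helper name and the default
# 30-second budget are assumptions made for the example.
def wait_for_stat(self, key, expected, timeout=30):
    end_time = time.time() + timeout
    while time.time() < end_time:
        if self.stat(key) == expected:
            return True
        time.sleep(1)
    return False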