def build_job_and_wait_completion(self, name):
    """Start the job for the first time.

    The job needs to be started once so that Jenkins reads all of its parameters.
    Jobs are started one by one to avoid a situation where all Jenkins resources
    get allocated at once.
    """
    LOGGER.info("Start first build %s", name)
    job_id = self.jenkins.build_job(name)

    # wait until a worker is found for the queued build
    def check_job_is_started(job_id):
        return self.jenkins.get_queue_item(job_id).get("executable")
    wait_for(check_job_is_started, step=5, text="Job is starting", timeout=60, throw_exc=True, job_id=job_id)

    # wait until the job finishes executing
    def check_job_is_finished(job_name):
        return not self.jenkins.get_build_info(job_name, 1).get("building")
    wait_for(check_job_is_finished, step=5, text="Check job is finished", timeout=120, throw_exc=True,
             job_name=name)

    LOGGER.info("First build finished")
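# All of the snippets in this collection depend on a `wait_for` / `wait.wait_for` polling helper whose
# implementation is not shown here. The following is only a minimal sketch consistent with how it is
# called throughout these examples (it is NOT the real SCT helper, which is built on a retry library):
# `func(**kwargs)` is retried every `step` seconds until it returns a truthy value, `text` is used for
# logging, and on timeout it either raises (`throw_exc=True`) or returns the last result.
import logging
import time


def wait_for_sketch(func, step=1, text=None, timeout=None, throw_exc=False, **kwargs):
    """Minimal sketch: poll func(**kwargs) every `step` seconds until truthy or `timeout` expires."""
    deadline = time.monotonic() + timeout if timeout is not None else None
    result = None
    while True:
        try:
            result = func(**kwargs)
            if result:
                return result
        except Exception as exc:  # pylint: disable=broad-except
            logging.debug("%s: %s", text or getattr(func, '__name__', 'wait_for'), exc)
            result = None
        if deadline is not None and time.monotonic() >= deadline:
            if throw_exc:
                raise TimeoutError(f"Timed out after {timeout}s: {text or 'wait_for'}")
            return result
        time.sleep(step)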
def _retrieve(self, since):
    wait.wait_for(self._file_exists, step=10, timeout=600, throw_exc=True, file_path='/var/log/syslog')
    super()._retrieve(since)
def _wait_for_preinstalled_scylla(node):

    def scylla_ami_setup_done():
        """
        Scylla-ami-setup updates the config files and triggers the start of the scylla-server
        service. The `--stop-services` parameter in the EC2 user-data does not actually stop a
        running scylla-server service; instead, a flag file (/etc/scylla/ami_disabled) is deleted
        on the first start of scylla-server (by scylla_prepare), and that first start fails.
        We use this function to make sure scylla-ami-setup has finished and the first start is
        done (failing as expected, with /etc/scylla/ami_disabled deleted), so it won't affect the
        reconfiguration done by SCT.

        The following two alternative checks may help to understand the flow:

        # alternative 1: scylla-ami-setup has finished:
            result = node.remoter.run('systemctl status scylla-ami-setup', ignore_status=True)
            return 'Started Scylla AMI Setup' in result.stdout

        # alternative 2: the flag file was deleted by scylla_prepare:
            result = node.remoter.run('test -e /etc/scylla/ami_disabled', ignore_status=True)
            return result.exit_status != 0
        """
        # make sure scylla-ami-setup finished, the flag file was deleted, and the first start failed as expected
        result = node.remoter.run('systemctl status scylla-server', ignore_status=True)
        return 'Failed to start Scylla Server.' in result.stdout

    wait.wait_for(scylla_ami_setup_done, step=10, timeout=300)
def docker_scylla():
    # make sure the path to the file is based on the host path, and not on the docker-internal path (i.e. /sct/),
    # since we are going to mount it in a DinD (docker-inside-docker) setup
    base_dir = os.environ.get("_SCT_BASE_DIR", None)
    entryfile_path = Path(base_dir) if base_dir else Path(__file__).parent.parent
    entryfile_path = entryfile_path.joinpath('./docker/scylla-sct/entry.sh')

    alternator_flags = "--alternator-port 8000 --alternator-write-isolation=always"
    docker_version = "scylladb/scylla:4.1.0"
    scylla = RemoteDocker(LocalNode(), image_name=docker_version,
                          command_line=f"--smp 1 --experimental 1 {alternator_flags}",
                          extra_docker_opts=f'-p 8000 -p 9042 --cpus="1" -v {entryfile_path}:/entry.sh --entrypoint'
                                            f' /entry.sh')

    def db_up():
        try:
            # check that the node reports itself as Up/Normal in `nodetool status`
            result_netstat = scylla.run("nodetool status | grep '^UN '", verbose=False, ignore_status=True)
            return result_netstat.exit_status == 0
        except Exception as details:  # pylint: disable=broad-except
            logging.error("Error checking for scylla up normal: %s", details)
            return False

    wait.wait_for(func=db_up, step=1, text='Waiting for DB services to be up', timeout=30, throw_exc=True)
    yield scylla

    scylla.kill()
def __init__(self, domain, hypervisor, parent_cluster, node_prefix='node', node_index=1,
             domain_username='******', domain_password='', base_logdir=None):
    name = '%s-%s' % (node_prefix, node_index)
    self._backing_image = None
    self._domain = domain
    self._hypervisor = hypervisor
    wait.wait_for(self._domain.isActive)
    self._wait_public_ip()
    ssh_login_info = {'hostname': None,
                      'user': domain_username,
                      'password': domain_password}
    super(LibvirtNode, self).__init__(name=name,
                                      parent_cluster=parent_cluster,
                                      ssh_login_info=ssh_login_info,
                                      base_logdir=base_logdir,
                                      node_prefix=node_prefix)
def wait_till_api_become_not_operational(self, kluster, num_requests=10, max_waiting_time=360):
    wait_for(
        self.check_if_api_not_operational,
        timeout=max_waiting_time,
        kluster=kluster,
        num_requests=num_requests,
        throw_exc=False
    )
def wait_for_healthchecks(self):
    wait_for(
        func=self.are_healthchecks_done,
        text="Waiting for the healthchecks to have 'DONE' status",
        step=3,
        timeout=300,
        throw_exc=True,
    )
def wait_until_user_table_exists(self, db_node, table_name: str = 'random', timeout_min: int = 20):
    text = f'Waiting until {table_name} user table exists'
    if table_name.lower() == 'random':
        wait.wait_for(func=lambda: len(self.db_cluster.get_non_system_ks_cf_list(db_node)) > 0,
                      step=60, text=text, timeout=60 * timeout_min, throw_exc=True)
    else:
        wait.wait_for(func=lambda: table_name in self.db_cluster.get_non_system_ks_cf_list(db_node),
                      step=60, text=text, timeout=60 * timeout_min, throw_exc=True)
def wait_till_api_become_stable(self, kluster, num_requests=20, max_waiting_time=1200):
    wait_for(
        self.check_if_api_stable,
        timeout=max_waiting_time,
        kluster=kluster,
        num_requests=num_requests,
        throw_exc=True
    )
def _wait_ssh_up(self, verbose=True, timeout=500):
    text = None
    if verbose:
        text = '%s: Waiting for SSH to be up' % self
    wait.wait_for(func=self._remoter.is_up, step=10, text=text, timeout=timeout, throw_exc=True)
def test_01_simple(self):
    calls = []

    def callback(arg1, arg2):
        calls.append((arg1, arg2))
        raise Exception("error")

    wait_for(callback, timeout=1, step=0.5, arg1=1, arg2=3)
    self.assertEqual(len(calls), 3)
def test_03_false_return(self):
    calls = []

    def callback(arg1, arg2):
        calls.append((arg1, arg2))
        return False

    wait_for(callback, timeout=1, step=0.5, arg1=1, arg2=3)
    self.assertEqual(len(calls), 3)
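# Both tests above expect exactly three invocations: with timeout=1 and step=0.5 the callback is
# presumably attempted at roughly t=0, t=0.5 and t=1.0 before the deadline passes, whether it raises
# (test_01_simple) or returns a falsy value (test_03_false_return). A quick sanity check of that
# arithmetic, assuming the polling behaviour sketched earlier:
import math

TIMEOUT, STEP = 1.0, 0.5
# attempts happen at t = 0, STEP, 2*STEP, ... up to and including the deadline
expected_attempts = math.floor(TIMEOUT / STEP) + 1
assert expected_attempts == 3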
def remote_file(remoter, remote_path, serializer=StringIO.getvalue, deserializer=read_to_stringio,
                sudo=False, preserve_ownership=True, preserve_permissions=True):
    filename = os.path.basename(remote_path)
    local_tempfile = os.path.join(tempfile.mkdtemp(prefix='sct'), filename)

    if preserve_ownership:
        ownership = remoter.sudo(cmd='stat -c "%U:%G" ' + remote_path).stdout.strip()
    if preserve_permissions:
        permissions = remoter.sudo(cmd='stat -c "%a" ' + remote_path).stdout.strip()

    wait.wait_for(remoter.receive_files, step=10,
                  text=f"Waiting for copying `{remote_path}' from {remoter.hostname}",
                  timeout=300, throw_exc=True, src=remote_path, dst=local_tempfile)

    with open(local_tempfile, encoding="utf-8") as fobj:
        parsed_data = deserializer(fobj)

    yield parsed_data

    content = serializer(parsed_data)
    with open(local_tempfile, "w", encoding="utf-8") as fobj:
        fobj.write(content)
    LOGGER.debug("New content of `%s':\n%s", remote_path, content)

    remote_tempfile = remoter.run("mktemp").stdout.strip()
    remote_tempfile_move_cmd = f"mv '{remote_tempfile}' '{remote_path}'"
    wait.wait_for(remoter.send_files, step=10,
                  text=f"Waiting for updating of `{remote_path}' on {remoter.hostname}",
                  timeout=300, throw_exc=True, src=local_tempfile, dst=remote_tempfile)
    if sudo:
        remoter.sudo(remote_tempfile_move_cmd)
    else:
        remoter.run(remote_tempfile_move_cmd)

    if preserve_ownership:
        remoter.sudo(f"chown {ownership} {remote_path}")
    if preserve_permissions:
        remoter.sudo(f"chmod {permissions} {remote_path}")

    os.unlink(local_tempfile)
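# `remote_file` above is a generator, so it is presumably wrapped with `contextlib.contextmanager`
# and used as a context manager: the caller edits the deserialized content in place, and on exit the
# function serializes it, uploads it back (mktemp + mv) and restores ownership/permissions. A
# hypothetical usage sketch -- `some_remoter` and the remote path are placeholders, and the wrapping
# below is only needed if the function is not already decorated in the real code:
from contextlib import contextmanager

edit_remote_file = contextmanager(remote_file)


def append_line_to_remote_file(some_remoter, line, path='/etc/scylla/scylla.yaml'):
    with edit_remote_file(some_remoter, path, sudo=True) as content:
        content.seek(0, 2)           # StringIO from read_to_stringio: jump to the end of the buffer
        content.write(line + '\n')   # new content is uploaded back when the context exits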
def docker_scylla():
    # make sure the path to the file is based on the host path, and not on the docker-internal path (i.e. /sct/),
    # since we are going to mount it in a DinD (docker-inside-docker) setup
    base_dir = os.environ.get("_SCT_BASE_DIR", None)
    entryfile_path = Path(base_dir) if base_dir else Path(__file__).parent.parent
    entryfile_path = entryfile_path.joinpath('./docker/scylla-sct/entry.sh')

    alternator_flags = "--alternator-port 8000 --alternator-write-isolation=always"
    docker_version = "scylladb/scylla-nightly:666.development-0.20201015.8068272b466"
    cluster = LocalScyllaClusterDummy()
    scylla = RemoteDocker(LocalNode("scylla", cluster), image_name=docker_version,
                          command_line=f"--smp 1 --experimental 1 {alternator_flags}",
                          extra_docker_opts=f'-p 8000 -p 9042 --cpus="1" -v {entryfile_path}:/entry.sh --entrypoint'
                                            f' /entry.sh')

    DummyRemoter = collections.namedtuple('DummyRemoter', 'run')
    scylla.remoter = DummyRemoter(run=scylla.run)

    def db_up():
        try:
            return scylla.is_port_used(port=BaseNode.CQL_PORT, service_name="scylla-server")
        except Exception as details:  # pylint: disable=broad-except
            logging.error("Error checking for scylla up normal: %s", details)
            return False

    def db_alternator_up():
        try:
            return scylla.is_port_used(port=8000, service_name="scylla-server")
        except Exception as details:  # pylint: disable=broad-except
            logging.error("Error checking for scylla up normal: %s", details)
            return False

    wait.wait_for(func=db_up, step=1, text='Waiting for DB services to be up', timeout=30, throw_exc=True)
    wait.wait_for(func=db_alternator_up, step=1, text='Waiting for DB services to be up (alternator)',
                  timeout=30, throw_exc=True)

    yield scylla

    scylla.kill()
def test_start_get_events_main_device(self):
    self.assertIsNone(get_events_main_device(_registry=self.events_processes_registry))
    start_events_main_device(_registry=self.events_processes_registry)
    events_device = get_events_main_device(_registry=self.events_processes_registry)
    wait_for(func=events_device.is_alive, timeout=5)
    try:
        self.assertIsInstance(events_device, EventsDevice)
        self.assertEqual(events_device.events_counter, 0)
        self.assertTrue(events_device.is_alive())
        self.assertTrue(events_device.subscribe_address)
    finally:
        events_device.stop(timeout=1)
def create_repair_task(self, dc_list=None,  # pylint: disable=too-many-arguments,arguments-differ
                       keyspace=None, interval=None, num_retries=None, fail_fast=None,
                       intensity=None, parallel=None, name=None) -> RepairTask:
    # NOTE: wait for the 'healthcheck' tasks to be 'DONE' before starting the repair one.
    self.wait_for_healthchecks()
    # TBD: After https://github.com/scylladb/scylla-operator/issues/272 is solved,
    #      replace RepairTask with ScyllaOperatorRepairTask and move the related logic there
    so_task = self._create_scylla_operator_repair_task(dc_list=dc_list, keyspace=keyspace, interval=interval,
                                                       num_retries=num_retries, fail_fast=fail_fast,
                                                       intensity=intensity, parallel=parallel, name=name)
    return wait_for(lambda: self.get_mgr_repair_task(so_task), step=2, timeout=300)
def create_repair_task(self, dc_list=None,  # pylint: disable=too-many-arguments,arguments-differ
                       keyspace=None, interval=None, num_retries=None, fail_fast=None,
                       intensity=None, parallel=None, name=None) -> RepairTask:
    # TBD: After https://github.com/scylladb/scylla-operator/issues/272 is solved,
    #      replace RepairTask with ScyllaOperatorRepairTask and move the related logic there
    so_task = self._create_scylla_operator_repair_task(dc_list=dc_list, keyspace=keyspace, interval=interval,
                                                       num_retries=num_retries, fail_fast=fail_fast,
                                                       intensity=intensity, parallel=parallel, name=name)
    return wait_for(
        lambda: self.get_mgr_repair_task_by_id(self.wait_for_operator_repair_task_status(so_task).mgmt_task_id),
        timeout=120,
    )
def create_backup_task(self, dc_list=None, dry_run=None, interval=None, keyspace_list=None, cron=None,
                       location_list=None, num_retries=None, rate_limit_list=None, retention=None,
                       show_tables=None, snapshot_parallel_list=None, start_date=None,
                       upload_parallel_list=None, legacy_args=None) -> BackupTask:
    so_task = self._create_operator_backup_task(
        dc_list=dc_list,
        interval=interval,
        keyspace_list=keyspace_list,
        cron=cron,
        location_list=location_list,
        num_retries=num_retries,
        rate_limit_list=rate_limit_list,
        retention=retention,
        snapshot_parallel_list=snapshot_parallel_list,
        start_date=start_date,
        upload_parallel_list=upload_parallel_list,
    )
    return wait_for(
        lambda: self.get_mgr_backup_task_by_id(self.wait_for_operator_backup_task_status(so_task).mgmt_task_id),
        timeout=120,
    )
def deploy(self):
    events = KubernetesOps.watch_events(self.core_v1_api, name=self.name, namespace=self.namespace)

    # Delete the old service if one exists from a predecessor node with the same name
    self.delete()

    LOGGER.debug("Trying to create '%s' K8S service in the '%s' namespace", self.name, self.namespace)
    self.core_v1_api.create_namespaced_service(namespace=self.namespace, body=self.service_definition)

    service_hostname = wait_for(self.get_service_hostname, timeout=300, throw_exc=False)
    if not service_hostname:
        error_message = "Failed to create load balancer %s, \n" \
                        "it can happen due to the lack of ip addresses in the subnet, \n" \
                        "or due to reaching the limits of the load balancers quota"
        if events:
            error_message += ', last events:\n' + ('\n'.join([event['object'].message for event in events]))
        raise RuntimeError(error_message, self.name)

    service_ip = self.get_service_ip()
    if not service_ip:
        raise RuntimeError(f"Failed to resolve hostname {service_hostname} for load balancer {self.name}")

    self.service_hostname = service_hostname
    self.service_ip = service_ip
    self.is_deployed = True
def create_backup_task(self, dc_list=None, dry_run=None, interval=None, keyspace_list=None, cron=None,
                       location_list=None, num_retries=None, rate_limit_list=None, retention=None,
                       show_tables=None, snapshot_parallel_list=None, start_date=None,
                       upload_parallel_list=None, legacy_args=None) -> BackupTask:
    # NOTE: wait for the 'healthcheck' tasks to be 'DONE' before starting the backup one.
    self.wait_for_healthchecks()
    so_task = self._create_operator_backup_task(
        dc_list=dc_list,
        interval=interval,
        keyspace_list=keyspace_list,
        cron=cron,
        location_list=location_list,
        num_retries=num_retries,
        rate_limit_list=rate_limit_list,
        retention=retention,
        snapshot_parallel_list=snapshot_parallel_list,
        start_date=start_date,
        upload_parallel_list=upload_parallel_list,
    )
    return wait_for(lambda: self.get_mgr_backup_task(so_task), step=2, timeout=300)
def wait_for_sstable_upgrade(self, node, queue=None):
    all_tables_upgraded = True

    def wait_for_node_to_finish():
        try:
            result = node.remoter.run(
                "sudo find /var/lib/scylla/data/system -type f ! -path '*snapshots*' | xargs -I{} basename {}")
            all_sstable_files = result.stdout.splitlines()

            sstable_version_regex = re.compile(r'(\w+)-\d+-(.+)\.(db|txt|sha1|crc32)')

            sstable_versions = {sstable_version_regex.search(f).group(1)
                                for f in all_sstable_files if sstable_version_regex.search(f)}

            assert len(sstable_versions) == 1, \
                "expected all sstable formats to be the same, found {}".format(sstable_versions)
            assert list(sstable_versions)[0] == self.expected_sstable_format_version, \
                "expected sstable format version to be '{}', found '{}'".format(
                    self.expected_sstable_format_version, list(sstable_versions)[0])
        except Exception as ex:  # pylint: disable=broad-except
            self.log.warning(ex)
            return False
        else:
            return True

    try:
        self.log.info("Start waiting for upgradesstables to finish")
        wait.wait_for(func=wait_for_node_to_finish, step=30, timeout=900, throw_exc=True,
                      text="Waiting until upgradesstables is finished")
    except Exception:  # pylint: disable=broad-except
        all_tables_upgraded = False
    finally:
        if queue:
            queue.put(all_tables_upgraded)
            queue.task_done()
def _run_stress_bench(self, node, loader_idx, stress_cmd, node_list):
    if self.sb_mode == ScyllaBenchModes.WRITE and self.sb_workload == ScyllaBenchWorkloads.TIMESERIES:
        node.parent_cluster.sb_write_timeseries_ts = write_timestamp = time.time_ns()
        LOGGER.debug("Set start-time: %s", write_timestamp)
        stress_cmd = re.sub(r"SET_WRITE_TIMESTAMP", f"{write_timestamp}", stress_cmd)
        LOGGER.debug("Replaced stress command: %s", stress_cmd)

    elif self.sb_mode == ScyllaBenchModes.READ and self.sb_workload == ScyllaBenchWorkloads.TIMESERIES:
        write_timestamp = wait_for(
            lambda: node.parent_cluster.sb_write_timeseries_ts,
            step=5,
            timeout=30,
            text='Waiting for "scylla-bench -workload=timeseries -mode=write" to start, to pick up its timestamp')
        LOGGER.debug("Found write timestamp %s", write_timestamp)
        stress_cmd = re.sub(r"GET_WRITE_TIMESTAMP", f"{write_timestamp}", stress_cmd)
        LOGGER.debug("Replaced stress command: %s", stress_cmd)

    else:
        LOGGER.debug("Scylla bench command: %s", stress_cmd)

    os.makedirs(node.logdir, exist_ok=True)
    log_file_name = os.path.join(node.logdir, f'scylla-bench-l{loader_idx}-{uuid.uuid4()}.log')

    # Select the first seed node to send the scylla-bench cmds to
    ips = node_list[0].cql_ip_address

    with ScyllaBenchStressExporter(instance_name=node.cql_ip_address,
                                   metrics=nemesis_metrics_obj(),
                                   stress_operation=self.sb_mode,
                                   stress_log_filename=log_file_name,
                                   loader_idx=loader_idx), \
            ScyllaBenchStressEventsPublisher(node=node, sb_log_filename=log_file_name) as publisher, \
            ScyllaBenchEvent(node=node, stress_cmd=self.stress_cmd,
                             log_file_name=log_file_name) as scylla_bench_event:
        publisher.event_id = scylla_bench_event.event_id
        result = None
        try:
            result = node.remoter.run(
                cmd="/$HOME/go/bin/{name} -nodes {ips}".format(name=stress_cmd.strip(), ips=ips),
                timeout=self.timeout,
                log_file=log_file_name)
        except Exception as exc:  # pylint: disable=broad-except
            errors_str = format_stress_cmd_error(exc)
            if "truncate: seastar::rpc::timeout_error" in errors_str:
                scylla_bench_event.severity = Severity.ERROR
            elif self.stop_test_on_failure:
                scylla_bench_event.severity = Severity.CRITICAL
            else:
                scylla_bench_event.severity = Severity.ERROR
            scylla_bench_event.add_error([errors_str])

    return node, result
def id(self):
    if self._id is None:
        self._id = wait_for(self.get_cluster_id_by_name, cluster_name=self.cluster_name,
                            timeout=120, text='Waiting for the manager cluster to appear', throw_exc=True)
    return self._id
def destroy_nodegroups(self, status=None):

    def _destroy_attached_nodegroups():
        for node_group_name in self._get_attached_nodegroup_names(status=status):
            try:
                self.eks_client.delete_nodegroup(clusterName=self.short_cluster_name,
                                                 nodegroupName=node_group_name)
            except Exception as exc:  # pylint: disable=broad-except
                LOGGER.debug("Failed to delete nodegroup %s/%s, due to the following error:\n%s",
                             self.short_cluster_name, node_group_name, exc)
        time.sleep(10)
        return wait_for(lambda: not self._get_attached_nodegroup_names(status='DELETING'),
                        text='Waiting till target nodegroups are deleted',
                        step=10, timeout=300, throw_exc=False)

    wait_for(_destroy_attached_nodegroups, timeout=400, throw_exc=False)
def wait_for_resource_absence(db_cluster: ScyllaPodCluster, resource_type: str, resource_name: str,
                              step: int = 2, timeout: int = 60) -> None:
    def resource_is_absent() -> bool:
        all_resources = db_cluster.k8s_cluster.kubectl(
            f"get {resource_type} -o=custom-columns=:.metadata.name",
            namespace=SCYLLA_NAMESPACE,
        ).stdout.split()
        return resource_name not in all_resources

    wait_for(resource_is_absent, step=step, timeout=timeout, throw_exc=True,
             text=f"Waiting for the '{resource_name}' {resource_type} to be deleted")
def test_03_return_value(self):
    calls = []

    def callback(arg1, arg2):
        calls.append((arg1, arg2))
        return 'what ever'

    self.assertEqual(wait_for(callback, timeout=2, step=0.5, arg1=1, arg2=3), 'what ever')
    self.assertEqual(len(calls), 1)
def wait_for_operator_backup_task_status(self, so_backup_task: ScyllaOperatorBackupTask,
                                         timeout=120, step=1) -> ScyllaOperatorBackupTaskStatus:
    return wait_for(
        func=self.get_operator_backup_task_status,
        step=step,
        text=f"Waiting until operator backup task '{so_backup_task.name}' gets its status",
        timeout=timeout,
        task_name=so_backup_task.name,
        throw_exc=True,
    )
def create_eks_cluster(self, wait_till_functional=True):
    self.eks_client.create_cluster(
        name=self.short_cluster_name,
        version=self.eks_cluster_version,
        roleArn=self.ec2_role_arn,
        resourcesVpcConfig={
            'securityGroupIds': self.ec2_security_group_ids[0],
            'subnetIds': self.ec2_subnet_ids,
            'endpointPublicAccess': True,
            'endpointPrivateAccess': True,
            'publicAccessCidrs': [
                '0.0.0.0/0',
            ]
        },
        kubernetesNetworkConfig={
            'serviceIpv4Cidr': self.service_ipv4_cidr
        },
        logging={
            'clusterLogging': [
                {
                    'types': ['api', 'audit', 'authenticator', 'controllerManager', 'scheduler'],
                    'enabled': True
                },
            ]
        },
        tags=self.tags,
    )
    self.eks_client.create_addon(
        clusterName=self.short_cluster_name,
        addonName='vpc-cni',
        addonVersion=self.vpc_cni_version
    )
    if wait_till_functional:
        wait_for(lambda: self.cluster_status == 'ACTIVE',
                 step=60, throw_exc=True, timeout=1200,
                 text=f'Waiting till EKS cluster {self.short_cluster_name} becomes operational')
def wait_for_operator_repair_task_status(self, so_repair_task: ScyllaOperatorRepairTask,
                                         timeout=120, step=1) -> ScyllaOperatorRepairTaskStatus:
    return wait_for(
        func=self.get_operator_repair_task_status,
        step=step,
        text=f"Waiting until operator repair task '{so_repair_task.name}' gets its status",
        timeout=timeout,
        task_name=so_repair_task.name,
        throw_exc=True,
    )
def delete(self):
    try:
        LOGGER.debug("Trying to delete '%s' K8S service in the '%s' namespace", self.name, self.namespace)
        self.core_v1_api.delete_namespaced_service(name=self.name, namespace=self.namespace, async_req=False)
    except k8s.client.exceptions.ApiException as exc:
        LOGGER.debug("Failed to delete '%s' K8S service in the '%s' namespace, error:\n%s",
                     self.name, self.namespace, str(exc))
        if getattr(exc, 'body', None) is not None:
            if isinstance(exc.body, str):
                exc.body = json.loads(exc.body)
            if exc.body.get("reason") == "NotFound" or exc.body.get("code") == 404:
                LOGGER.debug("Could not find '%s' K8S service in the '%s' namespace while trying "
                             "to delete it. Ignoring.", self.name, self.namespace)
                return
        raise
    wait_for(self.is_deleted, step=4, timeout=180, throw_exc=True)