def _find_cluster_with_pagination(cmd_args, cluster_name):
    result = run_pcluster_command(cmd_args)
    response = json.loads(result.stdout)
    found_cluster = _find_cluster_in_list(cluster_name, response["clusters"])
    while response.get("nextToken") and found_cluster is None:
        cmd_args_with_next_token = cmd_args + ["--next-token", response["nextToken"]]
        result = run_pcluster_command(cmd_args_with_next_token)
        response = json.loads(result.stdout)
        found_cluster = _find_cluster_in_list(cluster_name, response["clusters"])
    return found_cluster

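# Hypothetical usage sketch (not part of the module): the helper expects a complete
# `pcluster list-clusters` invocation as `cmd_args` and transparently follows the
# paginated responses via `--next-token` until the target cluster is found.
#
#     cluster = _find_cluster_with_pagination(
#         ["pcluster", "list-clusters", "--region", "us-east-1"], "integ-tests-cluster"
#     )
#     if cluster is None:
#         logging.info("Cluster not found in any page of the list-clusters output")
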
def delete(self, delete_logs=False):
    """Delete this cluster."""
    if self.has_been_deleted:
        return
    cmd_args = ["pcluster", "delete-cluster", "--cluster-name", self.name, "--wait"]
    if delete_logs:
        logging.warning("Updating stack %s to delete CloudWatch logs on stack deletion.", self.name)
        try:
            dict_add_nested_key(self.config, "Delete", ("Monitoring", "Logs", "CloudWatch", "DeletionPolicy"))
            with open(self.config_file, "w", encoding="utf-8") as conf_file:
                yaml.dump(self.config, conf_file)
            self.update(self.config_file, force_update="true")
        except subprocess.CalledProcessError as e:
            logging.error(
                "Failed updating cluster to delete log with error:\n%s\nand output:\n%s", e.stderr, e.stdout
            )
            raise
    else:
        logging.warning("CloudWatch logs for cluster %s are preserved due to failure.", self.name)
    try:
        self.cfn_stack_arn  # Cache cfn_stack_arn attribute before stack deletion
        result = run_pcluster_command(cmd_args, log_error=False)
        if "DELETE_FAILED" in result.stdout:
            error = "Cluster deletion failed for {0} with output: {1}".format(self.name, result.stdout)
            logging.error(error)
            raise Exception(error)
        logging.info("Cluster {0} deleted successfully".format(self.name))
    except subprocess.CalledProcessError as e:
        if re.search(f"Stack with id {self.name} does not exist", e.stdout):
            pass
        else:
            logging.error("Failed destroying cluster with error:\n%s\nand output:\n%s", e.stderr, e.stdout)
            raise
    self.has_been_deleted = True

def list_images(**kwargs):
    """List images."""
    command = ["pcluster", "list-images"]
    for k, val in kwargs.items():
        command.extend([f"--{kebab_case(k)}", str(val)])
    result = run_pcluster_command(command)
    response = json.loads(result.stdout)
    return response

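# `kebab_case` is a helper from the surrounding test utilities and is not shown in
# this excerpt. A minimal sketch of the assumed behavior, mapping a snake_case
# kwarg name onto the matching pcluster CLI option name:
#
#     def kebab_case(snake_cased_string):
#         return snake_cased_string.replace("_", "-")
#
# so list_images(image_status="AVAILABLE", region="us-east-1") would run
# `pcluster list-images --image-status AVAILABLE --region us-east-1`.
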
def create_cluster(self, cluster, log_error=True, raise_on_error=True, **kwargs):
    """
    Create a cluster with a given config.

    :param cluster: cluster to create.
    :param log_error: log error when error occurs. This can be set to False when error is expected
    :param raise_on_error: raise exception if cluster creation fails
    :param kwargs: additional parameters to be passed to the pcluster command
    """
    name = cluster.name
    config = cluster.config_file
    if name in self.__created_clusters:
        raise ValueError("Cluster {0} already exists".format(name))

    # create the cluster
    logging.info("Creating cluster {0} with config {1}".format(name, config))
    command = [
        "pcluster",
        "create-cluster",
        "--rollback-on-failure",
        "false",
        "--cluster-configuration",
        config,
        "--cluster-name",
        name,
    ]
    wait = kwargs.pop("wait", True)
    if wait:
        command.append("--wait")
    for k, val in kwargs.items():
        if isinstance(val, (list, tuple)):
            command.extend([f"--{kebab_case(k)}"] + list(map(str, val)))
        else:
            command.extend([f"--{kebab_case(k)}", str(val)])
    try:
        result = run_pcluster_command(command, timeout=7200, raise_on_error=raise_on_error, log_error=log_error)
        logging.info("create-cluster response: %s", result.stdout)
        response = json.loads(result.stdout)
        if wait:
            if response.get("cloudFormationStackStatus") != "CREATE_COMPLETE":
                error = f"Cluster creation failed for {name}"
                logging.error(error)
                if raise_on_error:
                    raise Exception(error)
            else:
                logging.info("Cluster {0} created successfully".format(name))
                cluster.mark_as_created()
        else:
            logging.info("Cluster {0} creation started successfully".format(name))
        return response
    finally:
        # Only add cluster to created_clusters if stack creation started
        try:
            if cluster.cfn_stack_arn:
                self.__created_clusters[name] = cluster
        except Exception:
            pass

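# Hypothetical call sketch (`factory` is an assumed name for the instance owning
# this method): extra kwargs flow straight through to the CLI, so
#
#     factory.create_cluster(cluster, wait=False, region="us-east-1")
#
# runs `pcluster create-cluster --rollback-on-failure false --cluster-configuration
# <config> --cluster-name <name> --region us-east-1` and returns the parsed JSON
# response without waiting for CREATE_COMPLETE.
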
def list_log_streams(self):
    """Get image build log streams."""
    logging.info("Get image %s build log streams.", self.image_id)
    command = ["pcluster", "list-image-log-streams", "--region", self.region, "--image-id", self.image_id]
    result = run_pcluster_command(command).stdout
    response = json.loads(result)
    return response

def describe_cluster(self):
    """Run pcluster describe-cluster and return the result."""
    cmd_args = ["pcluster", "describe-cluster", "--cluster-name", self.name]
    try:
        result = run_pcluster_command(cmd_args, log_error=False)
        response = json.loads(result.stdout)
        logging.info("Get cluster {0} status successfully".format(self.name))
        return response
    except subprocess.CalledProcessError as e:
        logging.error("Failed when getting cluster status with error:\n%s\nand output:\n%s", e.stderr, e.stdout)
        raise

def export_logs(self, **args):
    """Export the logs from the image build process."""
    logging.info("Export image %s build logs.", self.image_id)
    command = ["pcluster", "export-image-logs", "--region", self.region, "--image-id", self.image_id]
    for k, val in args.items():
        command.extend([f"--{kebab_case(k)}", str(val)])
    result = run_pcluster_command(command)
    return json.loads(result.stdout)

def list_log_streams(self):
    """Run pcluster list-cluster-log-streams and return the result."""
    cmd_args = ["pcluster", "list-cluster-log-streams", "--cluster-name", self.name]
    try:
        result = run_pcluster_command(cmd_args, log_error=False)
        response = json.loads(result.stdout)
        logging.info("Cluster's logs listed successfully")
        return response
    except subprocess.CalledProcessError as e:
        logging.error("Failed listing cluster's logs with error:\n%s\nand output:\n%s", e.stderr, e.stdout)
        raise

def get_stack_events(self, **args):
    """Get image build stack events."""
    logging.info("Get image %s build stack events.", self.image_id)
    command = ["pcluster", "get-image-stack-events", "--region", self.region, "--image-id", self.image_id]
    for k, val in args.items():
        command.extend([f"--{kebab_case(k)}", str(val)])
    result = run_pcluster_command(command).stdout
    response = json.loads(result)
    return response

def describe_compute_fleet(self):
    """Run pcluster describe-compute-fleet and return the result."""
    cmd_args = ["pcluster", "describe-compute-fleet", "--cluster-name", self.name]
    try:
        result = run_pcluster_command(cmd_args, log_error=False)
        response = json.loads(result.stdout)
        logging.info("Describe cluster %s compute fleet successfully", self.name)
        return response
    except subprocess.CalledProcessError as e:
        logging.error(
            "Failed when getting cluster compute fleet with error:\n%s\nand output:\n%s", e.stderr, e.stdout
        )
        raise

def get_stack_events(self, **args):
    """Run pcluster get-cluster-stack-events and return the result."""
    cmd_args = ["pcluster", "get-cluster-stack-events", "--cluster-name", self.name]
    for k, val in args.items():
        cmd_args.extend([f"--{kebab_case(k)}", str(val)])
    try:
        result = run_pcluster_command(cmd_args, log_error=False)
        response = json.loads(result.stdout)
        logging.info("Stack events retrieved successfully")
        return response
    except subprocess.CalledProcessError as e:
        logging.error("Failed retrieving stack events with error:\n%s\nand output:\n%s", e.stderr, e.stdout)
        raise

def stop(self):
    """Stop the compute fleet via pcluster update-compute-fleet and return the result."""
    cmd_args = ["pcluster", "update-compute-fleet", "--cluster-name", self.name, "--status"]
    scheduler = self.config["Scheduling"]["Scheduler"]
    if scheduler == "slurm":
        cmd_args.append("STOP_REQUESTED")
    elif scheduler == "awsbatch":
        cmd_args.append("DISABLED")
    try:
        result = run_pcluster_command(cmd_args, log_error=False)
        logging.info("Cluster {0} stopped successfully".format(self.name))
        return result.stdout
    except subprocess.CalledProcessError as e:
        logging.error("Failed stopping cluster with error:\n%s\nand output:\n%s", e.stderr, e.stdout)
        raise

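# The resulting CLI call for a Slurm cluster is, e.g.:
#
#     pcluster update-compute-fleet --cluster-name <name> --status STOP_REQUESTED
#
# STOP_REQUESTED/START_REQUESTED are the Slurm fleet statuses, while AWS Batch
# fleets use DISABLED/ENABLED. With any other scheduler the command above would
# be sent without a status value and fail, so callers are assumed to only use
# the supported schedulers.
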
def export_logs(self, bucket, output_file=None, bucket_prefix=None):
    """Run pcluster export-cluster-logs and return the result."""
    cmd_args = ["pcluster", "export-cluster-logs", "--cluster-name", self.name, "--bucket", bucket]
    if output_file:
        cmd_args += ["--output-file", output_file]
    if bucket_prefix:
        cmd_args += ["--bucket-prefix", bucket_prefix]
    try:
        result = run_pcluster_command(cmd_args, log_error=False)
        response = json.loads(result.stdout)
        logging.info("Cluster's logs exported successfully")
        return response
    except subprocess.CalledProcessError as e:
        logging.error("Failed exporting cluster's logs with error:\n%s\nand output:\n%s", e.stderr, e.stdout)
        raise

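# Hypothetical usage sketch: the optional arguments map directly onto CLI flags,
# so exporting to a custom S3 prefix and a local archive would look like
#
#     cluster.export_logs(
#         bucket="integ-tests-logs", bucket_prefix="dcv/", output_file="/tmp/logs.tar.gz"
#     )
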
def delete(self, force=False):
    """Delete image."""
    command = ["pcluster", "delete-image", "--image-id", self.image_id, "--region", self.region]
    if force:
        command.extend(["--force", "true"])
    result = run_pcluster_command(command)
    response = json.loads(result.stdout)
    if "message" in response and response["message"].startswith("No image or stack associated"):
        logging.error("Delete on non-existing image: %s", self.image_id)
    else:
        self._update_image_info(response)
    return response

def describe(self, log_on_error=False):
    """Describe image."""
    logging.info("Describe image %s in region %s.", self.image_id, self.region)
    command = ["pcluster", "describe-image", "--image-id", self.image_id, "--region", self.region]
    result = run_pcluster_command(command).stdout
    response = json.loads(result)
    if "message" in response and response["message"].startswith("No image or stack associated"):
        if log_on_error:
            logging.error("Describe on non-existing image: %s", self.image_id)
    else:
        self._update_image_info(response)
    return response

def get_log_events(self, log_stream_name, **args):
    """Get image build log events."""
    logging.info("Get image %s build log.", self.image_id)
    command = [
        "pcluster",
        "get-image-log-events",
        "--image-id",
        self.image_id,
        "--region",
        self.region,
        "--log-stream-name",
        log_stream_name,
    ]
    for k, val in args.items():
        if val is not None:
            command.extend([f"--{kebab_case(k)}", str(val)])
    result = run_pcluster_command(command).stdout
    response = json.loads(result)
    return response

def build(self, **kwargs):
    """Build image."""
    raise_on_error = kwargs.pop("raise_on_error", True)
    log_error = kwargs.pop("log_error", True)
    command = [
        "pcluster",
        "build-image",
        "--image-id",
        self.image_id,
        "--region",
        self.region,
        "--image-configuration",
        self.config_file,
    ]
    for k, val in kwargs.items():
        command.extend([f"--{kebab_case(k)}", str(val)])
    result = run_pcluster_command(command, raise_on_error=raise_on_error, log_error=log_error)
    response = json.loads(result.stdout)
    try:
        if response["image"]["imageBuildStatus"] == "BUILD_IN_PROGRESS":
            self._update_image_info(response["image"])
        elif log_error:
            logging.error("Error building image: %s", response)
    except KeyError:
        if log_error:
            logging.error("Error building image: %s", result.stdout)
        if raise_on_error:
            raise
    if "configurationValidationErrors" in response:
        self.configuration_errors = response["configurationValidationErrors"]
    if "message" in response:
        self.message = response["message"]
    return response["image"] if "image" in response else response

def describe_cluster_instances(self, node_type=None, queue_name=None):
    """Run pcluster describe-cluster-instances and return the result."""
    cmd_args = ["pcluster", "describe-cluster-instances", "--cluster-name", self.name]
    if node_type:
        if node_type == "HeadNode":
            node_type = "HEAD"
        elif node_type == "Compute":
            node_type = "COMPUTE"
        else:
            raise ValueError(f"Unexpected node type: {node_type}")
        cmd_args.extend(["--node-type", node_type])
    if queue_name:
        cmd_args.extend(["--queue-name", queue_name])
    try:
        result = run_pcluster_command(cmd_args, log_error=False)
        response = json.loads(result.stdout)
        logging.info("Get cluster {0} instances successfully".format(self.name))
        return response["instances"]
    except subprocess.CalledProcessError as e:
        logging.error("Failed when getting cluster instances with error:\n%s\nand output:\n%s", e.stderr, e.stdout)
        raise

def update(self, config_file, raise_on_error=True, log_error=True, **kwargs):
    """
    Update a cluster with an already updated config.

    :param raise_on_error: raise exception if cluster update fails
    :param log_error: log error when error occurs. This can be set to False when error is expected
    :param kwargs: additional args that get passed to the pcluster command
    """
    # update the cluster
    logging.info("Updating cluster %s with config %s", self.name, config_file)
    command = ["pcluster", "update-cluster", "--cluster-configuration", config_file, "--cluster-name", self.name]
    if kwargs.pop("wait", True):
        command.append("--wait")
    for k, val in kwargs.items():
        if isinstance(val, (list, tuple)):
            command.extend([f"--{kebab_case(k)}"] + list(map(str, val)))
        else:
            command.extend([f"--{kebab_case(k)}", str(val)])
    result = run_pcluster_command(command, raise_on_error=raise_on_error, log_error=log_error)
    logging.info("update-cluster response: %s", result.stdout)
    response = json.loads(result.stdout)
    if response.get("cloudFormationStackStatus") != "UPDATE_COMPLETE":
        error = f"Cluster update failed for {self.name}"
        if log_error:
            logging.error(error)
        if raise_on_error:
            raise Exception(error)
    logging.info("Cluster %s updated successfully", self.name)
    # Only update config file attribute if update is successful
    self.config_file = config_file
    with open(self.config_file, encoding="utf-8") as conf_file:
        self.config = yaml.safe_load(conf_file)
    # reset cached properties
    self._reset_cached_properties()
    return response

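# Usage sketch, mirroring how delete() drives this method: passing
# force_update="true" is kebab-cased into the corresponding CLI flag, i.e.
#
#     cluster.update(cluster.config_file, force_update="true")
#
# runs `pcluster update-cluster --cluster-configuration <config> --cluster-name
# <name> --wait --force-update true`.
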
def _test_dcv_configuration(
    dcv_port, access_from, region, instance, os, scheduler, pcluster_config_reader, clusters_factory, test_datadir
):
    dcv_authenticator_port = dcv_port + 1
    cluster_config = pcluster_config_reader(dcv_port=str(dcv_port), access_from=access_from)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    # check configuration parameters
    check_head_node_security_group(region, cluster, dcv_port, expected_cidr=access_from)

    # dcv connect show url
    env = operating_system.environ.copy()
    env["AWS_DEFAULT_REGION"] = region
    # add ssh key to jenkins user known hosts file to avoid ssh keychecking prompt
    host_keys_file = operating_system.path.expanduser("~/.ssh/known_hosts")
    add_keys_to_known_hosts(cluster.head_node_ip, host_keys_file)
    try:
        result = run_pcluster_command(
            ["pcluster", "dcv-connect", "--cluster-name", cluster.name, "--show-url"], env=env
        )
    finally:
        # remove ssh key from jenkins user known hosts file
        remove_keys_from_known_hosts(cluster.head_node_ip, host_keys_file, env=env)
    assert_that(result.stdout).matches(
        r"Please use the following one-time URL in your browser within 30 seconds:\n"
        r"https:\/\/(\b(?:\d{1,3}\.){3}\d{1,3}\b):" + str(dcv_port) + r"\?authToken=(.*)"
    )

    # check error cases
    _check_auth_ko(
        remote_command_executor,
        dcv_authenticator_port,
        "-d action=requestToken -d authUser=centos -d sessionID=invalidSessionId",
        "The given session does not exists",
    )
    _check_auth_ko(
        remote_command_executor, dcv_authenticator_port, "-d action=test", "The action specified 'test' is not valid"
    )
    _check_auth_ko(
        remote_command_executor, dcv_authenticator_port, "-d action=requestToken -d authUser=centos", "Wrong parameters"
    )
    _check_auth_ko(
        remote_command_executor, dcv_authenticator_port, "-d action=sessionToken -d authUser=centos", "Wrong parameters"
    )

    shared_dir = f"/home/{get_username_for_os(os)}"
    # launch a session and verify the authenticator works
    command_execution = remote_command_executor.run_remote_command(f"{DCV_CONNECT_SCRIPT} {shared_dir}")
    dcv_parameters = re.search(
        r"PclusterDcvServerPort=([\d]+) PclusterDcvSessionId=([\w]+) PclusterDcvSessionToken=([\w-]+)",
        command_execution.stdout,
    )
    if dcv_parameters:
        dcv_session_id = dcv_parameters.group(2)
        dcv_session_token = dcv_parameters.group(3)
        _check_auth_ok(remote_command_executor, dcv_authenticator_port, dcv_session_id, dcv_session_token, os)
    else:
        print(
            "Command '{0} {1}' fails, output: {2}, error: {3}".format(
                DCV_CONNECT_SCRIPT, shared_dir, command_execution.stdout, command_execution.stderr
            )
        )
        raise AssertionError

    # check shared dir configuration
    _check_shared_dir(remote_command_executor, shared_dir)

    # Ensure no system programs crashed
    _check_no_crashes(remote_command_executor, test_datadir)

    # Check that logs are stored in CloudWatch as expected
    FeatureSpecificCloudWatchLoggingTestRunner.run_tests_for_feature(
        cluster, scheduler, os, "dcv_enabled", region, shared_dir
    )