Пример #1
0
def _find_cluster_with_pagination(cmd_args, cluster_name):
    """Search for *cluster_name* across all pages of a pcluster list command.

    Follows the ``nextToken`` pagination cursor until the cluster is found or
    there are no more pages; returns the matching cluster dict or None.
    """
    next_args = list(cmd_args)
    while True:
        output = run_pcluster_command(next_args)
        page = json.loads(output.stdout)
        match = _find_cluster_in_list(cluster_name, page["clusters"])
        token = page.get("nextToken")
        if match is not None or not token:
            return match
        next_args = cmd_args + ["--next-token", token]
 def delete(self, delete_logs=False):
     """Delete this cluster.

     :param delete_logs: when True, first update the stack so the CloudWatch
         log resources are deleted together with the stack (presumably by
         setting Monitoring/Logs/CloudWatch/DeletionPolicy to "Delete" —
         confirm against dict_add_nested_key's semantics).
     """
     # Idempotent: a second call is a no-op.
     if self.has_been_deleted:
         return
     cmd_args = ["pcluster", "delete-cluster", "--cluster-name", self.name, "--wait"]
     if delete_logs:
         logging.warning("Updating stack %s to delete CloudWatch logs on stack deletion.", self.name)
         try:
             # Write the DeletionPolicy into the config file and push it via
             # update-cluster before issuing the delete.
             dict_add_nested_key(self.config, "Delete", ("Monitoring", "Logs", "CloudWatch", "DeletionPolicy"))
             with open(self.config_file, "w", encoding="utf-8") as conf_file:
                 yaml.dump(self.config, conf_file)
             self.update(self.config_file, force_update="true")
         except subprocess.CalledProcessError as e:
             logging.error(
                 "Failed updating cluster to delete log with error:\n%s\nand output:\n%s", e.stderr, e.stdout
             )
             raise
     else:
         logging.warning("CloudWatch logs for cluster %s are preserved due to failure.", self.name)
     try:
         self.cfn_stack_arn  # Cache cfn_stack_arn attribute before stack deletion
         result = run_pcluster_command(cmd_args, log_error=False)
         # The command can complete while the stack ends in DELETE_FAILED, so
         # the output is inspected as well — NOTE(review): presumably the CLI
         # still exits 0 in that case; confirm.
         if "DELETE_FAILED" in result.stdout:
             error = "Cluster deletion failed for {0} with output: {1}".format(self.name, result.stdout)
             logging.error(error)
             raise Exception(error)
         logging.info("Cluster {0} deleted successfully".format(self.name))
     except subprocess.CalledProcessError as e:
         # A missing stack means the cluster is already gone; treat as success.
         if re.search(f"Stack with id {self.name} does not exist", e.stdout):
             pass
         else:
             logging.error("Failed destroying cluster with error:\n%s\nand output:\n%s", e.stderr, e.stdout)
             raise
     self.has_been_deleted = True
Пример #3
0
 def list_images(**kwargs):
     """Run ``pcluster list-images`` and return the parsed JSON response.

     :param kwargs: extra CLI flags; keys are kebab-cased, values stringified.
     """
     cmd = ["pcluster", "list-images"]
     for name, value in kwargs.items():
         cmd += [f"--{kebab_case(name)}", str(value)]
     return json.loads(run_pcluster_command(cmd).stdout)
    def create_cluster(self, cluster, log_error=True, raise_on_error=True, **kwargs):
        """
        Create a cluster with a given config.

        :param cluster: cluster to create.
        :param log_error: log error when error occurs. This can be set to False when error is expected
        :param raise_on_error: raise exception if cluster creation fails
        :param kwargs: additional parameters to be passed to the pcluster command
        """
        name = cluster.name
        config = cluster.config_file
        if name in self.__created_clusters:
            raise ValueError("Cluster {0} already exists".format(name))

        # Build and run the create-cluster command.
        logging.info("Creating cluster {0} with config {1}".format(name, config))
        wait = kwargs.pop("wait", True)
        command = [
            "pcluster",
            "create-cluster",
            "--rollback-on-failure",
            "false",
            "--cluster-configuration",
            config,
            "--cluster-name",
            name,
        ]
        if wait:
            command.append("--wait")
        for key, value in kwargs.items():
            flag = f"--{kebab_case(key)}"
            if isinstance(value, (list, tuple)):
                command.append(flag)
                command.extend(str(item) for item in value)
            else:
                command.extend([flag, str(value)])
        try:
            result = run_pcluster_command(command, timeout=7200, raise_on_error=raise_on_error, log_error=log_error)

            logging.info("create-cluster response: %s", result.stdout)
            response = json.loads(result.stdout)
            if not wait:
                logging.info("Cluster {0} creation started successfully".format(name))
            elif response.get("cloudFormationStackStatus") == "CREATE_COMPLETE":
                logging.info("Cluster {0} created successfully".format(name))
                cluster.mark_as_created()
            else:
                error = f"Cluster creation failed for {name}"
                logging.error(error)
                if raise_on_error:
                    raise Exception(error)
            return response
        finally:
            # Only add cluster to created_clusters if stack creation started
            # (cfn_stack_arn raising means no stack exists — best effort).
            try:
                if cluster.cfn_stack_arn:
                    self.__created_clusters[name] = cluster
            except Exception:
                pass
Пример #5
0
 def list_log_streams(self):
     """Return the parsed ``pcluster list-image-log-streams`` response for this image."""
     logging.info("Get image %s build log streams.", self.image_id)
     cmd = ["pcluster", "list-image-log-streams", "--region", self.region, "--image-id", self.image_id]
     return json.loads(run_pcluster_command(cmd).stdout)
 def describe_cluster(self):
     """Run pcluster describe-cluster and return the parsed response.

     :raises subprocess.CalledProcessError: if the command fails (logged first).
     """
     command = ["pcluster", "describe-cluster", "--cluster-name", self.name]
     try:
         output = run_pcluster_command(command, log_error=False)
         parsed = json.loads(output.stdout)
         logging.info("Get cluster {0} status successfully".format(self.name))
         return parsed
     except subprocess.CalledProcessError as e:
         logging.error("Failed when getting cluster status with error:\n%s\nand output:\n%s", e.stderr, e.stdout)
         raise
Пример #7
0
 def export_logs(self, **args):
     """Export the image build logs via ``pcluster export-image-logs``.

     :param args: extra CLI flags; keys are kebab-cased, values stringified.
     """
     logging.info("Get image %s build log.", self.image_id)
     cmd = ["pcluster", "export-image-logs", "--region", self.region, "--image-id", self.image_id]
     for name, value in args.items():
         cmd += [f"--{kebab_case(name)}", str(value)]
     return json.loads(run_pcluster_command(cmd).stdout)
 def list_log_streams(self):
     """Run pcluster list-cluster-log-streams and return the parsed response."""
     command = ["pcluster", "list-cluster-log-streams", "--cluster-name", self.name]
     try:
         output = run_pcluster_command(command, log_error=False).stdout
         parsed = json.loads(output)
         logging.info("Cluster's logs listed successfully")
         return parsed
     except subprocess.CalledProcessError as e:
         logging.error("Failed listing cluster's logs with error:\n%s\nand output:\n%s", e.stderr, e.stdout)
         raise
Пример #9
0
 def get_stack_events(self, **args):
     """Get image build stack events.

     Runs ``pcluster get-image-stack-events`` for this image and returns the
     parsed JSON response.

     :param args: extra CLI flags; keys are kebab-cased, values stringified.
     """
     # Fixed log message: it previously said "build log" (copy-paste from the
     # log-fetching methods) although this method fetches stack events.
     logging.info("Get image %s build stack events.", self.image_id)
     command = [
         "pcluster", "get-image-stack-events", "--region", self.region,
         "--image-id", self.image_id
     ]
     for k, val in args.items():
         command.extend([f"--{kebab_case(k)}", str(val)])
     result = run_pcluster_command(command).stdout
     return json.loads(result)
 def describe_compute_fleet(self):
     """Run pcluster describe-compute-fleet and return the parsed response."""
     command = ["pcluster", "describe-compute-fleet", "--cluster-name", self.name]
     try:
         output = run_pcluster_command(command, log_error=False).stdout
         parsed = json.loads(output)
         logging.info("Describe cluster %s compute fleet successfully", self.name)
         return parsed
     except subprocess.CalledProcessError as e:
         logging.error(
             "Failed when getting cluster compute fleet with error:\n%s\nand output:\n%s", e.stderr, e.stdout
         )
         raise
    def get_stack_events(self, **args):
        """Run pcluster get-cluster-stack-events and return the result.

        (Docstring fixed: it previously named ``get-cluster-log-events``,
        which is a different command.)

        :param args: extra CLI flags; keys are kebab-cased, values stringified.
        """
        cmd_args = ["pcluster", "get-cluster-stack-events", "--cluster-name", self.name]
        for k, val in args.items():
            cmd_args.extend([f"--{kebab_case(k)}", str(val)])

        try:
            result = run_pcluster_command(cmd_args, log_error=False)
            response = json.loads(result.stdout)
            logging.info("Stack events retrieved successfully")
            return response
        except subprocess.CalledProcessError as e:
            # Error message fixed: this fetches stack events, not log events.
            logging.error("Failed retrieving stack events with error:\n%s\nand output:\n%s", e.stderr, e.stdout)
            raise
 def stop(self):
     """Stop the compute fleet via ``pcluster update-compute-fleet``.

     The target status depends on the configured scheduler:
     STOP_REQUESTED for slurm, DISABLED for awsbatch.

     :return: raw stdout of the command.
     :raises ValueError: if the scheduler is not slurm or awsbatch.
     :raises subprocess.CalledProcessError: if the command fails (logged first).
     """
     scheduler = self.config["Scheduling"]["Scheduler"]
     status_by_scheduler = {"slurm": "STOP_REQUESTED", "awsbatch": "DISABLED"}
     if scheduler not in status_by_scheduler:
         # Previously an unknown scheduler fell through and produced a
         # malformed command ("--status" with no value); fail fast instead.
         raise ValueError(f"Unsupported scheduler: {scheduler}")
     cmd_args = [
         "pcluster", "update-compute-fleet", "--cluster-name", self.name,
         "--status", status_by_scheduler[scheduler],
     ]
     try:
         result = run_pcluster_command(cmd_args, log_error=False)
         logging.info("Cluster {0} stopped successfully".format(self.name))
         return result.stdout
     except subprocess.CalledProcessError as e:
         logging.error("Failed stopping cluster with error:\n%s\nand output:\n%s", e.stderr, e.stdout)
         raise
Пример #13
0
 def export_logs(self, bucket, output_file=None, bucket_prefix=None):
     """Run pcluster export-cluster-logs and return the parsed response.

     :param bucket: destination S3 bucket name.
     :param output_file: optional local path for the exported archive.
     :param bucket_prefix: optional S3 key prefix.
     """
     command = ["pcluster", "export-cluster-logs", "--cluster-name", self.name, "--bucket", bucket]
     for flag, value in (("--output-file", output_file), ("--bucket-prefix", bucket_prefix)):
         if value:
             command.extend([flag, value])
     try:
         result = run_pcluster_command(command, log_error=False)
         parsed = json.loads(result.stdout)
         logging.info("Cluster's logs exported successfully")
         return parsed
     except subprocess.CalledProcessError as e:
         logging.error("Failed exporting cluster's logs with error:\n%s\nand output:\n%s", e.stderr, e.stdout)
         raise
Пример #14
0
 def delete(self, force=False):
     """Delete this image via ``pcluster delete-image``.

     :param force: pass ``--force true`` to the CLI.
     :return: the parsed JSON response.
     """
     cmd = ["pcluster", "delete-image", "--image-id", self.image_id, "--region", self.region]
     if force:
         cmd += ["--force", "true"]
     response = json.loads(run_pcluster_command(cmd).stdout)
     missing = "message" in response and response["message"].startswith("No image or stack associated")
     if missing:
         logging.error("Delete on non-existing image: %s", self.image_id)
     else:
         self._update_image_info(response)
     return response
Пример #15
0
 def describe(self, log_on_error=False):
     """Describe this image and refresh cached image info when it exists.

     :param log_on_error: log an error when the image/stack does not exist.
     :return: the parsed JSON response.
     """
     logging.info("Describe image %s in region %s.", self.image_id, self.region)
     cmd = ["pcluster", "describe-image", "--image-id", self.image_id, "--region", self.region]
     response = json.loads(run_pcluster_command(cmd).stdout)
     not_found = "message" in response and response["message"].startswith("No image or stack associated")
     if not not_found:
         self._update_image_info(response)
     elif log_on_error:
         logging.error("Describe on non-existing image: %s", self.image_id)
     return response
Пример #16
0
 def get_log_events(self, log_stream_name, **args):
     """Fetch image build log events for *log_stream_name*.

     :param log_stream_name: log stream to read from.
     :param args: extra CLI flags; None values are skipped.
     """
     logging.info("Get image %s build log.", self.image_id)
     cmd = [
         "pcluster", "get-image-log-events",
         "--image-id", self.image_id,
         "--region", self.region,
         "--log-stream-name", log_stream_name,
     ]
     for name, value in args.items():
         if value is not None:
             cmd += [f"--{kebab_case(name)}", str(value)]
     return json.loads(run_pcluster_command(cmd).stdout)
Пример #17
0
    def build(self, **kwargs):
        """Build image.

        Runs ``pcluster build-image`` with this image's id, region and config
        file, caches validation errors / messages on the instance and returns
        the response's "image" section when present (the full response
        otherwise).

        :param kwargs: extra CLI flags; ``raise_on_error`` and ``log_error``
            are consumed here and forwarded to the command runner instead.
        """
        raise_on_error = kwargs.pop("raise_on_error", True)
        log_error = kwargs.pop("log_error", True)

        command = [
            "pcluster",
            "build-image",
            "--image-id",
            self.image_id,
            "--region",
            self.region,
            "--image-configuration",
            self.config_file,
        ]

        for k, val in kwargs.items():
            command.extend([f"--{kebab_case(k)}", str(val)])

        result = run_pcluster_command(command,
                                      raise_on_error=raise_on_error,
                                      log_error=log_error)
        response = json.loads(result.stdout)
        try:
            if response["image"]["imageBuildStatus"] == "BUILD_IN_PROGRESS":
                self._update_image_info(response["image"])
            elif log_error:
                logging.error("Error building image: %s", response)
        except KeyError:
            # Response lacks the expected "image"/"imageBuildStatus" keys
            # (e.g. a validation failure): optionally log and/or re-raise.
            if log_error:
                logging.error("Error building image: %s", result.stdout)
            if raise_on_error:
                raise

        # Cache validation errors and any top-level message for later checks.
        if "configurationValidationErrors" in response:
            self.configuration_errors = response[
                "configurationValidationErrors"]

        if "message" in response:
            self.message = response["message"]

        return response["image"] if "image" in response else response
 def describe_cluster_instances(self, node_type=None, queue_name=None):
     """Run pcluster describe-cluster-instances and return the list of instances.

     :param node_type: optional filter, either "HeadNode" or "Compute".
     :param queue_name: optional queue name filter.
     :return: the "instances" list from the parsed response.
     :raises ValueError: if node_type is not a recognized value.
     :raises subprocess.CalledProcessError: if the command fails (logged first).
     """
     cmd_args = ["pcluster", "describe-cluster-instances", "--cluster-name", self.name]
     if node_type:
         # Map the public names onto the CLI's enum values.
         node_type_map = {"HeadNode": "HEAD", "Compute": "COMPUTE"}
         if node_type not in node_type_map:
             # Previously raised a bare ValueError with no message.
             raise ValueError(f"Unsupported node type: {node_type}")
         cmd_args.extend(["--node-type", node_type_map[node_type]])
     if queue_name:
         cmd_args.extend(["--queue-name", queue_name])
     try:
         result = run_pcluster_command(cmd_args, log_error=False)
         response = json.loads(result.stdout)
         logging.info("Get cluster {0} instances successfully".format(self.name))
         return response["instances"]
     except subprocess.CalledProcessError as e:
         logging.error("Failed when getting cluster instances with error:\n%s\nand output:\n%s", e.stderr, e.stdout)
         raise
    def update(self, config_file, raise_on_error=True, log_error=True, **kwargs):
        """
        Update a cluster with an already updated config.

        :param config_file: path to the new cluster configuration.
        :param raise_on_error: raise exception if cluster update fails
        :param log_error: log error when error occurs. This can be set to False when error is expected
        :param kwargs: additional args that get passed to the pcluster command
        """
        # update the cluster
        logging.info("Updating cluster %s with config %s", self.name, config_file)
        command = ["pcluster", "update-cluster", "--cluster-configuration", config_file, "--cluster-name", self.name]
        if kwargs.pop("wait", True):
            command.append("--wait")
        for k, val in kwargs.items():
            if isinstance(val, (list, tuple)):
                command.extend([f"--{kebab_case(k)}"] + list(map(str, val)))
            else:
                command.extend([f"--{kebab_case(k)}", str(val)])
        result = run_pcluster_command(command, raise_on_error=raise_on_error, log_error=log_error)
        logging.info("update-cluster response: %s", result.stdout)
        response = json.loads(result.stdout)
        if response.get("cloudFormationStackStatus") != "UPDATE_COMPLETE":
            error = f"Cluster update failed for {self.name}"
            if log_error:
                logging.error(error)
            if raise_on_error:
                raise Exception(error)
            # BUGFIX: on a non-raising failure, do not fall through to the
            # success path below — it used to log "updated successfully" and
            # overwrite the cached config even though the update failed.
            return response
        logging.info("Cluster %s updated successfully", self.name)
        # Only update config file attribute if update is successful
        self.config_file = config_file
        with open(self.config_file, encoding="utf-8") as conf_file:
            self.config = yaml.safe_load(conf_file)

        # reset cached properties
        self._reset_cached_properties()

        return response
Пример #20
0
def _test_dcv_configuration(dcv_port, access_from, region, instance, os,
                            scheduler, pcluster_config_reader,
                            clusters_factory, test_datadir):
    """End-to-end check of a cluster's DCV configuration.

    Creates a cluster with DCV on *dcv_port*, then verifies the head node
    security group, the ``pcluster dcv-connect`` one-time URL, the DCV
    authenticator's error and success paths, the shared directory, absence
    of crashes, and CloudWatch logging for the feature.

    NOTE(review): the ``os`` parameter is the cluster operating system (a
    string); the stdlib ``os`` module is used here under the
    ``operating_system`` alias. ``instance`` appears unused in this body.
    """
    # The DCV external authenticator listens on the port right above DCV's.
    dcv_authenticator_port = dcv_port + 1
    cluster_config = pcluster_config_reader(dcv_port=str(dcv_port),
                                            access_from=access_from)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    # check configuration parameters
    check_head_node_security_group(region,
                                   cluster,
                                   dcv_port,
                                   expected_cidr=access_from)

    # dcv connect show url
    env = operating_system.environ.copy()
    env["AWS_DEFAULT_REGION"] = region

    # add ssh key to jenkins user known hosts file to avoid ssh keychecking prompt
    host_keys_file = operating_system.path.expanduser("~/.ssh/known_hosts")
    add_keys_to_known_hosts(cluster.head_node_ip, host_keys_file)

    try:
        result = run_pcluster_command([
            "pcluster", "dcv-connect", "--cluster-name", cluster.name,
            "--show-url"
        ],
                                      env=env)
    finally:
        # remove ssh key from jenkins user known hosts file
        remove_keys_from_known_hosts(cluster.head_node_ip,
                                     host_keys_file,
                                     env=env)

    # The one-time URL must point at the head node IP on the DCV port.
    assert_that(result.stdout).matches(
        r"Please use the following one-time URL in your browser within 30 seconds:\n"
        r"https:\/\/(\b(?:\d{1,3}\.){3}\d{1,3}\b):" + str(dcv_port) +
        r"\?authToken=(.*)")

    # check error cases
    _check_auth_ko(
        remote_command_executor,
        dcv_authenticator_port,
        "-d action=requestToken -d authUser=centos -d sessionID=invalidSessionId",
        "The given session does not exists",
    )
    _check_auth_ko(remote_command_executor, dcv_authenticator_port,
                   "-d action=test",
                   "The action specified 'test' is not valid")
    _check_auth_ko(remote_command_executor, dcv_authenticator_port,
                   "-d action=requestToken -d authUser=centos",
                   "Wrong parameters")
    _check_auth_ko(remote_command_executor, dcv_authenticator_port,
                   "-d action=sessionToken -d authUser=centos",
                   "Wrong parameters")

    shared_dir = f"/home/{get_username_for_os(os)}"

    # launch a session and verify the authenticator works
    command_execution = remote_command_executor.run_remote_command(
        f"{DCV_CONNECT_SCRIPT} {shared_dir}")
    # Extract port / session id / token emitted by the connect script.
    dcv_parameters = re.search(
        r"PclusterDcvServerPort=([\d]+) PclusterDcvSessionId=([\w]+) PclusterDcvSessionToken=([\w-]+)",
        command_execution.stdout,
    )
    if dcv_parameters:
        dcv_session_id = dcv_parameters.group(2)
        dcv_session_token = dcv_parameters.group(3)
        _check_auth_ok(remote_command_executor, dcv_authenticator_port,
                       dcv_session_id, dcv_session_token, os)
    else:
        print("Command '{0} {1}' fails, output: {2}, error: {3}".format(
            DCV_CONNECT_SCRIPT, shared_dir, command_execution.stdout,
            command_execution.stderr))
        raise AssertionError

    # check shared dir configuration
    _check_shared_dir(remote_command_executor, shared_dir)

    # Ensure no system programs crashed
    _check_no_crashes(remote_command_executor, test_datadir)

    # Check that logs are stored in CloudWatch as expected
    FeatureSpecificCloudWatchLoggingTestRunner.run_tests_for_feature(
        cluster, scheduler, os, "dcv_enabled", region, shared_dir)