Example #1
def get_compute_nodes_allocation(scheduler_commands, region, stack_name, max_monitoring_time):
    """
    Watch periodically the number of compute nodes in the cluster.

    :return: (asg_capacity_time_series, compute_nodes_time_series, timestamps): three lists describing
        the variation over time in the number of compute nodes and the timestamp when these fluctuations occurred.
        asg_capacity_time_series describes the variation in the desired asg capacity. compute_nodes_time_series
        describes the variation in the number of compute nodes seen by the scheduler. timestamps describes the
        time since epoch when the variations occurred.
    """
    asg_capacity_time_series = []
    compute_nodes_time_series = []
    timestamps = []

    @retry(
        # Retry until ASG and Scheduler capacities scale down to 0
        # Also make sure cluster scaled up before scaling down
        retry_on_result=lambda _: asg_capacity_time_series[-1] != 0
        or compute_nodes_time_series[-1] != 0
        or max(asg_capacity_time_series) == 0
        or max(compute_nodes_time_series) == 0,
        wait_fixed=seconds(20),
        stop_max_delay=max_monitoring_time,
    )
    def _watch_compute_nodes_allocation():
        compute_nodes = scheduler_commands.compute_nodes_count()
        asg_capacity = _get_desired_asg_capacity(region, stack_name)
        timestamp = time.time()

        # add values only if there is a transition.
        if (
            len(asg_capacity_time_series) == 0
            or asg_capacity_time_series[-1] != asg_capacity
            or compute_nodes_time_series[-1] != compute_nodes
        ):
            asg_capacity_time_series.append(asg_capacity)
            compute_nodes_time_series.append(compute_nodes)
            timestamps.append(timestamp)

    try:
        _watch_compute_nodes_allocation()
    except RetryError:
        # ignoring this error in order to perform assertions on the collected data.
        pass

    logging.info(
        "Monitoring completed: %s, %s, %s",
        "asg_capacity_time_series [" + " ".join(map(str, asg_capacity_time_series)) + "]",
        "compute_nodes_time_series [" + " ".join(map(str, compute_nodes_time_series)) + "]",
        "timestamps [" + " ".join(map(str, timestamps)) + "]",
    )
    return asg_capacity_time_series, compute_nodes_time_series, timestamps
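
# A hedged usage sketch, not part of the original example: it assumes the same
# fixtures (scheduler_commands, region, stack_name) and helpers (minutes,
# assert_that) used in the snippets above, and shows how the returned time
# series are typically asserted in a scale-up/scale-down test.
def _assert_cluster_scaled_up_and_down(scheduler_commands, region, stack_name):
    asg_capacity, compute_nodes, _timestamps = get_compute_nodes_allocation(
        scheduler_commands, region, stack_name, max_monitoring_time=minutes(20)
    )
    # The cluster must have scaled up at some point during the monitoring window...
    assert_that(max(asg_capacity)).is_greater_than(0)
    assert_that(max(compute_nodes)).is_greater_than(0)
    # ...and must be fully scaled down by the end of it.
    assert_that(asg_capacity[-1]).is_equal_to(0)
    assert_that(compute_nodes[-1]).is_equal_to(0)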
Example #2
class TorqueCommands(SchedulerCommands):
    """Implement commands for torque scheduler."""
    def __init__(self, remote_command_executor):
        super().__init__(remote_command_executor)

    @retry(retry_on_result=lambda result: "job_state = C" not in result,
           wait_fixed=seconds(3),
           stop_max_delay=minutes(12))
    def wait_job_completed(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "qstat -f {0}".format(job_id))
        return result.stdout

    def get_job_exit_status(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "qstat -f {0}".format(job_id))
        match = re.search(r"exit_status = (\d+)", result.stdout)
        return match.group(1)

    def assert_job_submitted(self, qsub_output):  # noqa: D102
        __tracebackhide__ = True
        # qsub_output is the id of the job in case of successful submissions
        id = qsub_output
        # check that the job exists
        self._remote_command_executor.run_remote_command(
            "qstat -f {0}".format(id))
        return id

    def submit_command(self,
                       command,
                       nodes=1,
                       slots=None,
                       after_ok=None):  # noqa: D102
        flags = "-l nodes={0}:ppn={1}".format(nodes or 1, slots or 1)
        if after_ok:
            flags += " -W depend=afterok:{0}".format(after_ok)
        return self._remote_command_executor.run_remote_command(
            "echo '{0}' | qsub {1}".format(command, flags),
            raise_on_error=False)

    def submit_script(self,
                      script,
                      script_args=None,
                      nodes=1,
                      slots=None,
                      additional_files=None):  # noqa: D102
        if not additional_files:
            additional_files = []
        script_name = os.path.basename(script)
        additional_files.append(script)
        flags = "-l nodes={0}:ppn={1}".format(nodes or 1, slots or 1)
        if script_args:
            flags += ' -F "{0}"'.format(" ".join(script_args))
        return self._remote_command_executor.run_remote_command(
            "qsub {0} {1}".format(flags, script_name),
            additional_files=additional_files)

    def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
        __tracebackhide__ = True
        status = self.get_job_exit_status(job_id)
        assert_that(status).is_equal_to("0")

    def compute_nodes_count(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "echo $(( $(/opt/torque/bin/pbsnodes -l all | wc -l) - 1))")
        # split()[-1] to extract last line and trim whitespaces
        return int(result.stdout.split()[-1])

    def get_compute_nodes(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "pbsnodes -l all | grep -v $(hostname) | awk '{print $1}'")
        return result.stdout.splitlines()

    @retry(retry_on_result=lambda result: "offline" not in result,
           wait_fixed=seconds(5),
           stop_max_delay=minutes(5))
    def wait_for_locked_node(self):  # noqa: D102
        # discard the first node since that is the master server
        return self._remote_command_executor.run_remote_command(
            r'pbsnodes | grep -e "\sstate = " | tail -n +2').stdout

    def get_node_cores(self):
        """Return number of slots from the scheduler."""
        result = self._remote_command_executor.run_remote_command(
            "pbsnodes | tail -n +10")
        return re.search(r"np = (\d+)", result.stdout).group(1)
Example #3
class SlurmCommands(SchedulerCommands):
    """Implement commands for slurm scheduler."""
    def __init__(self, remote_command_executor):
        super().__init__(remote_command_executor)

    @retry(
        retry_on_result=lambda result: "JobState" not in result or
        any(value in result for value in
            ["EndTime=Unknown", "JobState=RUNNING", "JobState=COMPLETING"]),
        wait_fixed=seconds(3),
        stop_max_delay=minutes(7),
    )
    def wait_job_completed(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "scontrol show jobs -o {0}".format(job_id), raise_on_error=False)
        return result.stdout

    def get_job_exit_status(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "scontrol show jobs -o {0}".format(job_id))
        match = re.search(r"ExitCode=(.+?) ", result.stdout)
        return match.group(1)

    def assert_job_submitted(self, sbatch_output):  # noqa: D102
        __tracebackhide__ = True
        match = re.search(r"Submitted batch job ([0-9]+)", sbatch_output)
        assert_that(match).is_not_none()
        return match.group(1)

    def submit_command(self,
                       command,
                       nodes=1,
                       slots=None,
                       host=None,
                       after_ok=None,
                       other_options=None):  # noqa: D102
        submission_command = "sbatch --wrap='{0}'".format(command)
        if nodes > 0:
            submission_command += "  -N {0}".format(nodes)
        if host:
            submission_command += " --nodelist={0}".format(host)
        if slots:
            submission_command += " -n {0}".format(slots)
        if after_ok:
            submission_command += " -d afterok:{0}".format(after_ok)
        if other_options:
            submission_command += " {0}".format(other_options)
        return self._remote_command_executor.run_remote_command(
            submission_command)

    def submit_script(self,
                      script,
                      script_args=None,
                      nodes=1,
                      slots=None,
                      host=None,
                      additional_files=None):  # noqa: D102
        if not additional_files:
            additional_files = []
        if not script_args:
            script_args = []
        additional_files.append(script)
        script_name = os.path.basename(script)
        submission_command = "sbatch"
        if host:
            submission_command += " --nodelist={0}".format(host)
        if slots:
            submission_command += " -n {0}".format(slots)
        if nodes > 1:
            submission_command += " -N {0}".format(nodes)
        submission_command += " {1} {2}".format(nodes, script_name,
                                                " ".join(script_args))
        return self._remote_command_executor.run_remote_command(
            submission_command, additional_files=additional_files)

    def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "scontrol show jobs -o {0}".format(job_id))
        assert_that(result.stdout).contains("JobState=COMPLETED")

    def compute_nodes_count(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "sinfo --Node --noheader | grep compute | wc -l")
        # split()[-1] to extract last line and trim whitespaces
        return int(result.stdout.split()[-1])

    def get_compute_nodes(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "sinfo --Node --noheader | grep compute | awk '{print $1}'")
        return result.stdout.splitlines()

    @retry(retry_on_result=lambda result: "drain" not in result,
           wait_fixed=seconds(3),
           stop_max_delay=minutes(5))
    def wait_for_locked_node(self):  # noqa: D102
        return self._remote_command_executor.run_remote_command(
            "/opt/slurm/bin/sinfo -h -o '%t'").stdout

    def get_node_cores(self):
        """Return number of slots from the scheduler."""
        result = self._remote_command_executor.run_remote_command(
            "/opt/slurm/bin/sinfo -o '%c' -h")
        return re.search(r"(\d+)", result.stdout).group(1)

    def get_job_info(self, job_id):
        """Return job details from slurm"""
        return self._remote_command_executor.run_remote_command(
            "scontrol show jobs -o {0}".format(job_id)).stdout
Example #4
class SgeCommands(SchedulerCommands):
    """Implement commands for sge scheduler."""
    def __init__(self, remote_command_executor):
        super().__init__(remote_command_executor)

    @retry(retry_on_result=lambda result: result != 0,
           wait_fixed=seconds(3),
           stop_max_delay=minutes(7))
    def wait_job_completed(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "qacct -j {0}".format(job_id), raise_on_error=False)
        return result.return_code

    def get_job_exit_status(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "qacct -j {0}".format(job_id))
        match = re.search(r"exit_status\s+([0-9]+)", result.stdout)
        assert_that(match).is_not_none()
        return match.group(1)

    def assert_job_submitted(self, qsub_output, is_array=False):  # noqa: D102
        __tracebackhide__ = True
        if is_array:
            regex = r"Your job-array ([0-9]+)\.[0-9\-:]+ \(.+\) has been submitted"
        else:
            regex = r"Your job ([0-9]+) \(.+\) has been submitted"
        match = re.search(regex, qsub_output)
        assert_that(match).is_not_none()
        return match.group(1)

    def submit_command(self,
                       command,
                       nodes=1,
                       slots=None,
                       hold=False,
                       after_ok=None):  # noqa: D102
        flags = ""
        if nodes > 1:
            slots = nodes * slots
        if slots:
            flags += "-pe mpi {0} ".format(slots)
        if hold:
            flags += "-h "
        if after_ok:
            flags += "-hold_jid {0} ".format(after_ok)
        return self._remote_command_executor.run_remote_command(
            "echo '{0}' | qsub {1}".format(command, flags),
            raise_on_error=False)

    def submit_script(self,
                      script,
                      script_args=None,
                      nodes=1,
                      slots=None,
                      additional_files=None):  # noqa: D102
        if not additional_files:
            additional_files = []
        if not script_args:
            script_args = []
        additional_files.append(script)
        flags = ""
        if slots:
            flags += "-pe mpi {0} ".format(slots)
        script_name = os.path.basename(script)
        return self._remote_command_executor.run_remote_command(
            "qsub {0} {1} {2}".format(flags, script_name,
                                      " ".join(script_args)),
            additional_files=additional_files)

    def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
        __tracebackhide__ = True
        status = self.get_job_exit_status(job_id)
        assert_that(status).is_equal_to("0")

    def compute_nodes_count(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "qhost | grep -o ip- | wc -l")
        # split()[-1] to extract last line and trim whitespaces
        return int(result.stdout.split()[-1])

    def get_compute_nodes(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "qhost | grep ip- | awk '{print $1}'")
        return result.stdout.splitlines()

    @retry(
        retry_on_result=lambda result: "<state>d</state>" not in result,
        wait_fixed=seconds(3),
        stop_max_delay=minutes(5),
    )
    def wait_for_locked_node(self):  # noqa: D102
        return self._remote_command_executor.run_remote_command(
            "qstat -f -xml").stdout

    def get_node_cores(self):
        """Return number of slots from the scheduler."""
        result = self._remote_command_executor.run_remote_command(
            "qhost -F | grep hl:m_core")
        return re.search(r"hl:m_core=(\d+).000000", result.stdout).group(1)
class EBSSnapshotsFactory:
    """Manage creation and destruction of volume snapshots."""
    def __init__(self):
        self.config = None
        self.instance = None
        self.volume = None
        self.snapshot = None
        self.security_group_id = None
        self.ec2 = None
        self.boto_client = None

    def create_snapshot(self, request, subnet_id, region):
        """
        Create a snapshot in a given region.
        :param request: The current request
        :param subnet_id: The subnet id where to get the snapshot
        :param region: The region where to get the snapshot
        """
        # Only one snapshot creation per factory allowed
        if self.snapshot:
            raise Exception("Snapshot already created")

        self.ec2 = boto3.resource("ec2", region_name=region)
        self.boto_client = boto3.client("ec2", region_name=region)

        snapshot_config = SnapshotConfig(
            request.config.getoption("key_path"),
            request.config.getoption("key_name"),
            self.ec2.Subnet(subnet_id).vpc_id,
            subnet_id,
        )
        self.snapshot = self._create_snapshot(region, snapshot_config)
        return self.snapshot.id

    def create_existing_volume(self, request, subnet_id, region):
        """
        Create a volume in a given region.
        :param request: The current request
        :param subnet_id: The subnet id where to get the snapshot
        :param region: The region where to get the snapshot
        """
        # Only one volume creation per factory allowed
        if self.volume:
            raise Exception("Volume already created")

        self.ec2 = boto3.resource("ec2", region_name=region)
        self.boto_client = boto3.client("ec2", region_name=region)
        volume_config = SnapshotConfig(
            request.config.getoption("key_path"),
            request.config.getoption("key_name"),
            self.ec2.Subnet(subnet_id).vpc_id,
            subnet_id,
        )
        self._create_volume_process(region, volume_config)
        return self.volume.id

    def _create_volume_process(self, region, snapshot_config):
        self.config = snapshot_config
        ami_id = self._get_amazonlinux2_ami()

        self.security_group_id = self._get_security_group_id()

        subnet = self.ec2.Subnet(self.config.head_node_subnet_id)

        # Create a new volume and attach to the instance
        self.volume = self._create_volume(subnet)
        self.instance = self._launch_instance(ami_id, subnet)
        self._attach_volume()
        # Open ssh connection
        self.ssh_conn = self._open_ssh_connection()

        # Partitions the disk with a gpt table and 1 single partition inside
        self._format_volume(self.ssh_conn)

        # Stops the instance before taking the snapshot
        self._release_instance()

    def _create_snapshot(self, region, snapshot_config):
        self._create_volume_process(region, snapshot_config)
        self.snapshot = self._create_volume_snapshot()
        return self.snapshot

    def _create_volume_snapshot(self):
        logging.info("creating snapshot...")
        snapshot = self.ec2.create_snapshot(
            Description="parallelcluster-test-snapshot",
            VolumeId=self.volume.id)
        while snapshot.state == "pending":
            time.sleep(10)
            snapshot = self.ec2.Snapshot(snapshot.id)
        logging.info("Snapshot ready: %s" % snapshot.id)
        return snapshot

    def _format_volume(self, ssh_conn):
        logging.info("Partitioning device...")
        ssh_conn.run(
            "sudo sh -c 'echo -e \"g\nn\np\n1\n\n\nw\" | fdisk /dev/sdf'",
            warn=True,
            pty=False,
            hide=False)
        # Finds out the device name of the volume
        logging.info("Finding device name...")
        device_name = ssh_conn.run("readlink -f /dev/sdf").stdout.strip()
        # formats the 1st partition of disk
        logging.info("Formatting 1st partition...")
        ssh_conn.run("sudo sh -c 'mkfs.ext4 {}1'".format(device_name))
        logging.info("Mounting partition...")
        ssh_conn.run("sudo mkdir /mnt/tmp")
        ssh_conn.run("sudo mount {}1 /mnt/tmp".format(device_name))
        logging.info("Writing test data...")
        ssh_conn.run("echo 'hello world' | sudo tee -a /mnt/tmp/test.txt")
        logging.info("Device ready")

    def _open_ssh_connection(self):
        tries = 5
        logging.info("Connecting to instance %s " %
                     self.instance.public_ip_address)
        logging.info("ssh_key: %s " % self.config.ssh_key)
        ssh_conn = None

        while tries > 0:
            try:
                ssh_conn = Connection(
                    host=self.instance.public_ip_address,
                    user="******",
                    forward_agent=False,
                    connect_kwargs={"key_filename": [self.config.ssh_key]},
                )
                ssh_conn.open()
                tries = 0
            except BaseException:
                logging.info("SSH connection error - retrying...")
                tries -= 1
                time.sleep(20)

        if (ssh_conn is None) or (not ssh_conn.is_connected):
            raise ConnectionError()
        return ssh_conn

    @retry(retry_on_result=lambda state: state != "attached",
           wait_fixed=seconds(2),
           stop_max_delay=minutes(5))
    def _wait_volume_attached(self):
        vol = self.ec2.Volume(self.volume.id)
        attachment_state = next(
            (attachment["State"] for attachment in vol.attachments
             if attachment["InstanceId"] == self.instance.id), "")
        return attachment_state

    def _attach_volume(self):
        result = self.volume.attach_to_instance(InstanceId=self.instance.id,
                                                Device="/dev/sdf")
        logging.info("Attach Volume Result: %s", result)
        self._wait_volume_attached()
        logging.info("Volume attached")

    def _create_volume(self, subnet):
        vol = self.ec2.create_volume(
            Size=10,
            Encrypted=False,
            AvailabilityZone=subnet.availability_zone,
            TagSpecifications=[{
                "ResourceType":
                "volume",
                "Tags": [{
                    "Key": "name",
                    "Value": "parallel-cluster-test-volume"
                }]
            }],
        )
        logging.info("Volume Id: %s" % vol.id)
        # We can check if the volume is now ready and available:
        logging.info("Waiting for the volume to be ready...")
        while vol.state == "creating":
            vol = self.ec2.Volume(vol.id)
            time.sleep(2)
        logging.info("Volume ready")
        return vol

    def _get_security_group_id(self):
        security_group_id = self.boto_client.create_security_group(
            Description="security group for snapshot instance node",
            GroupName="snapshot-" + random_alphanumeric(),
            VpcId=self.config.vpc_id,
        )["GroupId"]

        self.boto_client.authorize_security_group_ingress(
            GroupId=security_group_id,
            IpPermissions=[{
                "IpProtocol": "tcp",
                "FromPort": 22,
                "ToPort": 22,
                "IpRanges": [{
                    "CidrIp": "0.0.0.0/0"
                }]
            }],
        )

        return security_group_id

    def _launch_instance(self, ami_id, subnet):
        instance = self.ec2.create_instances(
            ImageId=ami_id,
            KeyName=self.config.key_name,
            MinCount=1,
            MaxCount=1,
            InstanceType="t2.micro",
            NetworkInterfaces=[{
                "SubnetId": subnet.id,
                "DeviceIndex": 0,
                "AssociatePublicIpAddress": True,
                "Groups": [self.security_group_id],
            }],
            TagSpecifications=[{
                "ResourceType":
                "instance",
                "Tags": [{
                    "Key": "Name",
                    "Value": "pcluster-snapshot-instance"
                }]
            }],
        )[0]
        logging.info("Waiting for instance to be running...")
        while instance.state["Name"] == "pending":
            time.sleep(10)
            instance = self.ec2.Instance(instance.id)

        logging.info("Instance state: %s" % instance.state)
        logging.info("Public dns: %s" % instance.public_dns_name)
        return instance

    def _get_amazonlinux2_ami(self):
        # Finds most recent alinux2 ami in region
        response = self.boto_client.describe_images(
            Owners=["amazon"],
            Filters=[
                {
                    "Name": "name",
                    "Values": ["amzn2-ami-hvm-*"]
                },
                {
                    "Name": "description",
                    "Values": ["Amazon Linux 2 AMI*"]
                },
                {
                    "Name": "architecture",
                    "Values": ["x86_64"]
                },
                {
                    "Name": "root-device-type",
                    "Values": ["ebs"]
                },
                {
                    "Name": "state",
                    "Values": ["available"]
                },
            ],
        )

        amis = sorted(response["Images"],
                      key=lambda x: x["CreationDate"],
                      reverse=True)
        return amis[0]["ImageId"]

    def release_all(self):
        """Release all resources"""
        self._release_instance()
        self._release_volume()
        self._release_snapshot()
        self._release_security_group()

    @retry(stop_max_attempt_number=5, wait_fixed=5000)
    def _release_snapshot(self):
        if self.snapshot:
            logging.info("Deleting snapshot %s" % self.snapshot.id)
            self.snapshot.delete()

    @retry(stop_max_attempt_number=5, wait_fixed=5000)
    def _release_instance(self):
        if self.instance:
            self.instance.terminate()
            logging.info("Waiting for instance to be terminated...")
            while self.instance.state["Name"] != "terminated":
                time.sleep(10)
                self.instance = self.ec2.Instance(self.instance.id)
            logging.info("Instance terminated")
        self.instance = None

    @retry(stop_max_attempt_number=5, wait_fixed=5000)
    def _release_volume(self):
        if self.volume:
            logging.info("Deleting volume %s" % self.volume.id)
            self.volume.delete()
        self.volume = None

    def _release_security_group(self):
        if self.security_group_id:
            logging.info("Deleting security group %s" % self.security_group_id)
            self.boto_client.delete_security_group(
                GroupId=self.security_group_id)
        self.security_group_id = None
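
# A hedged usage sketch, not part of the original example: in a pytest suite the
# factory is typically wrapped in a generator fixture so that release_all() runs
# even when snapshot creation or the test itself fails. request, subnet_id and
# region are assumed to be provided by surrounding fixtures.
def _snapshots_factory_fixture_sketch(request, subnet_id, region):
    snapshots_factory = EBSSnapshotsFactory()
    try:
        # yields the snapshot id to the test, then cleans up on teardown
        yield snapshots_factory.create_snapshot(request, subnet_id, region)
    finally:
        snapshots_factory.release_all()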
    _wait_instance_running(ec2_client, [instance_id])
    # Wait instance to complete cloud-init
    _wait_compute_cloudinit_done(command_executor, compute_node)

    return compute_node


def _wait_instance_running(ec2_client, instance_ids):
    """Wait EC2 instance to go running"""
    logging.info(f"Waiting for {instance_ids} to be running")
    ec2_client.get_waiter("instance_running").wait(
        InstanceIds=instance_ids, WaiterConfig={"Delay": 60, "MaxAttempts": 5}
    )


@retry(wait_fixed=seconds(10), stop_max_delay=minutes(3))
def _wait_compute_cloudinit_done(command_executor, compute_node):
    """Wait till cloud-init complete on a given compute node"""
    compute_node_private_ip = compute_node.get("privateIpAddress")
    compute_cloudinit_status_output = command_executor.run_remote_command(
        f"ssh -q {compute_node_private_ip} sudo cloud-init status"
    ).stdout
    assert_that(compute_cloudinit_status_output).contains("status: done")


def _test_event_handler_execution(cluster, region, os, architecture, command_executor, head_node, compute_node):
    """Test event handler execution and environment"""
    head_scheduler_plugin_log_output = command_executor.run_remote_command(
        f"cat {SCHEDULER_PLUGIN_LOG_OUT_PATH}"
    ).stdout
    python_root = command_executor.run_remote_command(f"sudo su - {SCHEDULER_PLUGIN_USER} -c 'which python'").stdout[
    remote_command_executor.run_remote_command(
        "aws s3 cp s3://{bucket_name}/export_dir/file_to_export ./file_to_export"
        .format(bucket_name=bucket_name))
    result = remote_command_executor.run_remote_command("cat ./file_to_export")
    assert_that(result.stdout).is_equal_to("Exported by FSx Lustre")


def _assert_job_submitted(qsub_output):
    __tracebackhide__ = True
    match = re.search(r"Your job ([0-9]+) \(.+\) has been submitted",
                      qsub_output)
    assert_that(match).is_not_none()
    return match.group(1)


@retry(retry_on_result=lambda result: result != 0,
       wait_fixed=seconds(7),
       stop_max_delay=minutes(5))
def _wait_job_completed(remote_command_executor, job_id):
    result = remote_command_executor.run_remote_command(
        "qacct -j {0}".format(job_id), raise_on_error=False)
    return result.return_code


def _get_job_exit_status(remote_command_executor, job_id):
    result = remote_command_executor.run_remote_command(
        "qacct -j {0}".format(job_id))
    match = re.search(r"exit_status\s+([0-9]+)", result.stdout)
    assert_that(match).is_not_none()
    return match.group(1)
Example #8
        raise
    except AttributeError as e:
        LOGGER.critical("Error no attribute {0} in dict: {1}".format(os, e))
        raise
    except IndexError as e:
        LOGGER.critical("Error no ami retrieved: {0}".format(e))
        raise


@retry(stop_max_attempt_number=3, wait_fixed=5000)
def fetch_instance_slots(region, instance_type):
    return get_instance_info(instance_type,
                             region).get("VCpuInfo").get("DefaultVCpus")


@retry(stop_max_attempt_number=10, wait_fixed=seconds(50))
def _assert_ami_is_available(region, ami_id):
    LOGGER.info("Asserting the ami is available")
    ami_state = boto3.client("ec2", region_name=region).describe_images(
        ImageIds=[ami_id]).get("Images")[0].get("State")
    assert_that(ami_state).is_equal_to("available")


def get_installed_parallelcluster_version():
    """Get the version of the installed aws-parallelcluster package."""
    return pkg_resources.get_distribution("aws-parallelcluster").version


def get_sts_endpoint(region):
    """Get regionalized STS endpoint."""
    return "https://sts.{0}.{1}".format(
Example #9
    scheduler_commands.submit_script(
        str(test_datadir / "{0}_kill_scheduler_job.sh".format(scheduler)))
    instance_id = wait_compute_log(remote_command_executor)

    _assert_compute_logs(remote_command_executor, instance_id)
    assert_instance_replaced_or_terminating(instance_id, region)
    # verify that desired capacity is still 1
    assert_that(get_desired_asg_capacity(region,
                                         cluster.cfn_name)).is_equal_to(1)
    _assert_nodes_removed_from_scheduler(scheduler_commands, compute_nodes)

    assert_no_errors_in_logs(remote_command_executor,
                             ["/var/log/sqswatcher", "/var/log/jobwatcher"])


@retry(wait_fixed=seconds(20), stop_max_delay=minutes(5))
def _assert_nodes_removed_from_scheduler(scheduler_commands, nodes):
    assert_that(
        scheduler_commands.get_compute_nodes()).does_not_contain(*nodes)


def _assert_compute_logs(remote_command_executor, instance_id):
    remote_command_executor.run_remote_command(
        "tar -xf /home/logs/compute/{0}.tar.gz --directory /tmp".format(
            instance_id))
    remote_command_executor.run_remote_command(
        "test -f /tmp/var/log/nodewatcher")
    messages_log = remote_command_executor.run_remote_command(
        "cat /tmp/var/log/nodewatcher", hide=True).stdout
    assert_that(messages_log).contains(
        "Node is marked as down by scheduler or not attached correctly. Terminating..."
    image_builder = boto3.client("imagebuilder")
    image_builder.start_image_pipeline_execution(
        imagePipelineArn=image_builder_pipeline, )
    response = image_builder.list_image_pipeline_images(
        imagePipelineArn=image_builder_pipeline, )

    assert_that(response["imageSummaryList"]).is_length(1)
    image = _wait_for_image_build(image_builder_pipeline)
    logging.info("Image %s", image)
    assert_that(image["state"]["status"]).is_equal_to("AVAILABLE")

    # Wait for 2 minutes for the Lambda to be updated
    time.sleep(120)
    lambda_client = boto3.client("lambda")
    lambda_resource = lambda_client.get_function(FunctionName=lambda_name)
    logging.info("API Lambda %s", lambda_resource)
    assert_that(lambda_resource["Code"]["ImageUri"]).is_equal_to(
        image["outputResources"]["containers"][0]["imageUris"][0])


@retry(
    retry_on_result=lambda result: result["state"]["status"] not in
    {"AVAILABLE", "CANCELLED", "FAILED", "DELETED"},
    wait_fixed=seconds(10),
    stop_max_delay=minutes(15),
)
def _wait_for_image_build(image_builder_pipeline):
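    # The retry predicate above polls until the newest image produced by the
    # pipeline reaches a terminal state (AVAILABLE, CANCELLED, FAILED or DELETED).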
    image_builder = boto3.client("imagebuilder")
    return image_builder.list_image_pipeline_images(
        imagePipelineArn=image_builder_pipeline, )["imageSummaryList"][0]
Example #11
        "echo 'Exported by FSx Lustre' > {mount_dir}/file_to_export".format(
            mount_dir=mount_dir))
    remote_command_executor.run_remote_command(
        "sudo lfs hsm_archive {mount_dir}/file_to_export && sleep 5".format(
            mount_dir=mount_dir))
    remote_command_executor.run_remote_command(
        "aws s3 cp s3://{bucket_name}/export_dir/file_to_export ./file_to_export"
        .format(bucket_name=bucket_name))
    result = remote_command_executor.run_remote_command("cat ./file_to_export")
    assert_that(result.stdout).is_equal_to("Exported by FSx Lustre")


@retry(
    retry_on_result=lambda result: result.get("Lifecycle") in
    ["PENDING", "EXECUTING", "CANCELLING"],
    wait_fixed=seconds(5),
    stop_max_delay=minutes(7),
)
def poll_on_data_export(task, fsx):
    logging.info("Data Export Task {task_id}: {status}".format(
        task_id=task.get("TaskId"), status=task.get("Lifecycle")))
    return fsx.describe_data_repository_tasks(
        TaskIds=[task.get("TaskId")]).get("DataRepositoryTasks")[0]


def _test_data_repository_task(remote_command_executor, mount_dir, bucket_name,
                               fsx_fs_id, region):
    logging.info("Testing fsx lustre data repository task")
    file_contents = "Exported by FSx Lustre"
    remote_command_executor.run_remote_command(
        "echo '{file_contents}' > {mount_dir}/file_to_export".format(
from remote_command_executor import RemoteCommandExecutionError
from retrying import retry
from time_utils import minutes, seconds


@retry(
    retry_on_exception=lambda exception: isinstance(
        exception, RemoteCommandExecutionError),
    wait_fixed=seconds(30),
    stop_max_delay=minutes(15),
)
def wait_compute_log(remote_command_executor, expected_num_nodes=1):
    """Return list of compute node instance_ids in case of failure."""
    remote_command_executor.run_remote_command("test -d /home/logs/compute",
                                               log_error=False)
    output = remote_command_executor.run_remote_command(
        "ls /home/logs/compute/", log_error=False).stdout
    # sample output: "i-049ce596aa69ac988.tar.gz  i-064f07c373d926ba4.tar.gz"
    instance_ids = [
        instance.replace(".tar.gz", "") for instance in output.split()
    ]
    # make sure we got all the expected failing compute nodes
    if len(instance_ids) != expected_num_nodes:
        raise RemoteCommandExecutionError(
Example #13
class SlurmCommands(SchedulerCommands):
    """Implement commands for slurm scheduler."""
    def __init__(self, remote_command_executor):
        super().__init__(remote_command_executor)

    def wait_job_completed(self, job_id, timeout=None):  # noqa: D102
        if not timeout:
            timeout = 12

        @retry(
            retry_on_result=lambda result: "JobState" not in result or
            any(value in result for value in [
                "EndTime=Unknown", "JobState=RUNNING", "JobState=COMPLETING",
                "JobState=CONFIGURING"
            ]),
            wait_fixed=seconds(10),
            stop_max_delay=minutes(timeout),
        )
        def _job_status_retryer():
            result = self._remote_command_executor.run_remote_command(
                "scontrol show jobs -o {0}".format(job_id),
                raise_on_error=False)
            return result.stdout

        return _job_status_retryer()

    def get_job_exit_status(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "scontrol show jobs -o {0}".format(job_id))
        match = re.search(r"ExitCode=(.+?) ", result.stdout)
        return match.group(1)

    def assert_job_submitted(self, sbatch_output):  # noqa: D102
        __tracebackhide__ = True
        match = re.search(r"Submitted batch job ([0-9]+)", sbatch_output)
        assert_that(match).is_not_none()
        return match.group(1)

    def submit_command(
        self,
        command,
        nodes=0,
        slots=None,
        host=None,
        after_ok=None,
        partition=None,
        constraint=None,
        other_options=None,
        raise_on_error=True,
    ):
        """Submit job with command."""
        job_submit_command = "--wrap='{0}'".format(command)

        return self._submit_batch_job(
            job_submit_command,
            nodes,
            slots,
            host,
            after_ok,
            partition,
            constraint,
            other_options,
            raise_on_error=raise_on_error,
        )

    def submit_script(
        self,
        script,
        script_args=None,
        nodes=0,
        slots=None,
        host=None,
        after_ok=None,
        partition=None,
        constraint=None,
        other_options=None,
        additional_files=None,
        raise_on_error=True,
    ):
        """Submit job with script."""
        if not additional_files:
            additional_files = []
        if not script_args:
            script_args = []
        additional_files.append(script)
        script_name = os.path.basename(script)
        job_submit_command = " {0} {1}".format(script_name,
                                               " ".join(script_args))

        return self._submit_batch_job(
            job_submit_command,
            nodes,
            slots,
            host,
            after_ok,
            partition,
            constraint,
            other_options,
            additional_files,
            raise_on_error=raise_on_error,
        )

    def _submit_batch_job(
        self,
        job_submit_command,
        nodes=0,
        slots=None,
        host=None,
        after_ok=None,
        partition=None,
        constraint=None,
        other_options=None,
        additional_files=None,
        raise_on_error=True,
    ):
        submission_command = "sbatch"
        if host:
            submission_command += " --nodelist={0}".format(host)
        if slots:
            submission_command += " -n {0}".format(slots)
        if nodes > 0:
            submission_command += " -N {0}".format(nodes)
        if after_ok:
            submission_command += " -d afterok:{0}".format(after_ok)
        if partition:
            submission_command += " -p {0}".format(partition)
        if constraint:
            submission_command += " -C '{0}'".format(constraint)
        if other_options:
            submission_command += " {0}".format(other_options)
        submission_command += " {0}".format(job_submit_command)

        if additional_files:
            return self._remote_command_executor.run_remote_command(
                submission_command,
                additional_files=additional_files,
                raise_on_error=raise_on_error)
        else:
            return self._remote_command_executor.run_remote_command(
                submission_command, raise_on_error=raise_on_error)

    def _dump_job_output(self, job_info):
        params = re.split(r"\s+", job_info)
        stderr = None
        stdout = None
        for param in params:
            match_stderr = re.match(r"StdErr=(.*)?", param)
            match_stdout = re.match(r"StdOut=(.*)?", param)
            if match_stderr:
                stderr = match_stderr.group(1)
                logging.info("stderr:" + stderr)
            if match_stdout:
                stdout = match_stdout.group(1)
                logging.info("stdout:" + stdout)
        if stderr is not None or stdout is not None:
            if stderr == stdout:
                result = self._remote_command_executor.run_remote_command(
                    f'echo "stderr/stdout:" && cat {stderr}')
                logging.error(result.stdout)
            else:
                if stderr is not None:
                    stderr_result = self._remote_command_executor.run_remote_command(
                        f'echo "stderr" && cat {stderr}')
                    logging.error(stderr_result.stdout)

                if stdout is not None:
                    stdout_result = self._remote_command_executor.run_remote_command(
                        f'echo "stdout" && cat {stdout}')
                    logging.error(stdout_result.stdout)
        else:
            logging.error("Unable to retrieve job output.")

    def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "scontrol show jobs -o {0}".format(job_id))
        try:
            assert_that(result.stdout).contains("JobState=COMPLETED")
        except AssertionError:
            self._dump_job_output(result.stdout)
            raise

    def compute_nodes_count(self, filter_by_partition=None):  # noqa: D102
        return len(self.get_compute_nodes(filter_by_partition))

    def get_compute_nodes(self, filter_by_partition=None):  # noqa: D102
        command = "sinfo --Node --noheader --responding"
        if filter_by_partition:
            command += " --partition {}".format(filter_by_partition)
        # Print first and fourth columns to get nodename and state only (default partition contains *)
        # Filter out nodes that are not responding or in power saving states
        command += " | awk '{print $1, $4}' | grep -v '[*#~%]' | awk '{print $1}'"
        result = self._remote_command_executor.run_remote_command(command)
        return result.stdout.splitlines()

    @retry(retry_on_result=lambda result: "drain" not in result,
           wait_fixed=seconds(3),
           stop_max_delay=minutes(5))
    def wait_for_locked_node(self):  # noqa: D102
        return self._remote_command_executor.run_remote_command(
            "/opt/slurm/bin/sinfo -h -o '%t'").stdout

    def get_node_cores(self, partition=None):
        """Return number of slots from the scheduler."""
        check_core_cmd = "/opt/slurm/bin/sinfo -o '%c' -h"
        if partition:
            check_core_cmd += " -p {}".format(partition)
        result = self._remote_command_executor.run_remote_command(
            check_core_cmd)
        return re.search(r"(\d+)", result.stdout).group(1)

    def get_job_info(self, job_id):
        """Return job details from slurm"""
        return self._remote_command_executor.run_remote_command(
            "scontrol show jobs -o {0}".format(job_id)).stdout

    def cancel_job(self, job_id):
        """Cancel a job"""
        return self._remote_command_executor.run_remote_command(
            "scancel {}".format(job_id))

    def set_nodes_state(self, compute_nodes, state):
        """Put nodes into a state."""
        self._remote_command_executor.run_remote_command(
            "sudo /opt/slurm/bin/scontrol update NodeName={} state={} reason=testing"
            .format(",".join(compute_nodes), state))

    def set_partition_state(self, partition, state):
        """Put partition into a state."""
        self._remote_command_executor.run_remote_command(
            "sudo /opt/slurm/bin/scontrol update partition={} state={}".format(
                partition, state))

    def get_nodes_status(self, filter_by_nodes=None):
        """Retrieve node state/status from scheduler"""
        result = self._remote_command_executor.run_remote_command(
            "/opt/slurm/bin/sinfo -N --long -h | awk '{print$1, $4}'"
        ).stdout.splitlines()
        current_node_states = {}
        for entry in result:
            nodename, state = entry.split()
            current_node_states[nodename] = state
        return ({
            node: current_node_states.get(node, "Unable to retrieve state")
            for node in filter_by_nodes
        } if filter_by_nodes else current_node_states)

    def get_node_addr_host(self):
        """Return a list of nodename, nodeaddr, nodehostname entries."""
        # q1-dy-c5xlarge-1 172.31.4.241 q1-dy-c5xlarge-1
        # q1-dy-c5xlarge-2 172.31.4.136 q1-dy-c5xlarge-2
        # q1-dy-c5xlarge-3 q1-dy-c5xlarge-3 q1-dy-c5xlarge-3
        return self._remote_command_executor.run_remote_command(
            "/opt/slurm/bin/sinfo -O NodeList:' ',NodeAddr:' ',NodeHost:' ' -N -h | awk '{print$1, $2, $3}'"
        ).stdout.splitlines()

    def submit_command_and_assert_job_accepted(self, submit_command_args):
        """Submit a command and assert the job is accepted by scheduler."""
        result = self.submit_command(**submit_command_args)
        return self.assert_job_submitted(result.stdout)

    def get_partition_state(self, partition):
        """Get the state of the partition."""
        return self._remote_command_executor.run_remote_command(
            f'/opt/slurm/bin/scontrol show partition={partition} | grep -oP "State=\\K(\\S+)"'
        ).stdout

    @retry(wait_fixed=seconds(20), stop_max_delay=minutes(8))
    def wait_job_running(self, job_id):
        """Wait till job starts running."""
        result = self._remote_command_executor.run_remote_command(
            "scontrol show jobs -o {0}".format(job_id))
        assert_that(result.stdout).contains("JobState=RUNNING")
def publish_compute_nodes_metric(scheduler_commands, max_monitoring_time,
                                 region, cluster_name):
    logging.info("Monitoring scheduler status and publishing metrics")
    cw_client = boto3.client("cloudwatch", region_name=region)
    compute_nodes_time_series = []
    ec2_nodes_time_series = []
    timestamps = [datetime.datetime.utcnow()]

    @retry(
        # Retry until EC2 and Scheduler capacities scale down to 0
        # Also make sure cluster scaled up before scaling down
        retry_on_result=lambda _: ec2_nodes_time_series[-1] != 0 or
        compute_nodes_time_series[-1] != 0 or max(
            ec2_nodes_time_series) == 0 or max(compute_nodes_time_series) == 0,
        wait_fixed=seconds(20),
        stop_max_delay=max_monitoring_time,
    )
    def _watch_compute_nodes_allocation():
        try:
            compute_nodes = scheduler_commands.compute_nodes_count()
            logging.info(
                "Publishing schedueler compute metric: count={0}".format(
                    compute_nodes))
            cw_client.put_metric_data(
                Namespace="ParallelCluster/benchmarking/{cluster_name}".format(
                    cluster_name=cluster_name),
                MetricData=[{
                    "MetricName": "ComputeNodesCount",
                    "Value": compute_nodes,
                    "Unit": "Count"
                }],
            )
            ec2_instances_count = len(
                _describe_cluster_instances(cluster_name,
                                            region,
                                            filter_by_node_type="Compute"))
            logging.info("Publishing EC2 compute metric: count={0}".format(
                ec2_instances_count))
            cw_client.put_metric_data(
                Namespace="ParallelCluster/benchmarking/{cluster_name}".format(
                    cluster_name=cluster_name),
                MetricData=[{
                    "MetricName": "EC2NodesCount",
                    "Value": ec2_instances_count,
                    "Unit": "Count"
                }],
            )
            # add values only if there is a transition.
            if (len(ec2_nodes_time_series) == 0
                    or ec2_nodes_time_series[-1] != ec2_instances_count
                    or compute_nodes_time_series[-1] != compute_nodes):
                ec2_nodes_time_series.append(ec2_instances_count)
                compute_nodes_time_series.append(compute_nodes)
                timestamps.append(datetime.datetime.utcnow())
        except Exception as e:
            logging.warning(
                "Failed while watching nodes allocation with exception: %s", e)
            raise

    try:
        _watch_compute_nodes_allocation()
    except RetryError:
        # ignoring this error in order to perform assertions on the collected data.
        pass

    end_time = datetime.datetime.utcnow()
    logging.info(
        "Monitoring completed: compute_nodes_time_series [ %s ], timestamps [ %s ]",
        " ".join(map(str, compute_nodes_time_series)),
        " ".join(map(str, timestamps)),
    )
    logging.info(
        "Sleeping for 3 minutes to wait for the metrics to propagate...")
    sleep(180)

    return compute_nodes_time_series, timestamps, end_time
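
# A hedged usage sketch, not part of the original example: a benchmarking test
# would call the helper above and then assert on the collected time series. It
# assumes the scheduler_commands and cluster fixtures used elsewhere in these
# snippets, and that expected_max_nodes is the scale the workload should reach.
def _benchmark_scaling_sketch(scheduler_commands, region, cluster, expected_max_nodes):
    compute_nodes_time_series, _timestamps, _end_time = publish_compute_nodes_metric(
        scheduler_commands,
        max_monitoring_time=minutes(20),
        region=region,
        cluster_name=cluster.cfn_name,
    )
    # The cluster should have reached the expected size and scaled back to 0.
    assert_that(max(compute_nodes_time_series)).is_equal_to(expected_max_nodes)
    assert_that(compute_nodes_time_series[-1]).is_equal_to(0)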
Example #15
class AWSBatchCommands(SchedulerCommands):
    """Implement commands for awsbatch scheduler."""
    def __init__(self, remote_command_executor):
        super().__init__(remote_command_executor)

    @retry(
        retry_on_result=lambda result: "FAILED" not in result and any(
            status != "SUCCEEDED" for status in result),
        wait_fixed=seconds(7),
        stop_max_delay=minutes(15),
    )
    def wait_job_completed(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "awsbstat -d {0}".format(job_id), log_output=True)
        return re.findall(r"status\s+: (.+)", result.stdout)

    def get_job_exit_status(self, job_id):  # noqa: D102
        return self.wait_job_completed(job_id)

    def assert_job_submitted(self, awsbsub_output):  # noqa: D102
        __tracebackhide__ = True
        match = re.search(r"Job ([a-z0-9\-]{36}) \(.+\) has been submitted.",
                          awsbsub_output)
        assert_that(match).is_not_none()
        return match.group(1)

    def submit_command(self, command, nodes=1, slots=None):  # noqa: D102
        return self._remote_command_executor.run_remote_command(
            'echo "{0}" | awsbsub -n {1}'.format(command, nodes))

    def submit_script(self,
                      script,
                      script_args=None,
                      nodes=1,
                      additional_files=None,
                      slots=None):  # noqa: D102
        raise NotImplementedError

    def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
        __tracebackhide__ = True
        status = self.get_job_exit_status(job_id)
        assert_that(status).is_length(1 + children_number)
        assert_that(status).contains_only("SUCCEEDED")

    def compute_nodes_count(self):  # noqa: D102
        raise NotImplementedError

    def get_compute_nodes(self):  # noqa: D102
        raise NotImplementedError

    def wait_for_locked_node(self):  # noqa: D102
        raise NotImplementedError

    def get_node_cores(self):  # noqa: D102
        raise NotImplementedError

    def set_nodes_state(self, compute_nodes, state):
        """Not implemented."""
        raise NotImplementedError

    def get_nodes_status(self):
        """Not implemented."""
        raise NotImplementedError
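
# A hedged usage sketch, not part of the original example: with the awsbatch
# scheduler the status returned by wait_job_completed is a list (one entry per
# job/child), and assert_job_succeeded verifies every entry is SUCCEEDED.
# remote_command_executor is assumed to be a connected RemoteCommandExecutor.
def _run_and_check_awsbatch_job(remote_command_executor):
    scheduler_commands = AWSBatchCommands(remote_command_executor)
    result = scheduler_commands.submit_command("sleep 60")
    job_id = scheduler_commands.assert_job_submitted(result.stdout)
    scheduler_commands.wait_job_completed(job_id)
    scheduler_commands.assert_job_succeeded(job_id)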