Exemplo n.º 1
0
    def test_drain_machine(self):
        """Check the ``condor_drain`` invocation and its failure handling.

        The test expects ``drain_machine`` to

        * issue ``condor_drain`` for the slot belonging to the drone,
        * return ``None`` for a drone that is not known,
        * log a warning and return ``None`` when the command fails with
          exit code 1,
        * log a critical message and re-raise ``CommandExecutionFailure``
          when the command fails with exit code 2.
        """
        run_command = self.mock_executor.return_value.run_command

        run_async(self.htcondor_adapter.drain_machine, drone_uuid="test")
        run_command.assert_called_with(
            "condor_drain -pool my-htcondor.local -test -graceful slot1@test")

        self.mock_executor.reset_mock()

        run_async(self.htcondor_adapter.drain_machine, drone_uuid="test_uuid")
        run_command.assert_called_with(
            "condor_drain -pool my-htcondor.local -test -graceful slot1@test_uuid@test"
        )
        self.assertIsNone(
            run_async(self.htcondor_adapter.drain_machine,
                      drone_uuid="not_exists"))

        # The injected failure must not outlive this test, so it is removed in
        # a ``finally`` clause even when one of the assertions below fails.
        try:
            run_command.side_effect = CommandExecutionFailure(
                message="Does not exists",
                exit_code=1,
                stderr="Does not exists")
            with self.assertLogs(level=logging.WARNING):
                self.assertIsNone(
                    run_async(self.htcondor_adapter.drain_machine,
                              drone_uuid="test"))

            run_command.side_effect = CommandExecutionFailure(
                message="Unhandled error",
                exit_code=2,
                stderr="Unhandled error")

            # assertLogs has to be the outer context: it skips its record
            # check when an exception propagates through it, so the exception
            # must already be consumed by assertRaises at that point.
            with self.assertLogs(level=logging.CRITICAL):
                with self.assertRaises(CommandExecutionFailure):
                    run_async(self.htcondor_adapter.drain_machine,
                              drone_uuid="test")
        finally:
            run_command.side_effect = None
Exemplo n.º 2
0
    def test_exception_handling(self):
        """Verify which exception leaves ``handle_exceptions`` for each input.

        Each entry pairs an exception raised inside the Moab adapter's
        ``handle_exceptions`` context with the exception type that is
        expected to come out of it.
        """
        translations = (
            (asyncio.TimeoutError(), TardisTimeout),
            (
                asyncssh.Error(code=255, reason="Test", lang="Test"),
                TardisResourceStatusUpdateFailed,
            ),
            (IndexError, TardisResourceStatusUpdateFailed),
            (TardisResourceStatusUpdateFailed,
             TardisResourceStatusUpdateFailed),
            (
                CommandExecutionFailure(
                    message="Run test command",
                    exit_code=1,
                    stdout="Test",
                    stderr="Test",
                ),
                TardisResourceStatusUpdateFailed,
            ),
            (Exception, TardisError),
        )

        for raised, expected in translations:
            with self.assertRaises(expected), \
                    self.moab_adapter.handle_exceptions():
                raise raised
Exemplo n.º 3
0
    def test_exception_handling(self):
        """Verify which exception leaves ``handle_exceptions`` for each input.

        Each entry pairs an exception raised inside the Slurm adapter's
        ``handle_exceptions`` context with the exception type that is
        expected to come out of it.
        """
        command_failure = CommandExecutionFailure(message="Test",
                                                  exit_code=255,
                                                  stdout="Test",
                                                  stderr="Test")
        translations = (
            (asyncio.TimeoutError(), TardisTimeout),
            (command_failure, TardisResourceStatusUpdateFailed),
            (TardisResourceStatusUpdateFailed,
             TardisResourceStatusUpdateFailed),
            (Exception, TardisError),
        )

        for raised, expected in translations:
            with self.assertRaises(expected), \
                    self.slurm_adapter.handle_exceptions():
                raise raised
Exemplo n.º 4
0
    def test_drain_machine(self):
        """Check the ``scontrol`` drain command and the propagation of failures.

        The test expects ``drain_machine`` to issue an ``scontrol update``
        that sets the node to ``DRAIN``, to return ``None`` for an unknown
        drone, and to let a ``CommandExecutionFailure`` of the underlying
        command propagate to the caller.
        """
        run_async(self.slurm_adapter.drain_machine, drone_uuid="VM-1")
        self.mock_async_run_command.assert_called_with(
            "scontrol update NodeName=host-10-18-1-1 State=DRAIN Reason='COBalD/TARDIS'"
        )

        self.mock_async_run_command.reset_mock()

        self.assertIsNone(
            run_async(self.slurm_adapter.drain_machine,
                      drone_uuid="not_exists"))

        self.mock_async_run_command.side_effect = CommandExecutionFailure(
            message="Does not exists", exit_code=1, stderr="Does not exists")
        # Remove the injected failure even if the assertion fails, so that it
        # cannot leak out of this test.
        try:
            with self.assertRaises(CommandExecutionFailure):
                run_async(self.slurm_adapter.drain_machine,
                          drone_uuid="idle_m")
        finally:
            self.mock_async_run_command.side_effect = None
Exemplo n.º 5
0
    def test_get_machine_status(self):
        """Check the machine status mapping and a failing status update.

        First, ``get_machine_status`` is queried for a set of drones and the
        result is compared with the expected ``MachineStatus``. Afterwards the
        executor is made to fail and ``slurm_status_updater`` is expected to
        log a warning and re-raise the ``CommandExecutionFailure``.
        """
        expected_states = {
            "VM-1": MachineStatus.Available,
            "not_exists": MachineStatus.NotAvailable,
            "draining_m": MachineStatus.Draining,
            "idle_m": MachineStatus.Available,
            "drained_m": MachineStatus.NotAvailable,
            "pwr_up_m": MachineStatus.NotAvailable,
        }

        for machine, state in expected_states.items():
            self.assertEqual(
                run_async(self.slurm_adapter.get_machine_status, drone_uuid=machine),
                state,
            )

        self.mock_executor.reset_mock()

        run_command = self.mock_executor.return_value.run_command
        run_command.side_effect = CommandExecutionFailure(
            message="Test", exit_code=123, stderr="Test"
        )

        # Remove the injected failure even if an assertion fails, so that it
        # cannot leak out of this test.
        try:
            attributes = {
                "Machine": "Machine",
                "State": "State",
                "Activity": "Activity",
                "TardisDroneUuid": "TardisDroneUuid",
            }
            with self.assertLogs(level="WARN"):
                with self.assertRaises(CommandExecutionFailure):
                    run_async(
                        partial(
                            slurm_status_updater,
                            self.config.BatchSystem.options,
                            attributes,
                            self.mock_executor.return_value,
                        )
                    )
                    # NOTE(review): only reached when the call above does NOT
                    # raise, i.e. when the test is failing anyway - confirm
                    # whether this check was meant to run after the
                    # assertRaises block.
                    run_command.assert_called_with(self.command)
        finally:
            run_command.side_effect = None
Exemplo n.º 6
0
class TestSlurmAdapter(TestCase):
    """Tests for ``SlurmAdapter`` with ``Configuration`` and ``ShellExecutor`` patched."""

    mock_config_patcher = None
    mock_executor_patcher = None

    def check_attribute_dicts(self,
                              expected_attributes,
                              returned_attributes,
                              exclude=tuple()):
        """Assert that both objects agree on every expected attribute.

        :param expected_attributes: mapping whose keys define what is compared
        :param returned_attributes: object that must expose equal attributes
        :param exclude: keys of ``expected_attributes`` that are not compared
        """
        for key in expected_attributes:
            if key in exclude:
                continue
            self.assertEqual(getattr(returned_attributes, key),
                             getattr(expected_attributes, key))

    @classmethod
    def setUpClass(cls):
        """Patch ``Configuration`` and ``ShellExecutor`` for all tests of the class."""
        cls.mock_config_patcher = patch(
            "tardis.interfaces.siteadapter.Configuration")
        cls.mock_config = cls.mock_config_patcher.start()
        cls.mock_executor_patcher = patch(
            "tardis.adapters.sites.slurm.ShellExecutor")
        cls.mock_executor = cls.mock_executor_patcher.start()

    @classmethod
    def tearDownClass(cls):
        """Undo the patches installed by ``setUpClass``."""
        cls.mock_config_patcher.stop()
        cls.mock_executor_patcher.stop()

    def setUp(self):
        """Build the mocked ``TestSite`` configuration and a fresh adapter."""
        config = self.mock_config.return_value
        # ``spec`` restricts the site configuration to the listed attributes.
        config.TestSite = MagicMock(spec=[
            "MachineMetaData",
            "StatusUpdate",
            "MachineTypeConfiguration",
            "executor",
        ])
        self.test_site_config = config.TestSite
        self.test_site_config.MachineMetaData = self.machine_meta_data
        self.test_site_config.StatusUpdate = 10
        self.test_site_config.MachineTypeConfiguration = self.machine_type_configuration
        self.test_site_config.executor = self.mock_executor.return_value

        self.slurm_adapter = SlurmAdapter(machine_type="test2large",
                                          site_name="TestSite")

    def tearDown(self):
        """Nothing to clean up per test."""
        pass

    @property
    def machine_meta_data(self):
        """Return a fresh machine meta data configuration for ``test2large``."""
        return AttributeDict(
            test2large=AttributeDict(Cores=20, Memory=62, Disk=100))

    @property
    def machine_type_configuration(self):
        """Return a fresh machine type configuration for ``test2large``."""
        return AttributeDict(test2large=AttributeDict(
            Partition="normal", StartupCommand="pilot.sh", Walltime="60"))

    @property
    def resource_attributes(self):
        """Return fresh resource attributes of a booting drone."""
        return AttributeDict(
            machine_type="test2large",
            site_name="TestSite",
            remote_resource_uuid=1390065,
            resource_status=ResourceStatus.Booting,
            drone_uuid="testsite-1390065",
        )

    def test_start_up_command_deprecation_warning(self):
        """A missing ``StartupCommand`` fails; a site-level one warns as deprecated."""
        # Necessary to avoid annoying message in PyCharm
        filterwarnings(action="ignore",
                       message="unclosed",
                       category=ResourceWarning)
        del self.test_site_config.MachineTypeConfiguration.test2large.StartupCommand

        with self.assertRaises(AttributeError):
            self.slurm_adapter = SlurmAdapter(machine_type="test2large",
                                              site_name="TestSite")

        self.test_site_config.StartupCommand = "pilot.sh"

        with self.assertWarns(DeprecationWarning):
            self.slurm_adapter = SlurmAdapter(machine_type="test2large",
                                              site_name="TestSite")

    @mock_executor_run_command(TEST_DEPLOY_RESOURCE_RESPONSE)
    def test_deploy_resource(self):
        """Check the returned attributes and the ``sbatch`` command line.

        The command line is checked for an integer memory value and for two
        fractional ones.
        """
        expected_resource_attributes = self.resource_attributes
        expected_resource_attributes.update(created=datetime.now(),
                                            updated=datetime.now())

        resource_attributes = AttributeDict(
            machine_type="test2large",
            site_name="TestSite",
            obs_machine_meta_data_translation_mapping=AttributeDict(
                Cores=1,
                Memory=1024,
                Disk=1024,
            ),
            drone_uuid="testsite-1390065",
        )

        returned_resource_attributes = run_async(
            self.slurm_adapter.deploy_resource, resource_attributes)

        self.assertLess(
            returned_resource_attributes.created -
            expected_resource_attributes.created,
            timedelta(seconds=1),
        )

        self.check_attribute_dicts(
            expected_resource_attributes,
            returned_resource_attributes,
            exclude=("created", "updated"),
        )

        self.mock_executor.return_value.run_command.assert_called_with(
            "sbatch -p normal -N 1 -n 20 -t 60 --mem=63488mb --export=SLURM_Walltime=60,TardisDroneCores=20,TardisDroneMemory=63488,TardisDroneDisk=102400,TardisDroneUuid=testsite-1390065 pilot.sh"  # noqa: B950
        )

        self.mock_executor.reset_mock()

        self.test_site_config.MachineMetaData.test2large.Memory = 2.5

        run_async(self.slurm_adapter.deploy_resource, resource_attributes)

        self.mock_executor.return_value.run_command.assert_called_with(
            "sbatch -p normal -N 1 -n 20 -t 60 --mem=2560mb --export=SLURM_Walltime=60,TardisDroneCores=20,TardisDroneMemory=2560,TardisDroneDisk=102400,TardisDroneUuid=testsite-1390065 pilot.sh"  # noqa: B950
        )

        self.mock_executor.reset_mock()

        self.test_site_config.MachineMetaData.test2large.Memory = 2.546372129

        run_async(self.slurm_adapter.deploy_resource, resource_attributes)

        self.mock_executor.return_value.run_command.assert_called_with(
            "sbatch -p normal -N 1 -n 20 -t 60 --mem=2607mb --export=SLURM_Walltime=60,TardisDroneCores=20,TardisDroneMemory=2607,TardisDroneDisk=102400,TardisDroneUuid=testsite-1390065 pilot.sh"  # noqa: B950
        )

    @mock_executor_run_command(TEST_DEPLOY_RESOURCE_RESPONSE)
    def test_deploy_resource_w_submit_options(self):
        """Configured ``SubmitOptions`` must show up in the ``sbatch`` command line."""
        self.test_site_config.MachineTypeConfiguration.test2large.SubmitOptions = (
            AttributeDict(long=AttributeDict(gres="tmp:1G")))

        slurm_adapter = SlurmAdapter(machine_type="test2large",
                                     site_name="TestSite")

        run_async(
            slurm_adapter.deploy_resource,
            resource_attributes=AttributeDict(
                machine_type="test2large",
                site_name="TestSite",
                obs_machine_meta_data_translation_mapping=AttributeDict(
                    Cores=1,
                    Memory=1000,
                    Disk=1000,
                ),
                drone_uuid="testsite-1390065",
            ),
        )

        self.mock_executor.return_value.run_command.assert_called_with(
            "sbatch -p normal -N 1 -n 20 -t 60 --gres=tmp:1G --mem=63488mb --export=SLURM_Walltime=60,TardisDroneCores=20,TardisDroneMemory=62000,TardisDroneDisk=100000,TardisDroneUuid=testsite-1390065 pilot.sh"  # noqa: B950
        )

    def test_machine_meta_data(self):
        """The adapter exposes the meta data of its machine type."""
        self.assertEqual(self.slurm_adapter.machine_meta_data,
                         self.machine_meta_data["test2large"])

    def test_machine_type(self):
        """The adapter exposes the machine type it was created with."""
        self.assertEqual(self.slurm_adapter.machine_type, "test2large")

    def test_site_name(self):
        """The adapter exposes the site name it was created with."""
        self.assertEqual(self.slurm_adapter.site_name, "TestSite")

    @mock_executor_run_command(TEST_RESOURCE_STATUS_RESPONSE)
    def test_resource_status(self):
        """Check the attributes returned by ``resource_status`` and the ``squeue`` call."""
        expected_resource_attributes = self.resource_attributes
        expected_resource_attributes.update(updated=datetime.now())

        returned_resource_attributes = run_async(
            self.slurm_adapter.resource_status,
            resource_attributes=self.resource_attributes,
        )

        self.assertLess(
            (returned_resource_attributes.updated -
             expected_resource_attributes.updated),
            timedelta(seconds=1),
        )

        self.check_attribute_dicts(
            expected_resource_attributes,
            returned_resource_attributes,
            exclude=("created", "updated"),
        )

        self.mock_executor.return_value.run_command.assert_called_with(
            'squeue -o "%A|%N|%T" -h -t all')

    @mock_executor_run_command(TEST_RESOURCE_STATUS_RESPONSE_RUNNING)
    def test_update_resource_status(self):
        """A booting resource is reported as running; its drone uuid is kept."""
        self.assertEqual(self.resource_attributes["resource_status"],
                         ResourceStatus.Booting)

        return_resource_attributes = run_async(
            self.slurm_adapter.resource_status,
            resource_attributes=self.resource_attributes,
        )

        self.assertEqual(return_resource_attributes["resource_status"],
                         ResourceStatus.Running)

        self.assertEqual(
            return_resource_attributes["drone_uuid"],
            self.resource_attributes["drone_uuid"],
        )

        self.mock_executor.return_value.run_command.assert_called_with(
            'squeue -o "%A|%N|%T" -h -t all')

    @mock_executor_run_command(TEST_RESOURCE_STATUS_RESPONSE_ALL_STATES)
    def test_resource_state_translation(self):
        """Check the ``ResourceStatus`` expected for every listed Slurm job state.

        The i-th entry of the mapping is looked up through the job id
        ``(1000 + i) * 1000``. All look-ups together are expected to trigger
        exactly one ``squeue`` call.
        """
        state_translations = {
            "BOOT_FAIL": ResourceStatus.Error,
            "CANCELLED": ResourceStatus.Deleted,
            "COMPLETED": ResourceStatus.Deleted,
            "CONFIGURING": ResourceStatus.Booting,
            "COMPLETING": ResourceStatus.Running,
            "DEADLINE": ResourceStatus.Error,
            "FAILED": ResourceStatus.Error,
            "NODE_FAIL": ResourceStatus.Error,
            "OUT_OF_MEMORY": ResourceStatus.Error,
            "PENDING": ResourceStatus.Booting,
            "PREEMPTED": ResourceStatus.Deleted,
            "RUNNING": ResourceStatus.Running,
            "RESV_DEL_HOLD": ResourceStatus.Stopped,
            "REQUEUE_FED": ResourceStatus.Booting,
            "REQUEUE_HOLD": ResourceStatus.Booting,
            "REQUEUED": ResourceStatus.Booting,
            "RESIZING": ResourceStatus.Running,
            "REVOKED": ResourceStatus.Error,
            "SIGNALING": ResourceStatus.Running,
            "SPECIAL_EXIT": ResourceStatus.Booting,
            "STAGE_OUT": ResourceStatus.Running,
            "STOPPED": ResourceStatus.Stopped,
            "SUSPENDED": ResourceStatus.Stopped,
            "TIMEOUT": ResourceStatus.Error,
        }

        run_command = self.mock_executor.return_value.run_command
        # The executor mock is created once per class, so calls recorded
        # before this point are subtracted instead of assuming a clean mock.
        calls_before = run_command.call_count

        for index, expected_status in enumerate(state_translations.values()):
            job_id = (index + 1000) * 1000
            returned_resource_attributes = run_async(
                self.slurm_adapter.resource_status,
                AttributeDict(remote_resource_uuid=job_id),
            )
            self.assertEqual(returned_resource_attributes.resource_status,
                             expected_status)

        # ``called_once()`` is not a mock assertion (it checks nothing and is
        # rejected with AttributeError by Python 3.12+), so count the calls.
        self.assertEqual(run_command.call_count - calls_before, 1)

        run_command.assert_called_with('squeue -o "%A|%N|%T" -h -t all')

    def test_resource_status_raise_update_failed(self):
        """An unknown job created after the last update raises an update failure."""
        # Update interval is 10 minutes, so turn back last update by 2 minutes
        # and creation date to current date, so that
        # TardisResourceStatusUpdateFailed is raised
        created_timestamp = datetime.now()
        new_timestamp = datetime.now() - timedelta(minutes=2)

        self.slurm_adapter._slurm_status._last_update = new_timestamp

        with self.assertRaises(TardisResourceStatusUpdateFailed):
            run_async(
                self.slurm_adapter.resource_status,
                AttributeDict(
                    remote_resource_uuid=1351043,
                    resource_state=ResourceStatus.Booting,
                    created=created_timestamp,
                ),
            )

    @mock_executor_run_command("")
    def test_resource_status_of_completed_jobs(self):
        """A job missing from the ``squeue`` output is reported as deleted."""
        # Update interval is 10 minutes, so turn back last update by 11 minutes
        # and creation date to 12 minutes ago. => squeue should be executed
        # The empty string returned by squeue represents a resource is already
        # gone. So, ResourceStatus returned should be Deleted.
        past_timestamp = datetime.now() - timedelta(minutes=12)
        new_timestamp = datetime.now() - timedelta(minutes=11)
        self.slurm_adapter._slurm_status._last_update = new_timestamp

        response = run_async(
            self.slurm_adapter.resource_status,
            AttributeDict(
                resource_id="1390065",
                remote_resource_uuid="1351043",
                created=past_timestamp,
            ),
        )

        self.assertEqual(response.resource_status, ResourceStatus.Deleted)

        self.mock_executor.return_value.run_command.assert_called_with(
            'squeue -o "%A|%N|%T" -h -t all')

    @mock_executor_run_command(
        stdout="",
        raise_exception=CommandExecutionFailure(message="Failed",
                                                stdout="Failed",
                                                stderr="Failed",
                                                exit_code=2),
    )
    def test_resource_status_update_failed(self):
        """A failing ``squeue`` logs a warning and the previous data is used."""
        # set previous data, should be returned when update fails
        self.slurm_adapter._slurm_status._data = {
            "1390065": {
                "JobId": "1390065",
                "Host": "fh2n1552",
                "State": "RUNNING"
            }
        }

        with self.assertLogs(level=logging.WARNING):
            response = run_async(
                self.slurm_adapter.resource_status,
                AttributeDict(remote_resource_uuid="1390065"),
            )

        self.check_attribute_dicts(
            AttributeDict(
                remote_resource_uuid=1390065,
                resource_status=ResourceStatus.Running,
                updated=datetime.now(),
            ),
            response,
            exclude=("updated", ),
        )

        self.mock_executor.return_value.run_command.assert_called_with(
            'squeue -o "%A|%N|%T" -h -t all')

    @mock_executor_run_command(stdout="", stderr="", exit_code=0)
    def test_stop_resource(self):
        """Stopping a resource cancels the corresponding job via ``scancel``."""
        run_async(
            self.slurm_adapter.stop_resource,
            resource_attributes=self.resource_attributes,
        )

        self.mock_executor.return_value.run_command.assert_called_with(
            "scancel 1390065")

    @mock_executor_run_command(stdout="", stderr="", exit_code=0)
    def test_terminate_resource(self):
        """Terminating a resource cancels the corresponding job via ``scancel``."""
        run_async(
            self.slurm_adapter.terminate_resource,
            resource_attributes=self.resource_attributes,
        )

        self.mock_executor.return_value.run_command.assert_called_with(
            "scancel 1390065")

    def test_exception_handling(self):
        """Verify which exception leaves ``handle_exceptions`` for each input."""
        def test_exception_handling(to_raise, to_catch):
            # NOTE(review): the expected exception propagates through
            # assertLogs, which appears to skip its record check in that
            # case - confirm whether the log assertion is meant to be
            # effective here.
            with self.assertRaises(to_catch):
                with self.assertLogs(level=logging.WARNING):
                    with self.slurm_adapter.handle_exceptions():
                        raise to_raise

        matrix = [
            (asyncio.TimeoutError(), TardisTimeout),
            (
                CommandExecutionFailure(message="Test",
                                        exit_code=255,
                                        stdout="Test",
                                        stderr="Test"),
                TardisResourceStatusUpdateFailed,
            ),
            (TardisResourceStatusUpdateFailed,
             TardisResourceStatusUpdateFailed),
            (Exception, TardisError),
        ]

        for to_raise, to_catch in matrix:
            test_exception_handling(to_raise, to_catch)
Exemplo n.º 7
0
    def test_get_machine_status(self):
        """Check the machine status mapping and a failing status update.

        ``get_machine_status`` is queried for several drones and compared with
        the expected ``MachineStatus``; the first query is additionally
        expected to run ``self.command``. Afterwards the executor is made to
        fail and ``htcondor_status_updater`` is expected to log a warning and
        re-raise the ``CommandExecutionFailure``.
        """
        run_command = self.mock_executor.return_value.run_command

        self.assertEqual(
            run_async(self.htcondor_adapter.get_machine_status,
                      drone_uuid="test"),
            MachineStatus.Available,
        )
        run_command.assert_called_with(self.command)
        self.mock_executor.reset_mock()

        expected_states = (
            ("not_exists", MachineStatus.NotAvailable),
            ("test_drain", MachineStatus.Draining),
            ("test_drained", MachineStatus.Drained),
            ("test_owner", MachineStatus.NotAvailable),
            ("test_uuid", MachineStatus.Available),
        )
        for drone_uuid, expected_status in expected_states:
            self.assertEqual(
                run_async(self.htcondor_adapter.get_machine_status,
                          drone_uuid=drone_uuid),
                expected_status,
            )
            self.mock_executor.reset_mock()

        run_command.side_effect = CommandExecutionFailure(message="Test",
                                                          exit_code=123,
                                                          stderr="Test")
        # Remove the injected failure even if an assertion fails, so that it
        # cannot leak out of this test.
        try:
            attributes = {
                "Machine": "Machine",
                "State": "State",
                "Activity": "Activity",
                "TardisDroneUuid": "TardisDroneUuid",
            }
            # Escape htcondor expressions and add them to attributes
            attributes.update({
                key: quote(value)
                for key, value in self.config.BatchSystem.ratios.items()
            })
            with self.assertLogs(level=logging.WARNING):
                with self.assertRaises(CommandExecutionFailure):
                    run_async(
                        partial(
                            htcondor_status_updater,
                            self.config.BatchSystem.options,
                            attributes,
                            self.mock_executor.return_value,
                        ))
                    # NOTE(review): only reached when the call above does NOT
                    # raise, i.e. when the test is failing anyway - confirm
                    # whether this check was meant to run after the
                    # assertRaises block.
                    run_command.assert_called_with(self.command)
        finally:
            run_command.side_effect = None
Exemplo n.º 8
0
class TestHTCondorSiteAdapter(TestCase):
    """Tests for ``HTCondorAdapter`` with ``Configuration`` and ``ShellExecutor`` patched."""

    mock_config_patcher = None
    mock_executor_patcher = None

    @classmethod
    def setUpClass(cls):
        """Patch ``Configuration`` and ``ShellExecutor`` for all tests of the class."""
        cls.mock_config_patcher = patch(
            "tardis.adapters.sites.htcondor.Configuration")
        cls.mock_config = cls.mock_config_patcher.start()
        cls.mock_executor_patcher = patch(
            "tardis.adapters.sites.htcondor.ShellExecutor")
        cls.mock_executor = cls.mock_executor_patcher.start()

    @classmethod
    def tearDownClass(cls):
        """Undo the patches installed by ``setUpClass``."""
        cls.mock_config_patcher.stop()
        cls.mock_executor_patcher.stop()

    def setUp(self):
        """Fill the mocked ``TestSite`` configuration and create a fresh adapter."""
        config = self.mock_config.return_value
        test_site_config = config.TestSite
        test_site_config.MachineMetaData = self.machine_meta_data
        test_site_config.MachineTypeConfiguration = self.machine_type_configuration
        test_site_config.executor = self.mock_executor.return_value
        test_site_config.max_age = 10

        self.adapter = HTCondorAdapter(machine_type="test2large",
                                       site_name="TestSite")

    @property
    def machine_meta_data(self):
        """Return fresh machine meta data; ``testunkownresource`` carries an extra ``Foo``."""
        return AttributeDict(
            test2large=AttributeDict(Cores=8, Memory=32, Disk=160),
            testunkownresource=AttributeDict(Cores=8,
                                             Memory=32,
                                             Disk=160,
                                             Foo=3),
        )

    @property
    def machine_type_configuration(self):
        """Return a fresh machine type configuration pointing to the test JDL."""
        return AttributeDict(
            test2large=AttributeDict(jdl="tests/data/submit.jdl"),
            testunkownresource=AttributeDict(jdl="tests/data/submit.jdl"),
        )

    @mock_executor_run_command(stdout=CONDOR_SUBMIT_OUTPUT)
    def test_deploy_resource(self):
        """Check the submitted JDL, the returned job id and the timestamps."""
        response = run_async(self.adapter.deploy_resource,
                             AttributeDict(drone_uuid="test-123"))
        self.assertEqual(response.remote_resource_uuid, "1351043")

        # Compare absolute differences: ``timestamp - now() > 1s`` can never
        # be true for a timestamp taken in the past and would check nothing.
        now = datetime.now()
        self.assertLess(abs(now - response.created), timedelta(seconds=1))
        self.assertLess(abs(now - response.updated), timedelta(seconds=1))

        self.mock_executor.return_value.run_command.assert_called_with(
            "condor_submit", stdin_input=CONDOR_SUBMIT_JDL)
        # ``reset()`` is not part of the mock API and would reset nothing.
        self.mock_executor.reset_mock()

    def test_translate_resources_raises_logs(self):
        """A machine type with an unknown resource logs an error and raises ``KeyError``."""
        self.adapter = HTCondorAdapter(machine_type="testunkownresource",
                                       site_name="TestSite")
        with self.assertLogs(logging.getLogger(), logging.ERROR):
            with self.assertRaises(KeyError):
                run_async(self.adapter.deploy_resource,
                          AttributeDict(drone_uuid="test-123"))

    def test_machine_meta_data(self):
        """The adapter exposes the meta data of its machine type."""
        self.assertEqual(self.adapter.machine_meta_data,
                         self.machine_meta_data.test2large)

    def test_machine_type(self):
        """The adapter exposes the machine type it was created with."""
        self.assertEqual(self.adapter.machine_type, "test2large")

    def test_site_name(self):
        """The adapter exposes the site name it was created with."""
        self.assertEqual(self.adapter.site_name, "TestSite")

    @mock_executor_run_command(stdout=CONDOR_Q_OUTPUT_IDLE)
    def test_resource_status_idle(self):
        """The idle ``condor_q`` output is expected to map to ``Booting``."""
        response = run_async(self.adapter.resource_status,
                             AttributeDict(remote_resource_uuid="1351043"))
        self.assertEqual(response.resource_status, ResourceStatus.Booting)

    @mock_executor_run_command(stdout=CONDOR_Q_OUTPUT_RUN)
    def test_resource_status_run(self):
        """The run ``condor_q`` output is expected to map to ``Running``."""
        response = run_async(self.adapter.resource_status,
                             AttributeDict(remote_resource_uuid="1351043"))
        self.assertEqual(response.resource_status, ResourceStatus.Running)

    @mock_executor_run_command(stdout=CONDOR_Q_OUTPUT_REMOVING)
    def test_resource_status_removing(self):
        """The removing ``condor_q`` output is expected to map to ``Running``."""
        response = run_async(self.adapter.resource_status,
                             AttributeDict(remote_resource_uuid="1351043"))
        self.assertEqual(response.resource_status, ResourceStatus.Running)

    @mock_executor_run_command(stdout=CONDOR_Q_OUTPUT_COMPLETED)
    def test_resource_status_completed(self):
        """The completed ``condor_q`` output is expected to map to ``Deleted``."""
        response = run_async(self.adapter.resource_status,
                             AttributeDict(remote_resource_uuid="1351043"))
        self.assertEqual(response.resource_status, ResourceStatus.Deleted)

    @mock_executor_run_command(stdout=CONDOR_Q_OUTPUT_HELD)
    def test_resource_status_held(self):
        """The held ``condor_q`` output is expected to map to ``Error``."""
        response = run_async(self.adapter.resource_status,
                             AttributeDict(remote_resource_uuid="1351043"))
        self.assertEqual(response.resource_status, ResourceStatus.Error)

    @mock_executor_run_command(stdout=CONDOR_Q_OUTPUT_TRANSFERING_OUTPUT)
    def test_resource_status_transfering_output(self):
        """The transferring-output ``condor_q`` output is expected to map to ``Running``."""
        response = run_async(self.adapter.resource_status,
                             AttributeDict(remote_resource_uuid="1351043"))
        self.assertEqual(response.resource_status, ResourceStatus.Running)

    # NOTE(review): the method name says "unexpanded" while the fixture is the
    # suspended output - confirm which of the two is intended.
    @mock_executor_run_command(stdout=CONDOR_Q_OUTPUT_SUSPENDED)
    def test_resource_status_unexpanded(self):
        """The suspended ``condor_q`` output is expected to map to ``Stopped``."""
        response = run_async(self.adapter.resource_status,
                             AttributeDict(remote_resource_uuid="1351043"))
        self.assertEqual(response.resource_status, ResourceStatus.Stopped)

    @mock_executor_run_command(
        stdout="",
        raise_exception=CommandExecutionFailure(message="Failed",
                                                stdout="Failed",
                                                stderr="Failed",
                                                exit_code=2),
    )
    def test_resource_status_raise_future(self):
        """A failing update for a resource created in the future raises an update failure."""
        future_timestamp = datetime.now() + timedelta(minutes=1)
        with self.assertLogs(logging.getLogger(), logging.ERROR):
            with self.assertRaises(TardisResourceStatusUpdateFailed):
                run_async(
                    self.adapter.resource_status,
                    AttributeDict(remote_resource_uuid="1351043",
                                  created=future_timestamp),
                )

    @mock_executor_run_command(
        stdout="",
        raise_exception=CommandExecutionFailure(message="Failed",
                                                stdout="Failed",
                                                stderr="Failed",
                                                exit_code=2),
    )
    def test_resource_status_raise_past(self):
        """A failing update for a resource created in the past reports ``Deleted``."""
        # Update interval is 10 minutes, so set last update back by 11 minutes
        # in order to execute condor_q command and creation date to 12 minutes
        # ago
        past_timestamp = datetime.now() - timedelta(minutes=12)
        self.adapter._htcondor_queue._last_update = datetime.now() - timedelta(
            minutes=11)
        with self.assertLogs(logging.getLogger(), logging.ERROR):
            response = run_async(
                self.adapter.resource_status,
                AttributeDict(remote_resource_uuid="1351043",
                              created=past_timestamp),
            )
        self.assertEqual(response.resource_status, ResourceStatus.Deleted)

    @mock_executor_run_command(stdout=CONDOR_SUSPEND_OUTPUT)
    def test_stop_resource(self):
        """Stopping a resource returns the id of the affected job."""
        response = run_async(self.adapter.stop_resource,
                             AttributeDict(remote_resource_uuid="1351043"))
        self.assertEqual(response.remote_resource_uuid, "1351043")

    @mock_executor_run_command(
        stdout="",
        raise_exception=CommandExecutionFailure(
            message=CONDOR_SUSPEND_FAILED_MESSAGE,
            exit_code=1,
            stderr=CONDOR_SUSPEND_FAILED_OUTPUT,
            stdout="",
            stdin="",
        ),
    )
    def test_stop_resource_failed_redo(self):
        """A stop failing with exit code 1 is reported as an update failure."""
        with self.assertRaises(TardisResourceStatusUpdateFailed):
            run_async(
                self.adapter.stop_resource,
                AttributeDict(remote_resource_uuid="1351043"),
            )

    @mock_executor_run_command(
        stdout="",
        raise_exception=CommandExecutionFailure(
            message=CONDOR_SUSPEND_FAILED_MESSAGE,
            exit_code=2,
            stderr=CONDOR_SUSPEND_FAILED_OUTPUT,
            stdout="",
            stdin="",
        ),
    )
    def test_stop_resource_failed_raise(self):
        """A stop failing with exit code 2 re-raises the command failure."""
        with self.assertRaises(CommandExecutionFailure):
            run_async(
                self.adapter.stop_resource,
                AttributeDict(remote_resource_uuid="1351043"),
            )

    @mock_executor_run_command(stdout=CONDOR_RM_OUTPUT)
    def test_terminate_resource(self):
        """Terminating a resource returns the id of the affected job."""
        response = run_async(
            self.adapter.terminate_resource,
            AttributeDict(remote_resource_uuid="1351043"),
        )
        self.assertEqual(response.remote_resource_uuid, "1351043")

    @mock_executor_run_command(
        stdout="",
        raise_exception=CommandExecutionFailure(
            message=CONDOR_RM_FAILED_MESSAGE,
            exit_code=1,
            stderr=CONDOR_RM_FAILED_OUTPUT,
            stdout="",
            stdin="",
        ),
    )
    def test_terminate_resource_failed_redo(self):
        """A termination failing with exit code 1 is reported as an update failure."""
        with self.assertRaises(TardisResourceStatusUpdateFailed):
            run_async(
                self.adapter.terminate_resource,
                AttributeDict(remote_resource_uuid="1351043"),
            )

    @mock_executor_run_command(
        stdout="",
        raise_exception=CommandExecutionFailure(
            message=CONDOR_RM_FAILED_MESSAGE,
            exit_code=2,
            stderr=CONDOR_RM_FAILED_OUTPUT,
            stdout="",
            stdin="",
        ),
    )
    def test_terminate_resource_failed_raise(self):
        """A termination failing with exit code 2 re-raises the command failure."""
        with self.assertRaises(CommandExecutionFailure):
            run_async(
                self.adapter.terminate_resource,
                AttributeDict(remote_resource_uuid="1351043"),
            )

    def test_exception_handling(self):
        """Verify which exception leaves ``handle_exceptions`` for each input."""
        def test_exception_handling(raise_it, catch_it):
            with self.assertRaises(catch_it):
                with self.adapter.handle_exceptions():
                    raise raise_it

        matrix = [
            (Exception, TardisError),
            (TardisResourceStatusUpdateFailed,
             TardisResourceStatusUpdateFailed),
        ]

        for to_raise, to_catch in matrix:
            test_exception_handling(to_raise, to_catch)
Exemplo n.º 9
0
class TestHTCondorSiteAdapter(TestCase):
    """Unit tests for HTCondorAdapter with its configuration and executor mocked.

    ``Configuration`` and ``ShellExecutor`` are patched inside
    ``tardis.adapters.sites.htcondor`` for the lifetime of the class, and every
    test gets a fresh adapter for machine type ``test2large`` on ``TestSite``.
    """
    mock_config_patcher = None
    mock_executor_patcher = None

    @classmethod
    def setUpClass(cls):
        # Patch the names where the htcondor module looks them up and keep the
        # patchers around so that tearDownClass can stop them again.
        cls.mock_config_patcher = patch('tardis.adapters.sites.htcondor.Configuration')
        cls.mock_config = cls.mock_config_patcher.start()
        cls.mock_executor_patcher = patch('tardis.adapters.sites.htcondor.ShellExecutor')
        cls.mock_executor = cls.mock_executor_patcher.start()

    @classmethod
    def tearDownClass(cls):
        cls.mock_config_patcher.stop()
        cls.mock_executor_patcher.stop()

    def setUp(self):
        # Fill in the mocked TestSite configuration before constructing the adapter.
        config = self.mock_config.return_value
        test_site_config = config.TestSite
        test_site_config.MachineMetaData = self.machine_meta_data
        test_site_config.MachineTypeConfiguration = self.machine_type_configuration
        test_site_config.executor = self.mock_executor.return_value
        test_site_config.max_age = 10

        self.adapter = HTCondorAdapter(machine_type='test2large', site_name='TestSite')

    @property
    def machine_meta_data(self):
        # A new AttributeDict is built on every access. 'testunkownresource'
        # carries an extra ``Foo`` entry and is the machine type used by
        # test_translate_resources_raises_logs, which expects a KeyError.
        return AttributeDict(test2large=AttributeDict(Cores=8, Memory=32),
                             testunkownresource=AttributeDict(Cores=8, Memory=32, Foo=3))

    @property
    def machine_type_configuration(self):
        return AttributeDict(test2large=AttributeDict(jdl='submit.jdl'),
                             testunkownresource=AttributeDict(jdl='submit.jdl'))

    @mock_executor_run_command(stdout=CONDOR_SUBMIT_OUTPUT)
    def test_deploy_resource(self):
        response = run_async(self.adapter.deploy_resource, AttributeDict(drone_uuid='test-123'))
        self.assertEqual(response.remote_resource_uuid, "1351043")
        # Timestamps must not lie more than one second after "now".
        self.assertFalse(response.created - datetime.now() > timedelta(seconds=1))
        self.assertFalse(response.updated - datetime.now() > timedelta(seconds=1))

        self.mock_executor.return_value.run_command.assert_called_with(
            'condor_submit -append "environment = TardisDroneUuid=test-123;TardisDroneCores=8;TardisDroneMemory=32768"'
            ' -a "request_cpus = 8" -a "request_memory = 32768" submit.jdl')
        # reset_mock() is the Mock API that clears recorded calls; a plain
        # ``reset()`` would merely call an auto-created child attribute.
        self.mock_executor.reset_mock()

    def test_translate_resources_raises_logs(self):
        # Deploying the machine type with the extra meta data entry must log an
        # error and raise a KeyError.
        self.adapter = HTCondorAdapter(machine_type='testunkownresource', site_name='TestSite')
        with self.assertLogs(logging.getLogger(), logging.ERROR):
            with self.assertRaises(KeyError):
                run_async(self.adapter.deploy_resource, AttributeDict(drone_uuid='test-123'))

    def test_machine_meta_data(self):
        self.assertEqual(self.adapter.machine_meta_data, self.machine_meta_data.test2large)

    def test_machine_type(self):
        self.assertEqual(self.adapter.machine_type, 'test2large')

    def test_site_name(self):
        self.assertEqual(self.adapter.site_name, 'TestSite')

    # The resource_status tests below each feed a different canned condor_q
    # output to the executor and check the ResourceStatus it is translated to.
    # Every test needs its own method name: a repeated name would rebind the
    # class attribute and only the last definition would be collected.

    @mock_executor_run_command(stdout=CONDOR_Q_OUTPUT_UNEXANPANDED)
    def test_resource_status_unexpanded(self):
        response = run_async(self.adapter.resource_status, AttributeDict(remote_resource_uuid="1351043"))
        self.assertEqual(response.resource_status, ResourceStatus.Error)

    @mock_executor_run_command(stdout=CONDOR_Q_OUTPUT_IDLE)
    def test_resource_status_idle(self):
        response = run_async(self.adapter.resource_status, AttributeDict(remote_resource_uuid="1351043"))
        self.assertEqual(response.resource_status, ResourceStatus.Booting)

    @mock_executor_run_command(stdout=CONDOR_Q_OUTPUT_RUN)
    def test_resource_status_run(self):
        response = run_async(self.adapter.resource_status, AttributeDict(remote_resource_uuid="1351043"))
        self.assertEqual(response.resource_status, ResourceStatus.Running)

    @mock_executor_run_command(stdout=CONDOR_Q_OUTPUT_COMPLETED)
    def test_resource_status_completed(self):
        response = run_async(self.adapter.resource_status, AttributeDict(remote_resource_uuid="1351043"))
        self.assertEqual(response.resource_status, ResourceStatus.Stopped)

    @mock_executor_run_command(stdout=CONDOR_Q_OUTPUT_HELD)
    def test_resource_status_held(self):
        response = run_async(self.adapter.resource_status, AttributeDict(remote_resource_uuid="1351043"))
        self.assertEqual(response.resource_status, ResourceStatus.Error)

    @mock_executor_run_command(stdout=CONDOR_Q_OUTPUT_SUBMISSION_ERR)
    def test_resource_status_submission_err(self):
        response = run_async(self.adapter.resource_status, AttributeDict(remote_resource_uuid="1351043"))
        self.assertEqual(response.resource_status, ResourceStatus.Error)

    @mock_executor_run_command(stdout="", raise_exception=CommandExecutionFailure(message="Failed", stdout="Failed",
                                                                                  stderr="Failed", exit_code=2))
    def test_resource_status_raise_future(self):
        # The executor fails and the resource claims to be created one minute in
        # the future: an error must be logged and the status update must fail.
        future_timestamp = datetime.now() + timedelta(minutes=1)
        with self.assertLogs(logging.getLogger(), logging.ERROR):
            with self.assertRaises(TardisResourceStatusUpdateFailed):
                run_async(self.adapter.resource_status, AttributeDict(remote_resource_uuid="1351043",
                                                                      created=future_timestamp))

    @mock_executor_run_command(stdout="", raise_exception=CommandExecutionFailure(message="Failed", stdout="Failed",
                                                                                  stderr="Failed", exit_code=2))
    def test_resource_status_raise_past(self):
        # Update interval is 10 minutes, so set last update back by 11 minutes in order to execute condor_q command and
        # creation date to 12 minutes ago
        past_timestamp = datetime.now() - timedelta(minutes=12)
        self.adapter._htcondor_queue._last_update = datetime.now() - timedelta(minutes=11)
        with self.assertLogs(logging.getLogger(), logging.ERROR):
            response = run_async(self.adapter.resource_status, AttributeDict(remote_resource_uuid="1351043",
                                                                             created=past_timestamp))
        self.assertEqual(response.resource_status, ResourceStatus.Deleted)

    @mock_executor_run_command(stdout=CONDOR_RM_OUTPUT)
    def test_stop_resource(self):
        response = run_async(self.adapter.stop_resource, AttributeDict(remote_resource_uuid="1351043"))
        self.assertEqual(response.remote_resource_uuid, "1351043")

    @mock_executor_run_command(stdout=CONDOR_RM_OUTPUT)
    def test_terminate_resource(self):
        response = run_async(self.adapter.terminate_resource, AttributeDict(remote_resource_uuid="1351043"))
        self.assertEqual(response.remote_resource_uuid, "1351043")

    def test_exception_handling(self):
        # Raise ``raise_it`` inside handle_exceptions() and require that
        # ``catch_it`` is what leaves the context manager.
        def test_exception_handling(raise_it, catch_it):
            with self.assertRaises(catch_it):
                with self.adapter.handle_exceptions():
                    raise raise_it

        matrix = [(Exception, TardisError),
                  (TardisResourceStatusUpdateFailed, TardisResourceStatusUpdateFailed)]

        for to_raise, to_catch in matrix:
            test_exception_handling(to_raise, to_catch)
Exemplo n.º 10
0
 async def command_failing_update_function(self):
     # Coroutine that never succeeds: it always raises a
     # CommandExecutionFailure carrying exit code 2.
     failure = CommandExecutionFailure(
         message='Failure', stdout='Failure', stderr='Failure', exit_code=2
     )
     raise failure
Exemplo n.º 11
0
class TestMoabAdapter(TestCase):
    """Unit tests for MoabAdapter with Configuration and ShellExecutor patched.

    Both names are patched inside ``tardis.adapters.sites.moab`` for the
    lifetime of the class. ``setUp`` configures the mocked ``TestSite``
    (including a site-level ``StartupCommand``) and creates a fresh adapter for
    machine type ``test2large`` before every test.
    """
    @classmethod
    def setUpClass(cls):
        cls.mock_config_patcher = patch(
            'tardis.adapters.sites.moab.Configuration')
        cls.mock_config = cls.mock_config_patcher.start()
        cls.mock_executor_patcher = patch(
            'tardis.adapters.sites.moab.ShellExecutor')
        cls.mock_executor = cls.mock_executor_patcher.start()

    @classmethod
    def tearDownClass(cls):
        cls.mock_config_patcher.stop()
        cls.mock_executor_patcher.stop()

    def setUp(self):
        config = self.mock_config.return_value
        self.test_site_config = config.TestSite
        self.test_site_config.MachineMetaData = self.machine_meta_data
        self.test_site_config.StartupCommand = 'startVM.py'
        self.test_site_config.StatusUpdate = 10
        self.test_site_config.MachineTypeConfiguration = self.machine_type_configuration
        self.test_site_config.executor = self.mock_executor.return_value

        self.moab_adapter = MoabAdapter(machine_type='test2large',
                                        site_name='TestSite')

    def tearDown(self):
        pass

    @property
    def machine_meta_data(self):
        return AttributeDict(test2large=AttributeDict(Cores=128, Memory='120'))

    @property
    def machine_type_configuration(self):
        return AttributeDict(test2large=AttributeDict(NodeType='1:ppn=20',
                                                      Walltime='02:00:00:00'))

    @property
    def resource_attributes(self):
        # Built anew on every access, so a test may mutate its own copy freely.
        return AttributeDict(
            machine_type='test2large',
            site_name='TestSite',
            remote_resource_uuid=4761849,
            resource_status=ResourceStatus.Booting,
            created=datetime.strptime("Wed Jan 23 2019 15:01:47",
                                      '%a %b %d %Y %H:%M:%S'),
            updated=datetime.strptime("Wed Jan 23 2019 15:02:17",
                                      '%a %b %d %Y %H:%M:%S'),
            drone_uuid='testsite-4761849')

    @mock_executor_run_command(TEST_DEPLOY_RESOURCE_RESPONSE)
    def test_deploy_resource(self):
        expected_resource_attributes = self.resource_attributes
        expected_resource_attributes.update(created=datetime.now(),
                                            updated=datetime.now())
        return_resource_attributes = run_async(
            self.moab_adapter.deploy_resource,
            resource_attributes=AttributeDict(machine_type='test2large',
                                              site_name='TestSite'))
        # The timestamps cannot be compared exactly; only require that they are
        # at most one second later than the reference values taken above.
        tolerance = timedelta(seconds=1)
        self.assertLessEqual(
            return_resource_attributes.created - expected_resource_attributes.created,
            tolerance, "Creation time or update time wrong!")
        self.assertLessEqual(
            return_resource_attributes.updated - expected_resource_attributes.updated,
            tolerance, "Creation time or update time wrong!")
        # Drop the timestamps so that the remaining attributes compare exactly.
        del expected_resource_attributes.created, expected_resource_attributes.updated, \
            return_resource_attributes.created, return_resource_attributes.updated
        self.assertEqual(return_resource_attributes,
                         expected_resource_attributes)
        self.mock_executor.return_value.run_command.assert_called_with(
            'msub -j oe -m p -l walltime=02:00:00:00,mem=120gb,nodes=1:ppn=20 startVM.py'
        )

    def test_machine_meta_data(self):
        self.assertEqual(self.moab_adapter.machine_meta_data,
                         self.machine_meta_data['test2large'])

    def test_machine_type(self):
        self.assertEqual(self.moab_adapter.machine_type, 'test2large')

    def test_site_name(self):
        self.assertEqual(self.moab_adapter.site_name, 'TestSite')

    @mock_executor_run_command(TEST_RESOURCE_STATUS_RESPONSE)
    def test_resource_status(self):
        expected_resource_attributes = self.resource_attributes
        expected_resource_attributes.update(updated=datetime.now())
        return_resource_attributes = run_async(
            self.moab_adapter.resource_status,
            resource_attributes=self.resource_attributes)
        # Allow the update timestamp to be at most one second after the reference.
        self.assertLessEqual(
            return_resource_attributes.updated - expected_resource_attributes.updated,
            timedelta(seconds=1), "Update time wrong!")
        del expected_resource_attributes.updated, return_resource_attributes.updated
        self.assertEqual(return_resource_attributes,
                         expected_resource_attributes)

    @mock_executor_run_command(TEST_RESOURCE_STATUS_RESPONSE_RUNNING)
    def test_resource_status_update(self):
        # The fixture starts out as Booting and must be reported as Running.
        self.assertEqual(self.resource_attributes["resource_status"],
                         ResourceStatus.Booting)
        return_resource_attributes = run_async(
            self.moab_adapter.resource_status,
            resource_attributes=self.resource_attributes)
        self.assertEqual(return_resource_attributes["resource_status"],
                         ResourceStatus.Running)

    @mock_executor_run_command(TEST_TERMINATE_RESOURCE_RESPONSE)
    def test_stop_resource(self):
        expected_resource_attributes = self.resource_attributes
        expected_resource_attributes.update(
            updated=datetime.now(), resource_status=ResourceStatus.Stopped)
        return_resource_attributes = run_async(
            self.moab_adapter.stop_resource,
            resource_attributes=self.resource_attributes)
        # Allow the update timestamp to be at most one second after the reference.
        self.assertLessEqual(
            return_resource_attributes.updated - expected_resource_attributes.updated,
            timedelta(seconds=1), "Update time wrong!")
        del expected_resource_attributes.updated, return_resource_attributes.updated
        self.assertEqual(return_resource_attributes,
                         expected_resource_attributes)

    @mock_executor_run_command(TEST_TERMINATE_RESOURCE_RESPONSE)
    def test_terminate_resource(self):
        expected_resource_attributes = self.resource_attributes
        expected_resource_attributes.update(
            updated=datetime.now(), resource_status=ResourceStatus.Stopped)
        return_resource_attributes = run_async(
            self.moab_adapter.terminate_resource,
            resource_attributes=self.resource_attributes)
        # Allow the update timestamp to be at most one second after the reference.
        self.assertLessEqual(
            return_resource_attributes.updated - expected_resource_attributes.updated,
            timedelta(seconds=1), "Update time wrong!")
        del expected_resource_attributes.updated, return_resource_attributes.updated
        self.assertEqual(return_resource_attributes,
                         expected_resource_attributes)

    @mock_executor_run_command(
        "",
        stderr=TEST_TERMINATE_DEAD_RESOURCE_RESPONSE,
        exit_code=1,
        raise_exception=CommandExecutionFailure(
            message='Test',
            stdout="",
            stderr=TEST_TERMINATE_DEAD_RESOURCE_RESPONSE,
            exit_code=1))
    def test_terminate_dead_resource(self):
        # The command fails with exit code 1 and the "dead resource" stderr;
        # the resource must nevertheless be reported as Stopped.
        return_resource_attributes = run_async(
            self.moab_adapter.terminate_resource,
            resource_attributes=self.resource_attributes)
        self.assertEqual(return_resource_attributes["resource_status"],
                         ResourceStatus.Stopped)

    @mock_executor_run_command("",
                               exit_code=2,
                               raise_exception=CommandExecutionFailure(
                                   message='Test',
                                   stdout="",
                                   stderr="",
                                   exit_code=2))
    def test_terminate_resource_error(self):
        # A failure with exit code 2 must reach the caller unchanged.
        with self.assertRaises(CommandExecutionFailure):
            run_async(self.moab_adapter.terminate_resource,
                      resource_attributes=self.resource_attributes)

    def test_resource_status_raise(self):
        # StatusUpdate is configured as 10 in setUp. Pretend the last status
        # update happened two minutes ago and that the resource was created just
        # now; the status query is then expected to fail with
        # TardisResourceStatusUpdateFailed.
        created_timestamp = datetime.now()
        new_timestamp = datetime.now() - timedelta(minutes=2)
        self.moab_adapter._moab_status._last_update = new_timestamp
        with self.assertRaises(TardisResourceStatusUpdateFailed):
            run_async(
                self.moab_adapter.resource_status,
                AttributeDict(resource_id=1351043,
                              remote_resource_uuid=1351043,
                              resource_state=ResourceStatus.Booting,
                              created=created_timestamp))

    def test_resource_status_raise_past(self):
        # Pretend the last status update happened 11 minutes ago (beyond the
        # configured StatusUpdate of 10, assuming minutes) and that the resource
        # was created 12 minutes ago; it must then be reported as Stopped.
        past_timestamp = datetime.now() - timedelta(minutes=12)
        new_timestamp = datetime.now() - timedelta(minutes=11)
        self.moab_adapter._moab_status._last_update = new_timestamp
        response = run_async(
            self.moab_adapter.resource_status,
            AttributeDict(resource_id=1390065,
                          remote_resource_uuid=1351043,
                          created=past_timestamp))
        self.assertEqual(response.resource_status, ResourceStatus.Stopped)

    def test_exception_handling(self):
        # Raise ``to_raise`` inside handle_exceptions() and require that
        # ``to_catch`` is what leaves the context manager.
        def test_exception_handling(to_raise, to_catch):
            with self.assertRaises(to_catch):
                with self.moab_adapter.handle_exceptions():
                    raise to_raise

        matrix = [(asyncio.TimeoutError(), TardisTimeout),
                  (asyncssh.Error(code=255, reason="Test", lang="Test"),
                   TardisResourceStatusUpdateFailed),
                  (IndexError, TardisResourceStatusUpdateFailed),
                  (TardisResourceStatusUpdateFailed,
                   TardisResourceStatusUpdateFailed),
                  (CommandExecutionFailure(message="Run test command",
                                           exit_code=1,
                                           stdout="Test",
                                           stderr="Test"),
                   TardisResourceStatusUpdateFailed), (Exception, TardisError)]

        for to_raise, to_catch in matrix:
            test_exception_handling(to_raise, to_catch)

    def test_check_remote_resource_uuid(self):
        # The uuid held in the attributes (1) differs from the one in the
        # response ("2"), which must be rejected with a TardisError.
        with self.assertRaises(TardisError):
            self.moab_adapter.check_remote_resource_uuid(
                AttributeDict(remote_resource_uuid=1),
                regex=r"^(\d)$",
                response="2")
Exemplo n.º 12
0
class TestMoabAdapter(TestCase):
    """Unit tests for MoabAdapter with Configuration and ShellExecutor patched.

    Both names are patched inside ``tardis.adapters.sites.moab`` for the
    lifetime of the class. ``setUp`` installs a spec-restricted mock as the
    ``TestSite`` configuration and creates a fresh adapter for machine type
    ``test2large`` before every test.
    """
    @classmethod
    def setUpClass(cls):
        cls.mock_config_patcher = patch(
            "tardis.adapters.sites.moab.Configuration")
        cls.mock_config = cls.mock_config_patcher.start()
        cls.mock_executor_patcher = patch(
            "tardis.adapters.sites.moab.ShellExecutor")
        cls.mock_executor = cls.mock_executor_patcher.start()

    @classmethod
    def tearDownClass(cls):
        cls.mock_config_patcher.stop()
        cls.mock_executor_patcher.stop()

    def setUp(self):
        config = self.mock_config.return_value
        # The spec restricts which attributes can be read from the mocked site
        # configuration, so reading a site-level StartupCommand raises
        # AttributeError unless a test sets it explicitly.
        config.TestSite = MagicMock(spec=[
            "MachineMetaData",
            "StatusUpdate",
            "MachineTypeConfiguration",
            "executor",
        ])
        self.test_site_config = config.TestSite
        self.test_site_config.MachineMetaData = self.machine_meta_data
        self.test_site_config.StatusUpdate = 10
        self.test_site_config.MachineTypeConfiguration = self.machine_type_configuration
        self.test_site_config.executor = self.mock_executor.return_value

        self.moab_adapter = MoabAdapter(machine_type="test2large",
                                        site_name="TestSite")

    def tearDown(self):
        pass

    @property
    def machine_meta_data(self):
        return AttributeDict(test2large=AttributeDict(Cores=128, Memory="120"))

    @property
    def machine_type_configuration(self):
        return AttributeDict(
            test2large=AttributeDict(NodeType="1:ppn=20",
                                     StartupCommand="startVM.py",
                                     Walltime="02:00:00:00"))

    @property
    def resource_attributes(self):
        # Built anew on every access, so a test may mutate its own copy freely.
        return AttributeDict(
            machine_type="test2large",
            site_name="TestSite",
            remote_resource_uuid=4761849,
            resource_status=ResourceStatus.Booting,
            created=datetime.strptime("Wed Jan 23 2019 15:01:47",
                                      "%a %b %d %Y %H:%M:%S"),
            updated=datetime.strptime("Wed Jan 23 2019 15:02:17",
                                      "%a %b %d %Y %H:%M:%S"),
            drone_uuid="testsite-4761849",
        )

    def test_start_up_command_deprecation_warning(self):
        # Necessary to avoid annoying message in PyCharm
        filterwarnings(action="ignore",
                       message="unclosed",
                       category=ResourceWarning)
        del self.test_site_config.MachineTypeConfiguration.test2large.StartupCommand

        # With no StartupCommand configured anywhere, construction must fail.
        with self.assertRaises(AttributeError):
            self.moab_adapter = MoabAdapter(machine_type="test2large",
                                            site_name="TestSite")

        # A StartupCommand given on site level is accepted, but must trigger a
        # DeprecationWarning.
        self.test_site_config.StartupCommand = "startVM.py"

        with self.assertWarns(DeprecationWarning):
            self.moab_adapter = MoabAdapter(machine_type="test2large",
                                            site_name="TestSite")

    @mock_executor_run_command(TEST_DEPLOY_RESOURCE_RESPONSE)
    def test_deploy_resource(self):
        expected_resource_attributes = self.resource_attributes
        expected_resource_attributes.update(created=datetime.now(),
                                            updated=datetime.now())
        return_resource_attributes = run_async(
            self.moab_adapter.deploy_resource,
            resource_attributes=AttributeDict(machine_type="test2large",
                                              site_name="TestSite"),
        )
        # The timestamps cannot be compared exactly; only require that they are
        # at most one second later than the reference values taken above.
        tolerance = timedelta(seconds=1)
        self.assertLessEqual(
            return_resource_attributes.created -
            expected_resource_attributes.created,
            tolerance,
            "Creation time or update time wrong!",
        )
        self.assertLessEqual(
            return_resource_attributes.updated -
            expected_resource_attributes.updated,
            tolerance,
            "Creation time or update time wrong!",
        )
        # Drop the timestamps so that the remaining attributes compare exactly.
        del (
            expected_resource_attributes.created,
            expected_resource_attributes.updated,
            return_resource_attributes.created,
            return_resource_attributes.updated,
        )
        self.assertEqual(return_resource_attributes,
                         expected_resource_attributes)
        self.mock_executor.return_value.run_command.assert_called_with(
            "msub -j oe -m p -l walltime=02:00:00:00,mem=120gb,nodes=1:ppn=20 startVM.py"
        )

    def test_machine_meta_data(self):
        self.assertEqual(self.moab_adapter.machine_meta_data,
                         self.machine_meta_data["test2large"])

    def test_machine_type(self):
        self.assertEqual(self.moab_adapter.machine_type, "test2large")

    def test_site_name(self):
        self.assertEqual(self.moab_adapter.site_name, "TestSite")

    @mock_executor_run_command(TEST_RESOURCE_STATUS_RESPONSE)
    def test_resource_status(self):
        expected_resource_attributes = self.resource_attributes
        expected_resource_attributes.update(updated=datetime.now())
        return_resource_attributes = run_async(
            self.moab_adapter.resource_status,
            resource_attributes=self.resource_attributes,
        )
        # Allow the update timestamp to be at most one second after the reference.
        self.assertLessEqual(
            return_resource_attributes.updated -
            expected_resource_attributes.updated,
            timedelta(seconds=1),
            "Update time wrong!",
        )
        del expected_resource_attributes.updated, return_resource_attributes.updated
        self.assertEqual(return_resource_attributes,
                         expected_resource_attributes)

    @mock_executor_run_command(TEST_RESOURCE_STATE_TRANSLATION_RESPONSE)
    def test_resource_state_translation(self):
        # Job ids are "76242" followed by the two-digit index of the entry in
        # STATE_TRANSLATIONS; each one must map to the state listed there.
        for num, (_, state) in enumerate(STATE_TRANSLATIONS):
            job_id = f"76242{num:02}"
            return_resource_attributes = run_async(
                self.moab_adapter.resource_status,
                AttributeDict(remote_resource_uuid=job_id),
            )
            self.assertEqual(return_resource_attributes.resource_status, state)

        self.mock_executor.return_value.run_command.assert_called_with(
            "showq --xml -w user=$(whoami) && showq -c --xml -w user=$(whoami)"
        )

    @mock_executor_run_command(TEST_RESOURCE_STATUS_RESPONSE_RUNNING)
    def test_resource_status_update(self):
        # The fixture starts out as Booting and must be reported as Running.
        self.assertEqual(self.resource_attributes["resource_status"],
                         ResourceStatus.Booting)
        return_resource_attributes = run_async(
            self.moab_adapter.resource_status,
            resource_attributes=self.resource_attributes,
        )
        self.assertEqual(return_resource_attributes["resource_status"],
                         ResourceStatus.Running)

    @mock_executor_run_command(TEST_TERMINATE_RESOURCE_RESPONSE)
    def test_stop_resource(self):
        expected_resource_attributes = self.resource_attributes
        expected_resource_attributes.update(
            updated=datetime.now(), resource_status=ResourceStatus.Stopped)
        return_resource_attributes = run_async(
            self.moab_adapter.stop_resource,
            resource_attributes=self.resource_attributes,
        )
        # Allow the update timestamp to be at most one second after the reference.
        self.assertLessEqual(
            return_resource_attributes.updated -
            expected_resource_attributes.updated,
            timedelta(seconds=1),
            "Update time wrong!",
        )
        del expected_resource_attributes.updated, return_resource_attributes.updated
        self.assertEqual(return_resource_attributes,
                         expected_resource_attributes)

    @mock_executor_run_command(TEST_TERMINATE_RESOURCE_RESPONSE)
    def test_terminate_resource(self):
        expected_resource_attributes = self.resource_attributes
        expected_resource_attributes.update(
            updated=datetime.now(), resource_status=ResourceStatus.Stopped)
        return_resource_attributes = run_async(
            self.moab_adapter.terminate_resource,
            resource_attributes=self.resource_attributes,
        )
        # Allow the update timestamp to be at most one second after the reference.
        self.assertLessEqual(
            return_resource_attributes.updated -
            expected_resource_attributes.updated,
            timedelta(seconds=1),
            "Update time wrong!",
        )
        del expected_resource_attributes.updated, return_resource_attributes.updated
        self.assertEqual(return_resource_attributes,
                         expected_resource_attributes)

    @mock_executor_run_command(
        "",
        stderr=TEST_TERMINATE_DEAD_RESOURCE_RESPONSE,
        exit_code=1,
        raise_exception=CommandExecutionFailure(
            message="Test",
            stdout="",
            stderr=TEST_TERMINATE_DEAD_RESOURCE_RESPONSE,
            exit_code=1,
        ),
    )
    def test_terminate_dead_resource(self):
        # The command fails with exit code 1 and the "dead resource" stderr;
        # the resource must nevertheless be reported as Stopped.
        return_resource_attributes = run_async(
            self.moab_adapter.terminate_resource,
            resource_attributes=self.resource_attributes,
        )
        self.assertEqual(return_resource_attributes["resource_status"],
                         ResourceStatus.Stopped)

    @mock_executor_run_command(
        "",
        exit_code=2,
        raise_exception=CommandExecutionFailure(message="Test",
                                                stdout="",
                                                stderr="",
                                                exit_code=2),
    )
    def test_terminate_resource_error(self):
        # A failure with exit code 2 must reach the caller unchanged.
        with self.assertRaises(CommandExecutionFailure):
            run_async(
                self.moab_adapter.terminate_resource,
                resource_attributes=self.resource_attributes,
            )

    def test_resource_status_raise(self):
        # StatusUpdate is configured as 10 in setUp. Pretend the last status
        # update happened two minutes ago and that the resource was created just
        # now; the status query is then expected to fail with
        # TardisResourceStatusUpdateFailed.
        created_timestamp = datetime.now()
        new_timestamp = datetime.now() - timedelta(minutes=2)
        self.moab_adapter._moab_status._last_update = new_timestamp
        with self.assertRaises(TardisResourceStatusUpdateFailed):
            run_async(
                self.moab_adapter.resource_status,
                AttributeDict(
                    resource_id=1351043,
                    remote_resource_uuid=1351043,
                    resource_state=ResourceStatus.Booting,
                    created=created_timestamp,
                ),
            )

    def test_resource_status_raise_past(self):
        # Pretend the last status update happened 11 minutes ago (beyond the
        # configured StatusUpdate of 10, assuming minutes) and that the resource
        # was created 12 minutes ago; it must then be reported as Deleted.
        creation_timestamp = datetime.now() - timedelta(minutes=12)
        last_update_timestamp = datetime.now() - timedelta(minutes=11)
        self.moab_adapter._moab_status._last_update = last_update_timestamp
        response = run_async(
            self.moab_adapter.resource_status,
            AttributeDict(
                resource_id=1390065,
                remote_resource_uuid=1351043,
                created=creation_timestamp,
            ),
        )
        self.assertEqual(response.resource_status, ResourceStatus.Deleted)

    def test_exception_handling(self):
        # Raise ``to_raise`` inside handle_exceptions() and require that
        # ``to_catch`` is what leaves the context manager.
        def test_exception_handling(to_raise, to_catch):
            with self.assertRaises(to_catch):
                with self.moab_adapter.handle_exceptions():
                    raise to_raise

        matrix = [
            (asyncio.TimeoutError(), TardisTimeout),
            (
                asyncssh.Error(code=255, reason="Test", lang="Test"),
                TardisResourceStatusUpdateFailed,
            ),
            (IndexError, TardisResourceStatusUpdateFailed),
            (TardisResourceStatusUpdateFailed,
             TardisResourceStatusUpdateFailed),
            (
                CommandExecutionFailure(
                    message="Run test command",
                    exit_code=1,
                    stdout="Test",
                    stderr="Test",
                ),
                TardisResourceStatusUpdateFailed,
            ),
            (Exception, TardisError),
        ]

        for to_raise, to_catch in matrix:
            test_exception_handling(to_raise, to_catch)

    def test_check_remote_resource_uuid(self):
        # The uuid held in the attributes (1) differs from the one in the
        # response ("2"), which must be rejected with a TardisError.
        with self.assertRaises(TardisError):
            self.moab_adapter.check_remote_resource_uuid(
                AttributeDict(remote_resource_uuid=1),
                regex=r"^(\d)$",
                response="2")
Exemplo n.º 13
0
class TestSlurmAdapter(TestCase):
    """Test case exercising ``SlurmAdapter``.

    ``setUpClass`` patches ``Configuration`` and ``ShellExecutor`` in
    ``tardis.adapters.batchsystems.slurm`` and keeps the patchers on the
    class.
    """

    # Placeholders for the patcher objects; assigned in setUpClass.
    mock_config_patcher = None
    mock_executor_patcher = None

    @classmethod
    def setUpClass(cls):
        """Patch ``Configuration`` and ``ShellExecutor`` in the slurm module.

        The patchers and the mocks returned by starting them are stored on
        the class so that all tests of this case share them.
        """
        config_patcher = patch("tardis.adapters.batchsystems.slurm.Configuration")
        cls.mock_config_patcher = config_patcher
        cls.mock_config = config_patcher.start()

        executor_patcher = patch("tardis.adapters.batchsystems.slurm.ShellExecutor")
        cls.mock_executor_patcher = executor_patcher
        cls.mock_executor = executor_patcher.start()

    @classmethod
    def tearDownClass(cls):
        """Stop the class-level patchers: configuration first, then executor."""
        for patcher in (cls.mock_config_patcher, cls.mock_executor_patcher):
            patcher.stop()

    def setUp(self):
        """Prepare expected values, the mocked configuration and the adapter."""
        # Expected ``sinfo`` invocations without and with the partition option.
        self.command_wo_options = (
            'sinfo --Format="statelong,cpusstate,allocmem,memory,features,nodehost"'
            " -e --noheader -r"
        )
        self.command = (
            'sinfo --Format="statelong,cpusstate,allocmem,memory,features,nodehost"'
            " -e --noheader -r --partition=test_part"
        )

        # Ratios the tests compare against.
        self.cpu_ratio = 0.5
        self.memory_ratio = 0.25

        partition_options = AttributeDict({"long": {"partition": "test_part"}})
        self.setup_config_mock(options=partition_options)

        # Instantiated only after the configuration mock has been prepared.
        self.slurm_adapter = SlurmAdapter()

    def tearDown(self):
        """Reset the shared executor mock after every test."""
        shared_executor_mock = self.mock_executor
        shared_executor_mock.reset_mock()

    def setup_config_mock(self, options=None):
        """Populate the mocked ``Configuration`` instance used by the tests.

        :param options: value stored as ``BatchSystem.options``; any falsy
            value (including the default ``None``) is replaced by ``{}``.
        """
        self.config = self.mock_config.return_value
        batch_system = self.config.BatchSystem
        batch_system.max_age = 10
        batch_system.executor = self.mock_executor.return_value
        batch_system.options = options or {}

    def test_disintegrate_machine(self):
        """``disintegrate_machine`` is expected to return ``None``."""
        outcome = run_async(
            self.slurm_adapter.disintegrate_machine, drone_uuid="test"
        )
        self.assertIsNone(outcome)

    @mock_executor_run_command(stdout=SINFO_RETURN)
    def test_drain_machine(self):
        """Check ``drain_machine`` for a known and an unknown drone.

        For drone ``VM-1`` the executor must have been called with the
        ``scontrol update ... State=DRAIN`` command; for an unknown drone the
        call is expected to return ``None``.
        """
        expected_command = (
            "scontrol update NodeName=host-10-18-1-1 State=DRAIN Reason='COBalD/TARDIS'"
        )

        run_async(self.slurm_adapter.drain_machine, drone_uuid="VM-1")
        self.mock_executor.return_value.run_command.assert_called_with(
            expected_command
        )
        self.mock_executor.reset_mock()

        unknown_outcome = run_async(
            self.slurm_adapter.drain_machine, drone_uuid="not_exists"
        )
        self.assertIsNone(unknown_outcome)

    @mock_executor_run_command(
        stdout="",
        raise_exception=CommandExecutionFailure(
            message="Failed", stdout="Failed", stderr="Failed", exit_code=2
        ),
    )
    def test_update_exception(self):
        """``update_status`` must log at WARNING or above and return ``None``.

        The executor mock is prepared by the decorator with a
        ``CommandExecutionFailure`` passed as ``raise_exception``.
        """
        update_status = self.slurm_adapter._slurm_status.update_status
        with self.assertLogs(level=logging.WARNING):
            outcome = run_async(update_status)
            self.assertIsNone(outcome)

    @mock_executor_run_command(stdout=SINFO_RETURN)
    def test_drain_machine_without_options(self):
        """Check ``drain_machine`` with an adapter configured with empty options."""
        # Re-create the adapter after configuring the mock without options.
        self.setup_config_mock()
        self.slurm_adapter = SlurmAdapter()

        expected_command = (
            "scontrol update NodeName=host-10-18-1-1 State=DRAIN Reason='COBalD/TARDIS'"
        )

        run_async(self.slurm_adapter.drain_machine, drone_uuid="VM-1")
        executor = self.mock_executor.return_value
        executor.run_command.assert_called_with(expected_command)
        self.mock_executor.reset_mock()

    def test_integrate_machine(self):
        """``integrate_machine`` is expected to return ``None``."""
        outcome = run_async(self.slurm_adapter.integrate_machine, drone_uuid="VM-1")
        self.assertIsNone(outcome)

    @mock_executor_run_command(stdout=SINFO_RETURN)
    def test_get_resource_ratios(self):
        """Check ``get_resource_ratios`` for a known and an unknown drone.

        Drone ``VM-1`` must yield ``[cpu_ratio, memory_ratio]`` and trigger the
        expected ``sinfo`` command; an unknown drone must yield ``{}``.
        """
        ratios = run_async(self.slurm_adapter.get_resource_ratios, drone_uuid="VM-1")
        self.assertEqual(list(ratios), [self.cpu_ratio, self.memory_ratio])

        executor = self.mock_executor.return_value
        executor.run_command.assert_called_with(self.command)
        self.mock_executor.reset_mock()

        unknown_ratios = run_async(
            self.slurm_adapter.get_resource_ratios, drone_uuid="not_exists"
        )
        self.assertEqual(unknown_ratios, {})

    @mock_executor_run_command(stdout=SINFO_RETURN)
    def test_get_resource_ratios_without_options(self):
        """Check ``get_resource_ratios`` when ``BatchSystem.options`` is absent.

        The ``options`` attribute is deleted from the configuration mock
        before the adapter is created; the executor must then be called with
        the ``sinfo`` command that carries no partition option.
        """
        self.setup_config_mock()
        del self.config.BatchSystem.options
        self.slurm_adapter = SlurmAdapter()

        ratios = run_async(self.slurm_adapter.get_resource_ratios, drone_uuid="VM-1")
        self.assertEqual(list(ratios), [self.cpu_ratio, self.memory_ratio])

        executor = self.mock_executor.return_value
        executor.run_command.assert_called_with(self.command_wo_options)

    @mock_executor_run_command(stdout=SINFO_RETURN)
    def test_get_allocation(self):
        """Check ``get_allocation`` for a known and an unknown drone.

        For ``VM-1`` the expected value is the larger of the cpu and memory
        ratios; for an unknown drone it is ``0.0``.
        """
        allocation = run_async(self.slurm_adapter.get_allocation, drone_uuid="VM-1")
        self.assertEqual(allocation, max(self.cpu_ratio, self.memory_ratio))

        executor = self.mock_executor.return_value
        executor.run_command.assert_called_with(self.command)

        unknown_allocation = run_async(
            self.slurm_adapter.get_allocation, drone_uuid="not_exists"
        )
        self.assertEqual(unknown_allocation, 0.0)

    @mock_executor_run_command(stdout=SINFO_RETURN)
    def test_get_machine_status(self):
        """Check the reported machine states and the updater's failure path.

        First ``get_machine_status`` is queried for several drone uuids and
        compared with the expected ``MachineStatus``. Afterwards the executor
        mock is made to raise ``CommandExecutionFailure``;
        ``slurm_status_updater`` is then expected to log at WARNING level or
        above and to let the exception propagate.
        """
        state_mapping = {
            "VM-1": MachineStatus.Available,
            "not_exists": MachineStatus.NotAvailable,
            "draining_m": MachineStatus.Draining,
            "idle_m": MachineStatus.Available,
            "drained_m": MachineStatus.NotAvailable,
            "pwr_up_m": MachineStatus.NotAvailable,
        }

        for machine, state in state_mapping.items():
            # One sub-test per drone so a failure names the offending uuid.
            with self.subTest(machine=machine):
                status = run_async(
                    self.slurm_adapter.get_machine_status, drone_uuid=machine
                )
                self.assertEqual(status, state)

        self.mock_executor.reset_mock()

        run_command = self.mock_executor.return_value.run_command
        run_command.side_effect = CommandExecutionFailure(
            message="Test", exit_code=123, stderr="Test"
        )
        attributes = {
            "Machine": "Machine",
            "State": "State",
            "Activity": "Activity",
            "TardisDroneUuid": "TardisDroneUuid",
        }

        # The executor mock is shared by the whole class, so the failing
        # side effect is removed even when an assertion below fails.
        try:
            with self.assertLogs(level=logging.WARNING):
                with self.assertRaises(CommandExecutionFailure):
                    run_async(
                        partial(
                            slurm_status_updater,
                            self.config.BatchSystem.options,
                            attributes,
                            self.mock_executor.return_value,
                        )
                    )
            # NOTE(review): an ``assert_called_with(self.command)`` used to
            # follow the raising call inside ``assertRaises`` and therefore
            # never executed. Only the fact that the executor was invoked is
            # asserted here; the exact command built from the placeholder
            # ``attributes`` above should be confirmed before tightening this.
            run_command.assert_called()
        finally:
            run_command.side_effect = None

    @mock_executor_run_command(stdout=SINFO_RETURN)
    def test_get_utilisation(self):
        """Check ``get_utilisation`` for a known and an unknown drone.

        For ``VM-1`` the expected value is the smaller of the cpu and memory
        ratios; for an unknown drone it is ``0.0``.
        """
        utilisation = run_async(self.slurm_adapter.get_utilisation, drone_uuid="VM-1")
        self.assertEqual(utilisation, min(self.cpu_ratio, self.memory_ratio))

        executor = self.mock_executor.return_value
        executor.run_command.assert_called_with(self.command)

        unknown_utilisation = run_async(
            self.slurm_adapter.get_utilisation, drone_uuid="not_exists"
        )
        self.assertEqual(unknown_utilisation, 0.0)

    def test_machine_meta_data_translation(self):
        """The adapter's translation mapping must equal the expected factors."""
        expected_mapping = AttributeDict(Cores=1, Memory=1000, Disk=1000)
        actual_mapping = self.slurm_adapter.machine_meta_data_translation_mapping
        self.assertEqual(expected_mapping, actual_mapping)