def create_pod_config(self, sleep_time, dynamic_factor, host_limit_1=False):
        container_spec = pod.ContainerSpec(
            resource=pod.ResourceSpec(
                cpu_limit=0.1,
                mem_limit_mb=32,
                disk_limit_mb=32,
            ),
            command=mesos.CommandInfo(
                shell=True,
                value="echo %s && sleep %s"
                % (str(dynamic_factor), str(sleep_time)),
            ),
        )

        instance_label = v1alpha_peloton.Label(
            key="peloton/instance", value="instance-label"
        )
        host_limit_1_constraint = None
        if host_limit_1:
            host_limit_1_constraint = pod.Constraint(
                type=1,  # Label constraint
                label_constraint=pod.LabelConstraint(
                    kind=1,  # Label
                    condition=2,  # Equal
                    requirement=0,
                    label=instance_label,
                ),
            )

        containers = [container_spec]
        return pod.PodSpec(containers=containers,
                           labels=[instance_label],
                           constraint=host_limit_1_constraint)
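
A minimal usage sketch (an addition, not from the original source): assuming this method lives on a test helper class with the same `pod` and `v1alpha_peloton` protobuf modules in scope, the returned PodSpec carries the instance label and, when host_limit_1 is set, a constraint pointing at that same label. The helper-instance name `suite` below is hypothetical.

def check_host_limit_pod_config(suite):
    # `suite` is a hypothetical instance of the class that defines
    # create_pod_config above.
    spec = suite.create_pod_config(sleep_time=30, dynamic_factor=1,
                                   host_limit_1=True)
    assert spec.containers[0].resource.cpu_limit == 0.1
    assert spec.labels[0].key == "peloton/instance"
    # The host-limit constraint references the same instance label.
    assert spec.constraint.label_constraint.label == spec.labels[0]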
Example #2
def test__update_with_host_maintenance_and_agent_down(stateless_job,
                                                      maintenance):
    """
    1. Create a large stateless job (that takes up more than two-thirds of
       the cluster resources) with MaximumUnavailableInstances=2.
    2. Start host maintenance on one of the hosts (say A) having pods of the job.
       MaximumUnavailableInstances=2 ensures that not more than 2 pods are
       unavailable due to host maintenance at a time.
    3. Take down another host that has pods running on it. This will cause
       TASK_LOST to be sent for all pods on the host after 75 seconds.
    4. Start an update to modify the instance spec of one of the pods.
    5. Since TASK_LOST would cause the job SLA to be violated, instances on
       host A should not be killed once the LOST event is received. Verify
       that host A does not transition to DOWN.
    """
    stateless_job.job_spec.instance_count = 30
    stateless_job.job_spec.default_spec.containers[0].resource.cpu_limit = 0.3
    stateless_job.job_spec.sla.maximum_unavailable_instances = 2
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    hosts = [h.hostname for h in query_hosts([]).host_infos]
    host_to_task_count = get_host_to_task_count(hosts, stateless_job)
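    # Sort hosts by how many of the job's pods run on each, busiest first.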
    sorted_hosts = [
        t[0] for t in sorted(host_to_task_count.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
    ]

    # Pick a host that has pods running on it to start maintenance on it.
    test_host = sorted_hosts[0]
    # Pick another host that has pods of the job running on it, to take down.
    host_container = get_container([sorted_hosts[1]])

    try:
        host_container.stop()
        maintenance["start"]([test_host])

        stateless_job.job_spec.instance_spec[10].containers.extend([
            pod_pb2.ContainerSpec(resource=pod_pb2.ResourceSpec(
                disk_limit_mb=20))
        ])
        update = StatelessUpdate(stateless_job,
                                 updated_job_spec=stateless_job.job_spec,
                                 batch_size=0)
        update.create()
        update.wait_for_state(goal_state="SUCCEEDED")

        stateless_job.stop()

        # Taking host A DOWN would violate the job SLA, so this wait is
        # expected to time out and raise; the host must stay DRAINING.
        wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)
        assert False, 'Host should not transition to DOWN'
    except Exception:
        assert is_host_in_state(test_host, host_pb2.HOST_STATE_DRAINING)
    finally:
        host_container.start()
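
The host-state helpers used above (`wait_for_host_state` and `is_host_in_state`) are not shown on this page. A rough sketch of what they might look like, reusing the `query_hosts` helper from the test; the `state` field on the returned host infos is an assumption, not confirmed against the Peloton API.

import time

def is_host_in_state(hostname, state):
    # Assumption: each host info returned by query_hosts exposes
    # `hostname` and `state` fields.
    for host_info in query_hosts([]).host_infos:
        if host_info.hostname == hostname:
            return host_info.state == state
    return False

def wait_for_host_state(hostname, state, timeout_secs=600, poll_secs=5):
    # Poll until the host reaches the desired state; raise if it never does,
    # which is what the try/except in the test above relies on.
    deadline = time.time() + timeout_secs
    while time.time() < deadline:
        if is_host_in_state(hostname, state):
            return
        time.sleep(poll_secs)
    raise Exception("host %s did not reach state %s" % (hostname, state))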
Example #3
def create_pod_config(self, sleep_time, dynamic_factor):
        container_spec = pod.ContainerSpec(
            resource=pod.ResourceSpec(
                cpu_limit=0.1, mem_limit_mb=32, disk_limit_mb=32
            ),
            command=mesos.CommandInfo(
                shell=True,
                value="echo %s && sleep %s"
                % (str(dynamic_factor), str(sleep_time)),
            ),
        )
        containers = [container_spec]
        return pod.PodSpec(containers=containers)

def test__launch_kill(self):
        resource_constraint = self.new_resource_constraint(3.0)
        host_filter = v1hostmgr.HostFilter(
            resource_constraint=resource_constraint,
            max_hosts=1,
        )
        request = v1hostmgr_svc.AcquireHostsRequest(filter=host_filter)
        resp = self.client.v1hostmgr_svc.AcquireHosts(request,
                                                      metadata=self.metadata,
                                                      timeout=20)

        assert len(resp.hosts) == 1

        pod_spec = pod.PodSpec(
            pod_name=v1alpha.PodName(value='test-123'),
            containers=[
                pod.ContainerSpec(
                    name='c1',
                    image='alpine:3.6',
                    entrypoint=pod.CommandSpec(
                        value='/bin/sh',
                        arguments=[
                            '-c',
                            'while true; do echo OK && sleep 3; done',
                        ],
                    ),
                ),
            ],
        )
        pod_obj = v1hostmgr.LaunchablePod(
            pod_id=v1alpha.PodID(value='test-123'),
            spec=pod_spec,
        )
        req = v1hostmgr_svc.LaunchPodsRequest(
            lease_id=resp.hosts[0].lease_id,
            hostname=resp.hosts[0].host_summary.hostname,
            pods=[pod_obj],
        )

        self.client.v1hostmgr_svc.LaunchPods(
            req,
            metadata=self.metadata,
            timeout=20,
        )

        # The second LaunchPods call with the same lease is expected to fail.
        with pytest.raises(Exception):
            self.client.v1hostmgr_svc.LaunchPods(
                req,
                metadata=self.metadata,
                timeout=20,
            )

        req = v1hostmgr_svc.KillPodsRequest(
            pod_ids=[v1alpha.PodID(value='test-123')],
        )
        self.client.v1hostmgr_svc.KillPods(
            req,
            metadata=self.metadata,
            timeout=20,
        )
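
The `new_resource_constraint` helper referenced at the top of test__launch_kill is also not shown here. A guess at its shape, assuming the v1alpha host manager exposes a ResourceConstraint message with a minimum pod.ResourceSpec; the module path and the `minimum` field name are assumptions, not confirmed against the Peloton API.

def new_resource_constraint(self, cpu):
    # Hypothetical sketch: ask for hosts with at least `cpu` CPUs free.
    return v1hostmgr.ResourceConstraint(
        minimum=pod.ResourceSpec(cpu_limit=cpu),
    )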