示例#1
0
def test__acquire_release_host_offers():
    """
    Acquire host offers that have at least 3 CPUs, capped at two hosts,
    then release the acquired offers back to the host manager.
    """
    client = with_private_stubs(Client())

    min_resources = task.ResourceConfig(cpuLimit=3.0)
    acquire_request = v0hostmgr.AcquireHostOffersRequest(
        filter=v0hostmgr.HostFilter(
            resourceConstraint=v0hostmgr.ResourceConstraint(
                minimum=min_resources),
            quantity=v0hostmgr.QuantityControl(maxHosts=2),
        ),
    )

    acquired = client.hostmgr_svc.AcquireHostOffers(
        acquire_request, metadata=client.hostmgr_metadata, timeout=20)

    # maxHosts is 2, so 2 host offers are expected, each on a known agent
    offers = acquired.hostOffers
    assert len(offers) == 2
    for host_offer in offers:
        assert host_offer.hostname in MESOS_AGENTS

    # hand the offers back to hostmgr
    released = client.hostmgr_svc.ReleaseHostOffers(
        request=v0hostmgr.ReleaseHostOffersRequest(hostOffers=offers),
        metadata=client.hostmgr_metadata,
        timeout=20)

    assert released.HasField("error") is False
示例#2
0
def cleanup_stateless_jobs(timeout_secs=10, client=None):
    """
    Delete all service jobs from minicluster and wait for them to be gone.

    Deletion is opportunistic: if the jobs are not gone within the timeout,
    the function returns silently and they are expected to be cleaned up in
    the next test run.

    Args:
        timeout_secs: maximum number of seconds to keep polling for the job
            query to come back empty.
        client: client used for the job queries; a new ``Client()`` is
            created when omitted.
    """
    client = client or Client()
    jobs = stateless_query_jobs(client=client)

    # opportunistic delete for jobs, if not deleted within
    # timeout period, it will get cleanup in next test run.
    stateless_delete_jobs(jobs)

    # Wait for job deletion to complete, polling once per second.
    deadline = time.time() + timeout_secs
    while time.time() < deadline:
        try:
            # Poll with the same client that issued the initial query.
            if not stateless_query_jobs(client=client):
                return
        except grpc.RpcError:
            # "not-found" is expected here because the QueryJobs endpoint
            # does two db queries in sequence: "QueryJobs" and "GetUpdate".
            # When a job is deleted its updates are deleted first, so there
            # is a slight chance QueryJobs fails to query the update and
            # returns "not-found".
            # Other RPC errors are tolerated as well (cleanup is
            # best-effort), but they fall through to the sleep below so the
            # loop backs off instead of spinning until the deadline.
            pass
        time.sleep(1)
示例#3
0
def test__delete_active_job(jobs_by_state):
    """
    Deleting a job that is still RUNNING must be rejected with an INTERNAL
    error mentioning that the job is not in a terminal state.
    """
    running_job = jobs_by_state[1]['RUNNING'][0]
    running_job.create()
    running_job.wait_for_state(goal_state='RUNNING')

    client = Client()
    delete_request = job_pb2.DeleteRequest(
        id=peloton.JobID(value=running_job.job_id))

    delete_rejected = False
    try:
        client.job_svc.Delete(
            delete_request, metadata=client.jobmgr_metadata, timeout=10)
    except grpc.RpcError as rpc_error:
        delete_rejected = True
        log.info(rpc_error)
        assert "Job is not in a terminal state" in rpc_error.details()
        assert rpc_error.code() is grpc.StatusCode.INTERNAL

    # stop the job before checking the outcome of the delete attempt
    running_job.stop()
    running_job.wait_for_state(goal_state='KILLED')
    assert delete_rejected is True
def query_response(request):
    """
    Send the given request to the task query API with a fresh client and
    return the response the API produces.
    """
    client = Client()
    return client.task_svc.Query(
        request,
        metadata=client.jobmgr_metadata,
        timeout=10,
    )
示例#5
0
def query_by_spec(respool_id=None, spec=None):
    """
    Run a summary-only job query for the given resource pool ID and query
    spec, and return the query response.
    """
    client = Client()
    query = job_pb2.QueryRequest(
        respoolID=respool_id,
        spec=spec,
        summaryOnly=True,
    )
    return client.job_svc.Query(
        query, metadata=client.jobmgr_metadata, timeout=10)
示例#6
0
def test__delete_completed_job(jobs_by_state):
    """
    Create a job from the "SUCCEEDED" bucket, wait on it with the default
    wait_for_state() arguments, and check that deleting it raises no RPC
    error.
    """
    finished_job = jobs_by_state[1]["SUCCEEDED"][0]
    finished_job.create()
    finished_job.wait_for_state()

    client = Client()
    job_id = peloton.JobID(value=finished_job.job_id)
    delete_request = job_pb2.DeleteRequest(id=job_id)
    try:
        client.job_svc.Delete(
            delete_request, metadata=client.jobmgr_metadata, timeout=10)
    except grpc.RpcError as rpc_error:
        log.info(rpc_error)
        # a caught exception is never None, so reaching this fails the test
        assert rpc_error is None
示例#7
0
def test__delete_non_existing_job():
    """
    Deleting a job ID that was never created must fail with NOT_FOUND and
    the detail message "job not found".
    """
    client = Client()
    unknown_id = peloton.JobID(value="00010203-0405-0607-0809-0a0b0c0d0e0f")
    delete_request = job_pb2.DeleteRequest(id=unknown_id)

    got_error = False
    try:
        client.job_svc.Delete(
            delete_request, metadata=client.jobmgr_metadata, timeout=10)
    except grpc.RpcError as rpc_error:
        got_error = True
        log.info(rpc_error)
        assert rpc_error.details() == "job not found"
        assert rpc_error.code() is grpc.StatusCode.NOT_FOUND

    assert got_error is True
示例#8
0
def test__cluster_capacity():
    """
    Fetch the cluster capacity from hostmgr and check the reported physical
    resource kinds, plus the expected cpu and memory totals.
    """
    client = with_private_stubs(Client())
    capacity = client.hostmgr_svc.ClusterCapacity(
        request=v0hostmgr.ClusterCapacityRequest(),
        metadata=client.hostmgr_metadata,
        timeout=20)
    assert capacity.HasField("error") is False

    known_kinds = ['cpu', 'gpu', 'memory', 'disk', 'fd']
    expected_capacity = {
        'cpu': 12.0,  # 4cpu * 3 agents
        'memory': 6144.0,  # 2048Mb * 3 agents
    }
    for res in capacity.physicalResources:
        assert res.kind in known_kinds
        if res.kind in expected_capacity:
            assert res.capacity == expected_capacity[res.kind]
示例#9
0
def test_task_killed_in_ready_succeeds_when_re_enqueued(placement_engines):
    """
    Check that a task which is killed while READY in the resource manager
    reaches RUNNING after it is re-enqueued.
    """
    # with the placement engines stopped, tasks are kept in READY state
    placement_engines.stop()

    # client decorated with the peloton private API stubs
    private_client = with_private_stubs(Client())

    # long running job with 2 instances
    job = Job(
        job_file='long_running_job.yaml',
        options=[with_instance_count(2)],
        client=private_client,
    )
    job.create()
    job.wait_for_state(goal_state='PENDING')

    first_task = job.get_task(0)
    first_task.wait_for_pending_state(goal_state='READY')

    # kill the task, then re-enqueue it
    first_task.stop()
    first_task.start()

    # bring the placement engines back so the task can be placed
    placement_engines.start()

    def wait_for_instance_to_run():
        instance = job.get_task(0)
        return instance.state_str == 'RUNNING'

    job.wait_for_condition(wait_for_instance_to_run)
示例#10
0
def test__acquire_return_offers_errors():
    """
    Ask hostmgr for host offers that no host can satisfy, then try to
    release a host offer that carries an invalid offer ID.
    """
    # decorate the client to add peloton private API stubs
    client = with_private_stubs(Client())

    # ask is 14 cpus, so no hosts should match this
    oversized_ask = v0hostmgr.ResourceConstraint(
        minimum=task.ResourceConfig(cpuLimit=14.0))
    acquire_request = v0hostmgr.AcquireHostOffersRequest(
        filter=v0hostmgr.HostFilter(resourceConstraint=oversized_ask))
    acquired = client.hostmgr_svc.AcquireHostOffers(
        acquire_request, metadata=client.hostmgr_metadata, timeout=20)
    assert len(acquired.hostOffers) == 0

    # release offers to hostmgr with an invalid offer ID
    bogus_offer = v0hostmgr.HostOffer(
        id=peloton.HostOfferID(value="invalid_id"))
    released = client.hostmgr_svc.ReleaseHostOffers(
        request=v0hostmgr.ReleaseHostOffersRequest(hostOffers=[bogus_offer]),
        metadata=client.hostmgr_metadata,
        timeout=20)

    # NOTE(review): attribute access on a protobuf response presumably never
    # yields None, so this check may always pass; HasField("error") would be
    # the stricter form -- confirm what the service returns before changing.
    assert released.error is not None
def test__tasks_reserve_execution(hostreservepool):
    """
    Run a median job until all of its tasks are RUNNING, then submit a
    single-instance large job and check that its task goes through the
    RESERVED state, runs on the host that hostmgr reported as reserved, and
    that no host is still reported as reserved after the large job's
    wait_for_state() returns.
    """
    median_job = Job(
        job_file='test_hostreservation_job_median.yaml',
        pool=hostreservepool,
        config=IntegrationTestConfig(
            max_retry_attempts=100,
            sleep_time_sec=1),
    )
    median_job.create()
    median_job.wait_for_state(goal_state='RUNNING')

    # we should have all 3 tasks in running state
    def all_running():
        for median_task in median_job.get_tasks().values():
            if not median_task.state == task.RUNNING:
                return False
        return True

    median_job.wait_for_condition(all_running)

    # decorate the client to add peloton private API stubs
    client = with_private_stubs(Client())

    large_job = Job(
        job_file='test_hostreservation_job_large.yaml',
        pool=hostreservepool,
        config=IntegrationTestConfig(
            sleep_time_sec=1,
            max_retry_attempts=300),
        options=[with_instance_count(1)],
        client=client,
    )
    large_job.create()
    large_job.wait_for_state(goal_state='PENDING')

    hosts_query = hostmgr.GetHostsByQueryRequest()

    def query_hosts():
        # one GetHostsByQuery round trip; returns the hosts of the response
        reply = client.hostmgr_svc.GetHostsByQuery(
            hosts_query,
            metadata=large_job.client.hostmgr_metadata,
            timeout=large_job.config.rpc_timeout_sec,
        )
        return reply.hosts

    # task should get into reserved state and RUNNING state
    large_task = large_job.get_task(0)
    large_task.wait_for_pending_state(goal_state="RESERVED")

    def get_reserved_host():
        # hostname of the first host whose status is 'reserved', else ''
        return next(
            (h.hostname for h in query_hosts() if h.status == 'reserved'),
            '',
        )

    def is_reserved():
        return get_reserved_host() != ''

    large_job.wait_for_condition(is_reserved)
    reserved_host = get_reserved_host()

    # the task is running on reserved host
    large_task.wait_for_pending_state(goal_state="RUNNING")
    assert reserved_host == large_task.get_info().runtime.host

    # the large job should succeed
    large_job.wait_for_state()

    # no host is in reserved state
    for host in query_hosts():
        assert host.status != 'reserved'

    kill_jobs([median_job, large_job])
 def client(self):
     """Return a new Client decorated with the private API stubs."""
     stubbed_client = with_private_stubs(Client())
     return stubbed_client
示例#13
0
def test__launch_kill():
    """
    Exercise hostmgr LaunchTasks and KillTasks against one acquired offer.

    Steps:
      1. launching with a made-up offer ID must fail;
      2. launching with the acquired offer must report no error;
      3. killing with an empty task ID list must report an error;
      4. killing the launched task must report no error.
    """
    client = with_private_stubs(Client())

    # acquire 1 host offer
    resource_constraint = v0hostmgr.ResourceConstraint(
        minimum=task.ResourceConfig(cpuLimit=3.0))
    host_filter = v0hostmgr.HostFilter(
        resourceConstraint=resource_constraint,
        quantity=v0hostmgr.QuantityControl(maxHosts=1),
    )
    request = v0hostmgr.AcquireHostOffersRequest(filter=host_filter)

    resp = client.hostmgr_svc.AcquireHostOffers(
        request, metadata=client.hostmgr_metadata, timeout=20)

    assert len(resp.hostOffers) == 1
    # Keep the offer in its own name so later responses cannot clobber it.
    offer = resp.hostOffers[0]

    # launch a test task using this offer
    cmd = "echo 'succeeded instance task' & sleep 100"
    tc = task.TaskConfig(
        command=mesos.CommandInfo(shell=True, value=cmd),
        name="task_name",
        resource=task.ResourceConfig(cpuLimit=1.0),
    )
    tid = mesos.TaskID(value=str(uuid.uuid4()) + '-1-1')
    t = v0hostmgr.LaunchableTask(
        taskId=tid,
        config=tc,
    )

    # Test 1
    # launch task using invalid offer
    req = v0hostmgr.LaunchTasksRequest(
        hostname=offer.hostname,
        agentId=offer.agentId,
        tasks=[t],
        id=peloton.HostOfferID(value=str(uuid.uuid4())))
    launch_failed = True
    try:
        client.hostmgr_svc.LaunchTasks(req,
                                       metadata=client.hostmgr_metadata,
                                       timeout=20)
        launch_failed = False
    except Exception:
        # Any failure of the call counts as the expected rejection.
        # NOTE(review): narrow this to the RPC error type if it is
        # importable in this module.
        pass
    # Checked outside the try block so the handler above cannot swallow it.
    assert launch_failed, 'LaunchTasks should have failed'

    # Test 2
    # launch task using valid offer
    req = v0hostmgr.LaunchTasksRequest(hostname=offer.hostname,
                                       agentId=offer.agentId,
                                       tasks=[t],
                                       id=offer.id)
    resp = client.hostmgr_svc.LaunchTasks(req,
                                          metadata=client.hostmgr_metadata,
                                          timeout=20)
    assert resp.HasField("error") is False

    # Test 3
    # kill with empty TaskIDs list
    resp = client.hostmgr_svc.KillTasks(v0hostmgr.KillTasksRequest(taskIds=[]),
                                        metadata=client.hostmgr_metadata,
                                        timeout=20)
    assert resp.HasField("error") is True

    # Test 4
    # kill valid TaskID
    resp = client.hostmgr_svc.KillTasks(
        v0hostmgr.KillTasksRequest(taskIds=[tid]),
        metadata=client.hostmgr_metadata,
        timeout=20)
    assert resp.HasField("error") is False