def test_match_double_queued_process(self):
    props = {"engine": "engine1"}
    r1 = ResourceRecord.new("r1", "n1", 1, properties=props)
    self.store.add_resource(r1)
    r2 = ResourceRecord.new("r2", "n1", 1, properties=props)
    self.store.add_resource(r2)

    p1 = ProcessRecord.new(None, "p1", get_process_definition(),
                           ProcessState.REQUESTED)
    p1key = p1.get_key()
    self.store.add_process(p1)

    p2 = ProcessRecord.new(None, "p2", get_process_definition(),
                           ProcessState.REQUESTED)
    p2key = p2.get_key()
    self.store.add_process(p2)

    # enqueue p1 repeatedly. make sure that doesn't bomb anything
    self.store.enqueue_process(*p1key)
    self.store.enqueue_process(*p1key)
    self.store.enqueue_process(*p2key)
    self.store.enqueue_process(*p1key)

    self._run_in_thread()

    self.wait_process(p1.owner, p1.upid,
                      lambda p: p.state == ProcessState.PENDING)
    self.wait_process(p2.owner, p2.upid,
                      lambda p: p.state == ProcessState.PENDING)
def test_engine_types(self):
    self._run_in_thread()

    props = {"engine": "engine1"}
    r1 = ResourceRecord.new("r1", "n1", 1, properties=props)
    self.store.add_resource(r1)

    constraints = {"engine": "engine2"}
    p1 = ProcessRecord.new(None, "p1", get_process_definition(),
                           ProcessState.REQUESTED, constraints=constraints)
    p1key = p1.get_key()
    self.store.add_process(p1)
    self.store.enqueue_process(*p1key)

    # We don't have a resource that can run this yet
    timed_out = False
    try:
        self.wait_resource(r1.resource_id,
                           lambda r: list(p1key) in r.assigned,
                           timeout=2)
    except Exception:
        timed_out = True
    assert timed_out

    props = {"engine": "engine2"}
    r2 = ResourceRecord.new("r2", "n2", 1, properties=props)
    self.store.add_resource(r2)

    self.wait_resource(r2.resource_id, lambda r: list(p1key) in r.assigned)

    time.sleep(0.05)
    self.resource_client.check_process_launched(p1, r2.resource_id)
    self.wait_process(p1.owner, p1.upid,
                      lambda p: p.assigned == r2.resource_id and
                                p.state == ProcessState.PENDING)
def test_node_exclusive_bug(self):
    """test_node_exclusive_bug

    If two processes with the same node exclusive attribute were scheduled
    in the same matchmaking cycle, they could be scheduled to the same
    resource due to a caching issue. This test exercises the fix.
    """
    self.mm.initialize()

    n1 = NodeRecord.new("n1", "d1")
    self.store.add_node(n1)

    props = {"engine": "engine1"}
    r1 = ResourceRecord.new("r1", "n1", 2, properties=props)
    self.store.add_resource(r1)

    n2 = NodeRecord.new("n2", "d1")
    self.store.add_node(n2)

    props = {"engine": "engine1"}
    r2 = ResourceRecord.new("r2", "n2", 2, properties=props)
    self.store.add_resource(r2)

    xattr_1 = "port5000"
    constraints = {}
    p1 = ProcessRecord.new(None, "p1", get_process_definition(),
                           ProcessState.REQUESTED, constraints=constraints,
                           node_exclusive=xattr_1)
    p1key = p1.get_key()
    self.store.add_process(p1)
    self.store.enqueue_process(*p1key)

    p2 = ProcessRecord.new(None, "p2", get_process_definition(),
                           ProcessState.REQUESTED, constraints=constraints,
                           node_exclusive=xattr_1)
    p2key = p2.get_key()
    self.store.add_process(p2)
    self.store.enqueue_process(*p2key)

    # sneak into MM and force it to update this info from the store
    self.mm._get_queued_processes()
    self.mm._get_resource_set()

    self.mm.matchmake()

    # Ensure these processes are pending and scheduled to different nodes
    p1 = self.store.get_process(None, "p1")
    p2 = self.store.get_process(None, "p2")
    self.assertNotEqual(p1.assigned, p2.assigned)
def test_record_metadata(self):
    props = {"engine": "engine1"}
    r1 = ResourceRecord.new("r1", "n1", 1, properties=props)
    r1.metadata['version'] = 0
    r2 = ResourceRecord.new("r2", "n1", 1, properties=props)
    r2.metadata['version'] = 1

    r1_dict_copy = dict(r1)
    r2_dict_copy = dict(r2)

    self.assertEqual(r1.metadata['version'], 0)
    self.assertEqual(r2.metadata['version'], 1)

    self.assertNotIn('metadata', r1_dict_copy)
    self.assertNotIn('metadata', r2_dict_copy)
def _first_heartbeat(self, sender, beat):

    node_id = beat.get('node_id')
    if not node_id:
        log.error("EE heartbeat from %s without a node_id!: %s", sender, beat)
        return

    node = self.store.get_node(node_id)
    if node is None:
        log.warn("EE heartbeat from unknown node. Still booting? "
                 "node_id=%s sender=%s.", node_id, sender)

        # TODO I'm thinking the best thing to do here is query EPUM
        # for the state of this node in case the initial node_state
        # update got lost. Note that we shouldn't go ahead and
        # schedule processes onto this EE until we get the RUNNING
        # node_state update -- there could be a failure later on in
        # the contextualization process that triggers the node to be
        # terminated.
        return

    if node.properties:
        properties = node.properties.copy()
    else:
        properties = {}

    log.info("First heartbeat from EEAgent %s on node %s (%s)",
             sender, node_id, properties.get("hostname", "unknown hostname"))

    try:
        engine_id = engine_id_from_domain(node.domain_id)
    except ValueError:
        log.exception("Node for EEagent %s has invalid domain_id!", sender)
        return

    engine_spec = self.get_engine(engine_id)
    slots = engine_spec.slots

    # just making engine type a generic property/constraint for now,
    # until it is clear something more formal is needed.
    properties['engine'] = engine_id

    try:
        self.node_add_resource(node, sender)
    except NotFoundError:
        log.warn("Node removed while processing heartbeat. ignoring. "
                 "node_id=%s sender=%s.", node_id, sender)
        return

    timestamp_str = beat['timestamp']
    timestamp = ceiling_datetime(parse_datetime(timestamp_str))

    resource = ResourceRecord.new(sender, node_id, slots, properties)
    resource.new_last_heartbeat_datetime(timestamp)

    try:
        self.store.add_resource(resource)
    except WriteConflictError:
        # no problem if this resource was just created by another worker
        log.info("Conflict writing new resource record %s. Ignoring.", sender)
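
# For reference, a minimal heartbeat payload consumed by _first_heartbeat()
# might look like the sketch below. Field names come from the accesses above;
# real heartbeats may well carry additional fields.
#
#   beat = {
#       "node_id": "n1",                           # must match a known NodeRecord
#       "timestamp": "2013-01-01T00:00:00+00:00",  # parsed with parse_datetime()
#   }                                              # and rounded via ceiling_datetime()
#
# `sender` is the EEAgent's messaging name and becomes the resource_id of the
# new ResourceRecord.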
def create_engine_resources(self, engine_id, node_count=1, assignments=None):
    engine_spec = self.registry.get_engine_by_id(engine_id)
    assert len(assignments) <= engine_spec.slots * engine_spec.replicas * node_count

    records = []
    for i in range(node_count):
        node_id = uuid.uuid4().hex
        props = {"engine": engine_id}

        for _ in range(engine_spec.replicas):
            res = ResourceRecord.new(uuid.uuid4().hex, node_id,
                                     engine_spec.slots, properties=props)
            records.append(res)
            res.metadata['version'] = 0
            self.mm.resources[res.resource_id] = res

            # use fake process ids in the assigned list, til it matters
            if len(assignments) <= engine_spec.slots:
                res.assigned = list(assignments)
                assignments = []
            else:
                res.assigned = assignments[:engine_spec.slots]
                assignments[:] = assignments[engine_spec.slots:]
            print "added resource: %s" % res
    return records
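
# A hypothetical call, for illustration (the engine id and process ids below
# are made up): the helper creates `node_count` nodes, gives each node
# engine_spec.replicas resources with engine_spec.slots slots, and spreads the
# fake `assignments` across the earliest resources created.
#
#   records = self.create_engine_resources("engine1", node_count=2,
#                                           assignments=["proc-a", "proc-b"])
#
# Every resource is also registered in self.mm.resources so the matchmaker can
# see it without a store round trip.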
def test_match_process_terminated(self):
    self.mm.initialize()

    props = {"engine": "engine1"}
    r1 = ResourceRecord.new("r1", "n1", 1, properties=props)
    self.store.add_resource(r1)

    p1 = ProcessRecord.new(None, "p1", get_process_definition(),
                           ProcessState.REQUESTED)
    p1key = p1.get_key()
    self.store.add_process(p1)
    self.store.enqueue_process(*p1key)

    # sneak into MM and force it to update this info from the store
    self.mm._get_queued_processes()
    self.mm._get_resource_set()

    # now update the process record to be TERMINATED so that
    # MM should bail out of matching this process
    p1.state = ProcessState.TERMINATED
    self.store.update_process(p1)
    self.store.remove_queued_process(*p1key)

    self.mm.matchmake()

    p1 = self.store.get_process(None, "p1")
    self.assertEqual(p1.state, ProcessState.TERMINATED)
def test_resource_record(self):
    props = {"engine": "engine1", "resource_id": "r1"}
    r = ResourceRecord.new("r1", "n1", 1, properties=props)
    self.assertEqual(r.available_slots, 1)
    self.assertEqual(r.properties, props)

    r.assigned.append('proc1')
    self.assertEqual(r.available_slots, 0)
def test_match_writeconflict(self):
    self.mm.initialize()

    props = {"engine": "engine1"}
    r1 = ResourceRecord.new("r1", "n1", 1, properties=props)
    self.store.add_resource(r1)

    p1 = ProcessRecord.new(None, "p1", get_process_definition(),
                           ProcessState.REQUESTED)
    p1key = p1.get_key()
    self.store.add_process(p1)
    self.store.enqueue_process(*p1key)

    # sneak into MM and force it to update this info from the store
    self.mm._get_queued_processes()
    self.mm._get_resource_set()

    # now update the resource record so the matchmake() attempt to write will conflict
    r1.assigned = ["hats"]
    self.store.update_resource(r1)

    # this should bail out without resetting the needs_matchmaking flag
    # or registering any need
    self.assertTrue(self.mm.needs_matchmaking)
    self.mm.matchmake()
    self.assertFalse(self.epum_client.reconfigures)
    self.assertTrue(self.mm.needs_matchmaking)

    r1copy = self.store.get_resource(r1.resource_id)
    self.assertRecordVersions(r1, r1copy)
def test_process_already_assigned(self):

    # this is a recovery situation, probably. The process is assigned
    # to a resource already at the start of the matchmaker run, but
    # the process record hasn't been updated to reflect that. The
    # situation should be detected and handled by matchmaker.

    p1 = ProcessRecord.new(None, "p1", get_process_definition(),
                           ProcessState.REQUESTED)
    self.store.add_process(p1)
    self.store.enqueue_process(*p1.key)

    props = {"engine": "engine1"}
    r1 = ResourceRecord.new("r1", "n1", 1, properties=props)
    r1.assigned.append(p1.key)
    self.store.add_resource(r1)

    self._run_in_thread()

    self.wait_process(None, "p1", lambda p: p.state == ProcessState.PENDING)

    r1 = self.store.get_resource("r1")
    self.assertEqual(len(r1.assigned), 1)
    self.assertTrue(r1.is_assigned(p1.owner, p1.upid, p1.round))
    self.assertEqual(r1.available_slots, 0)
def test_stale_procs(self):
    """test that the matchmaker doesn't try to schedule stale procs

    A stale proc is one that the matchmaker has already attempted to
    schedule while the state of the resources hasn't changed.
    """
    if not os.environ.get('INT'):
        raise SkipTest("Skip slow integration test")

    self.mm.initialize()

    p1 = ProcessRecord.new(None, "p1", get_process_definition(),
                           ProcessState.REQUESTED)
    p1key = p1.get_key()
    self.store.add_process(p1)
    self.store.enqueue_process(*p1key)

    # sneak into MM and force it to update this info from the store
    self.mm._get_queued_processes()
    self.mm._get_resource_set()

    self.assertTrue(self.mm.needs_matchmaking)
    self.mm.matchmake()
    self.assertFalse(self.mm.needs_matchmaking)
    self.assertTrue(len(self.mm.stale_processes) > 0)

    self.mm._get_queued_processes()
    self.mm._get_resource_set()
    self.assertFalse(self.mm.needs_matchmaking)
    self.assertTrue(len(self.mm.stale_processes) > 0)

    p2 = ProcessRecord.new(None, "p2", get_process_definition(),
                           ProcessState.REQUESTED)
    p2key = p2.get_key()
    self.store.add_process(p2)
    self.store.enqueue_process(*p2key)

    self.mm._get_queued_processes()
    self.mm._get_resource_set()

    self.assertTrue(self.mm.needs_matchmaking)
    self.assertTrue(len(self.mm.stale_processes) > 0)
    self.assertTrue(len(self.mm.queued_processes) > len(self.mm.stale_processes))
    self.mm.matchmake()
    self.assertFalse(self.mm.needs_matchmaking)
    self.assertTrue(len(self.mm.queued_processes) == len(self.mm.stale_processes))

    # Add a resource, and ensure that stale procs get dumped
    props = {"engine": "engine1"}
    r1 = ResourceRecord.new("r1", "n1", 1, properties=props)
    self.store.add_resource(r1)

    self.mm._get_queued_processes()
    self.mm._get_resources()
    self.mm._get_resource_set()

    self.assertTrue(len(self.mm.stale_processes) == 0)
def test_node_filo(self):
    """test_node_filo

    We prioritize shutting down the newest VMs as a workaround for the
    OOI testing strategy.
    """
    self.mm.initialize()

    n1 = NodeRecord.new("n1", "d1")
    self.store.add_node(n1)

    props = {"engine": "engine4"}
    r1 = ResourceRecord.new("r1", "n1", 2, properties=props)
    self.store.add_resource(r1)

    n2 = NodeRecord.new("n2", "d1")
    self.store.add_node(n2)

    props = {"engine": "engine4"}
    r2 = ResourceRecord.new("r2", "n2", 2, properties=props)
    self.store.add_resource(r2)

    constraints = {"engine": "engine4"}
    p1 = ProcessRecord.new(None, "p1", get_process_definition(),
                           ProcessState.REQUESTED, constraints=constraints)
    p1key = p1.get_key()
    self.store.add_process(p1)
    self.store.enqueue_process(*p1key)

    # sneak into MM and force it to update this info from the store
    self.mm._get_queued_processes()
    self.mm._get_resource_set()
    self.mm.register_needs()
    self.epum_client.clear()

    self.mm.queued_processes = []
    self.mm.register_needs()

    conf = self.epum_client.reconfigures['pd_domain_engine4'][0]
    retired_nodes = conf['engine_conf']['retirable_nodes']
    assert len(retired_nodes) == 1

    # This should be the second node we started
    assert retired_nodes[0] == "n2"
def test_wait_resource(self):
    props = {"engine": "engine1"}
    r1 = ResourceRecord.new("r1", "n1", 1, properties=props)
    self.store.add_resource(r1)
    self.wait_resource("r1", lambda r: r.resource_id == "r1")

    def makeitso():
        r1.slot_count = 2
        self.store.update_resource(r1)

    tevent.spawn(makeitso)

    self.wait_resource("r1", lambda r: r.slot_count == 2)
def test_queueing_order(self):
    self._run_in_thread()

    procnames = []
    # queue 10 processes
    for i in range(10):
        proc = ProcessRecord.new(None, "proc" + str(i),
                                 get_process_definition(),
                                 ProcessState.REQUESTED)
        prockey = proc.key
        self.store.add_process(proc)
        self.store.enqueue_process(*prockey)

        self.epum_client.clear()

        self.wait_process(proc.owner, proc.upid,
                          lambda p: p.state == ProcessState.WAITING)
        procnames.append(proc.upid)

        # potentially retry a few times to account for race between process
        # state updates and need reconfigures
        for _ in range(5):
            try:
                self.assert_one_reconfigure(preserve_n=i + 1, retirees=[])
                break
            except AssertionError:
                time.sleep(0.01)

    self.epum_client.clear()

    # now add 10 resources each with 1 slot. processes should start in order
    for i in range(10):
        props = {"engine": "engine1"}
        res = ResourceRecord.new("res" + str(i), "node" + str(i), 1,
                                 properties=props)
        self.store.add_resource(res)

        self.wait_process(None, procnames[i],
                          lambda p: p.state >= ProcessState.PENDING and
                                    p.assigned == res.resource_id)

    # finally doublecheck that launch requests happened in order too
    for _ in range(5):
        try:
            self.assertEqual(self.resource_client.launch_count, 10)

            for i, launch in enumerate(self.resource_client.launches):
                self.assertEqual(launch[0], "res" + str(i))
                self.assertEqual(launch[1], "proc" + str(i))
            break
        except AssertionError:
            time.sleep(0.01)
def test_process_terminated(self):
    self._run_in_thread()

    event = threading.Event()

    # we set up a resource and a matching process that should be assigned
    # to it. we will simulate marking the process TERMINATED out-of-band
    # and ensure that is recognized before the dispatch.

    # when the matchmaker attempts to update the process, sneak in an update
    # first so the matchmaker request conflicts
    original_update_process = self.store.update_process

    def patched_update_process(process):
        original = self.store.get_process(process.owner, process.upid)
        original.state = ProcessState.TERMINATED
        original_update_process(original)

        try:
            original_update_process(process)
        finally:
            event.set()

    self.store.update_process = patched_update_process

    p1 = ProcessRecord.new(None, "p1", get_process_definition(),
                           ProcessState.REQUESTED)
    self.store.add_process(p1)
    self.store.enqueue_process(*p1.key)

    # now give it a resource. it should be matched but in the meantime
    # the process will be terminated
    props = {"engine": "engine1"}
    r1 = ResourceRecord.new("r1", "n1", 1, properties=props)
    self.store.add_resource(r1)

    # wait for MM to hit our update conflict, kill it, and check that it
    # appropriately backed out the allocation
    assert event.wait(5)
    self.mm.cancel()
    self.mmthread.join()
    self.mmthread = None

    resource = self.store.get_resource("r1")
    self.assertEqual(len(resource.assigned), 0)

    self.assertEqual(self.resource_client.launch_count, 0)
def test_match_copy_hostname(self):
    self._run_in_thread()

    props = {"engine": "engine1", "hostname": "vm123"}
    r1 = ResourceRecord.new("r1", "n1", 1, properties=props)
    self.store.add_resource(r1)

    p1 = ProcessRecord.new(None, "p1", get_process_definition(),
                           ProcessState.REQUESTED)
    p1key = p1.get_key()
    self.store.add_process(p1)
    self.store.enqueue_process(*p1key)

    self.wait_process(p1.owner, p1.upid,
                      lambda p: p.assigned == r1.resource_id and
                                p.state == ProcessState.PENDING)

    p1 = self.store.get_process(None, "p1")
    self.assertEqual(p1.hostname, "vm123")
def test_disabled_resource(self):
    self._run_in_thread()

    props = {"engine": "engine1"}
    r1 = ResourceRecord.new("r1", "n1", 1, properties=props)
    r1.state = ExecutionResourceState.DISABLED

    self.store.add_resource(r1)
    self.wait_resource("r1", lambda r: r.resource_id == "r1")

    p1 = ProcessRecord.new(None, "p1", get_process_definition(),
                           ProcessState.REQUESTED)
    p1key = p1.key
    self.store.add_process(p1)
    self.store.enqueue_process(*p1key)

    # the resource matches but it is disabled, process should
    # remain in the queue
    self.wait_process(p1.owner, p1.upid,
                      lambda p: p.state == ProcessState.WAITING)
def test_match1(self):
    self._run_in_thread()

    props = {"engine": "engine1"}
    r1 = ResourceRecord.new("r1", "n1", 1, properties=props)
    self.store.add_resource(r1)

    p1 = ProcessRecord.new(None, "p1", get_process_definition(),
                           ProcessState.REQUESTED)
    p1key = p1.get_key()
    self.store.add_process(p1)
    self.store.enqueue_process(*p1key)

    self.wait_resource(r1.resource_id, lambda r: list(p1key) in r.assigned)

    time.sleep(0.05)
    self.resource_client.check_process_launched(p1, r1.resource_id)
    self.wait_process(p1.owner, p1.upid,
                      lambda p: p.assigned == r1.resource_id and
                                p.state == ProcessState.PENDING)
def test_waiting(self):
    self._run_in_thread()

    # not-immediate process enqueued while there are no resources
    p1 = ProcessRecord.new(None, "p1", get_process_definition(),
                           ProcessState.REQUESTED)
    p1key = p1.get_key()
    self.store.add_process(p1)
    self.store.enqueue_process(*p1key)

    self.wait_process(None, "p1", lambda p: p.state == ProcessState.WAITING)

    # now give it a resource. it should be scheduled
    props = {"engine": "engine1"}
    r1 = ResourceRecord.new("r1", "n1", 1, properties=props)
    self.store.add_resource(r1)

    self.wait_resource(r1.resource_id, lambda r: list(p1key) in r.assigned)

    time.sleep(0.05)
    self.resource_client.check_process_launched(p1, r1.resource_id)
def test_stale_optimization(self):
    # DL: not sure this test is really relevant anymore. It often fails
    # against zookeeper because the ratio isn't as good.
    raise SkipTest("Skip manual optimization test")

    from time import clock

    self.mm.initialize()

    n_start_proc = 10000

    for i in range(0, n_start_proc):
        p = ProcessRecord.new(None, "p%s" % i, get_process_definition(),
                              ProcessState.REQUESTED)
        pkey = p.get_key()
        self.store.add_process(p)
        self.store.enqueue_process(*pkey)

    # sneak into MM and force it to update this info from the store
    self.mm._get_queued_processes()
    self.mm._get_resource_set()

    self.assertTrue(self.mm.needs_matchmaking)
    unoptimized_start = clock()
    self.mm.matchmake()
    unoptimized_end = clock()
    unoptimized_time = unoptimized_end - unoptimized_start
    self.assertFalse(self.mm.needs_matchmaking)
    self.assertTrue(len(self.mm.stale_processes) > 0)

    p = ProcessRecord.new(None, "px", get_process_definition(),
                          ProcessState.REQUESTED)
    pkey = p.get_key()
    self.store.add_process(p)
    self.store.enqueue_process(*pkey)

    # sneak into MM and force it to update this info from the store
    self.mm._get_queued_processes()
    self.mm._get_resource_set()

    self.assertTrue(self.mm.needs_matchmaking)
    optimized_start = clock()
    self.mm.matchmake()
    optimized_end = clock()
    optimized_time = optimized_end - optimized_start

    if optimized_time > 0:
        ratio = unoptimized_time / optimized_time
        print "Unoptimised Time: %s Optimised Time: %s ratio: %s" % (
            unoptimized_time, optimized_time, ratio)
        self.assertTrue(ratio >= 100,
                        "Our optimized matchmake didn't have a 100 fold improvement")
    else:
        print "optimized_time was zero. hmm"

    # Add a resource, and ensure that matchmake time is unoptimized
    props = {"engine": "engine1"}
    r1 = ResourceRecord.new("r1", "n1", 1, properties=props)
    self.store.add_resource(r1)

    self.mm._get_queued_processes()
    self.mm._get_resources()
    self.mm._get_resource_set()

    self.assertTrue(self.mm.needs_matchmaking)
    addresource_start = clock()
    self.mm.matchmake()
    addresource_end = clock()
    addresource_time = addresource_end - addresource_start

    optimized_addresource_ratio = unoptimized_time / addresource_time
    print "Add resource ratio: %s" % optimized_addresource_ratio
    msg = "After adding a resource, matchmaking should be of the same order"
    self.assertTrue(optimized_addresource_ratio < 10, msg)
def test_node_exclusive(self):
    self._run_in_thread()

    n1 = NodeRecord.new("n1", "d1")
    self.store.add_node(n1)

    props = {"engine": "engine1"}
    n1_r1 = ResourceRecord.new("n1_r1", "n1", 2, properties=props)
    self.store.add_resource(n1_r1)

    n1_r2 = ResourceRecord.new("n1_r2", "n1", 2, properties=props)
    self.store.add_resource(n1_r2)

    xattr_1 = "port5000"
    constraints = {}
    p1 = ProcessRecord.new(None, "p1", get_process_definition(),
                           ProcessState.REQUESTED, constraints=constraints,
                           node_exclusive=xattr_1)
    p1key = p1.get_key()
    self.store.add_process(p1)
    self.store.enqueue_process(*p1key)

    # The first process should be assigned, since nothing else needs this
    # attr
    # TODO: it's possible that this could be assigned to n1_r2, but hopefully not
    self.wait_resource(n1_r1.resource_id, lambda r: list(p1key) in r.assigned)
    time.sleep(0.05)
    self.resource_client.check_process_launched(p1, n1_r1.resource_id)
    self.wait_process(p1.owner, p1.upid,
                      lambda p: p.assigned == n1_r1.resource_id and
                                p.state == ProcessState.PENDING)

    p2 = ProcessRecord.new(None, "p2", get_process_definition(),
                           ProcessState.REQUESTED, constraints=constraints,
                           node_exclusive=xattr_1)
    p2key = p2.get_key()
    self.store.add_process(p2)
    self.store.enqueue_process(*p2key)

    # The second process should wait, since first process wants this attr
    # as well
    self.wait_process(p2.owner, p2.upid,
                      lambda p: p.state == ProcessState.WAITING)

    # If we start another node, we should see that second process be
    # scheduled
    n2 = NodeRecord.new("n2", "d1")
    self.store.add_node(n2)

    props = {"engine": "engine1"}
    n2_r1 = ResourceRecord.new("n2_r1", "n2", 2, properties=props)
    self.store.add_resource(n2_r1)

    props = {"engine": "engine1"}
    n2_r2 = ResourceRecord.new("n2_r2", "n2", 2, properties=props)
    self.store.add_resource(n2_r2)

    # The second process should now be assigned
    self.wait_resource(n2_r1.resource_id, lambda r: list(p2key) in r.assigned)
    time.sleep(0.05)
    self.resource_client.check_process_launched(p2, n2_r1.resource_id)
    self.wait_process(p2.owner, p2.upid,
                      lambda p: p.assigned == n2_r1.resource_id and
                                p.state == ProcessState.PENDING)

    # Now we submit another process with a different exclusive attribute.
    # It should be assigned right away
    xattr_2 = "port5001"
    constraints = {}
    p3 = ProcessRecord.new(None, "p3", get_process_definition(),
                           ProcessState.REQUESTED, constraints=constraints,
                           node_exclusive=xattr_2)
    p3key = p3.get_key()
    self.store.add_process(p3)
    self.store.enqueue_process(*p3key)

    p3_resource = None
    for resource in [n1_r1, n1_r2, n2_r1, n2_r2]:
        try:
            self.wait_resource(resource.resource_id,
                               lambda r: list(p3key) in r.assigned,
                               timeout=0.5)
        except Exception:
            continue
        time.sleep(0.05)
        self.resource_client.check_process_launched(p3, resource.resource_id)
        self.wait_process(p3.owner, p3.upid,
                          lambda p: p.assigned == resource.resource_id and
                                    p.state == ProcessState.PENDING)
        p3_resource = resource

    self.assertIsNotNone(p3_resource)

    # Now submit a fourth process, which should be scheduled to a different
    # node from p3
    p4 = ProcessRecord.new(None, "p4", get_process_definition(),
                           ProcessState.REQUESTED, constraints=constraints,
                           node_exclusive=xattr_2)
    p4key = p4.get_key()
    self.store.add_process(p4)
    self.store.enqueue_process(*p4key)

    p4_resource = None
    for resource in [n1_r1, n1_r2, n2_r1, n2_r2]:
        try:
            self.wait_resource(resource.resource_id,
                               lambda r: list(p4key) in r.assigned,
                               timeout=0.5)
        except Exception:
            continue
        time.sleep(0.05)
        self.resource_client.check_process_launched(p4, resource.resource_id)
        self.wait_process(p4.owner, p4.upid,
                          lambda p: p.assigned == resource.resource_id and
                                    p.state == ProcessState.PENDING)
        p4_resource = resource

    self.assertIsNotNone(p4_resource)

    self.assertNotEqual(p3_resource.node_id, p4_resource.node_id)