def on_step_end(self, iteration, trials, **info):
    """Assert that placement-group bookkeeping matches the trial states.

    Counts finished and running trials, sums the placement groups held by
    the placement-group manager of ``trial_executor`` (a name taken from the
    surrounding scope), and compares both against ``placement_group_table()``.

    Args:
        iteration: Not used by this check.
        trials: Sized collection of objects exposing a ``status`` attribute.
        **info: Accepted and ignored.

    Raises:
        AssertionError: If any of the consistency checks below fails.
    """
    finished_states = (Trial.TERMINATED, Trial.ERROR)
    num_finished = sum(1 for trial in trials if trial.status in finished_states)
    num_running = sum(1 for trial in trials if trial.status == Trial.RUNNING)

    pg_manager = trial_executor._pg_manager

    # Placement groups the manager knows about, summed over every bucket.
    total_num_tracked = (
        sum(len(staged) for staged in pg_manager._staging.values())
        + sum(len(ready) for ready in pg_manager._ready.values())
        + len(pg_manager._in_use_pgs)
        + len(pg_manager._cached_pgs)
    )

    # Placement groups reported by the table that are not yet removed.
    num_non_removed_pgs = sum(
        1
        for pg_info in placement_group_table().values()
        if pg_info["state"] != "REMOVED"
    )
    num_removal_scheduled_pgs = len(pg_manager._pgs_for_removal)

    # All 3 trials (3 different learning rates) should be scheduled.
    assert len(trials) >= 3

    # Cannot run more than 2 at a time
    # (due to different resource restrictions in the test cases).
    assert num_running <= 2

    expected_live = max(3, len(trials)) - num_finished

    # The number of placement groups should decrease when trials finish.
    assert expected_live == total_num_tracked

    # The number of actual placement groups should match this.
    assert expected_live == num_non_removed_pgs - num_removal_scheduled_pgs
def schedule_fold_model_fit(self, model_base, fold_ctx, kwargs):
    """Submit one fold's model fit as a remote task and record the job.

    Each argument is placed in the object store, a placement group with a
    single ``{"CPU": 2}`` bundle is created and waited on, and the remote
    task is launched inside that group. The resulting reference is appended
    to ``self.jobs``.

    Args:
        model_base: First positional argument forwarded to the remote task.
        fold_ctx: Second positional argument forwarded to the remote task.
        kwargs: Third positional argument forwarded to the remote task
            (passed as a single object, not unpacked).
    """
    # Store the arguments in the same order the remote task receives them.
    stored_args = [
        ray.put(model_base),
        ray.put(fold_ctx),
        ray.put(kwargs),
    ]
    print('...model_fit')

    bundles = [{"CPU": 2}]
    pg = placement_group(bundles, strategy="STRICT_SPREAD")
    # Wait for the placement group before submitting work into it.
    ray.get(pg.ready())
    print(placement_group_table(pg))

    fit_task = model_fit_task_ray.options(placement_group=pg)
    results_ref = fit_task.remote(*stored_args)

    # NOTE(review): `time_start_fold` and `on_fit_end_fn` are not bound
    # anywhere in this function; presumably they come from an enclosing or
    # module scope - confirm, otherwise this line raises NameError.
    job = (results_ref, time_start_fold, on_fit_end_fn)
    self.jobs.append(job)
def on_step_end(self, iteration, trials, **info):
    """Check placement-group counts against trial state, with reuse slack.

    Uses the free names ``trial_executor``, ``this``, ``scheduled``,
    ``reuse_actors`` and ``max_num_parallel`` from the surrounding scope.
    Assertions are reported through ``this`` and tagged with ``iteration``.

    Args:
        iteration: Value interpolated into each assertion message.
        trials: Sized collection of objects exposing a ``status`` attribute.
        **info: Accepted and ignored.
    """
    finished_states = (Trial.TERMINATED, Trial.ERROR)
    num_finished = sum(1 for trial in trials if trial.status in finished_states)

    pg_manager = trial_executor._pg_manager

    # Placement groups the manager knows about, summed over every bucket.
    total_num_tracked = (
        sum(len(staged) for staged in pg_manager._staging.values())
        + sum(len(ready) for ready in pg_manager._ready.values())
        + len(pg_manager._in_use_pgs)
        + len(pg_manager._cached_pgs)
    )

    # Placement groups reported by the table that are not yet removed.
    num_non_removed_pgs = sum(
        1
        for pg_info in placement_group_table().values()
        if pg_info["state"] != "REMOVED"
    )
    num_removal_scheduled_pgs = len(pg_manager._pgs_for_removal)

    # All trials should be scheduled
    this.assertEqual(
        scheduled,
        min(scheduled, len(trials)),
        msg=f"Num trials iter {iteration}",
    )

    # The two checks below are relaxed for reuse_actors=True so that up to
    # `max_num_parallel` more placement groups may exist than expected.
    # Caching relies on reconciliation for cleanup to avoid overscheduling
    # of new placement groups.
    num_parallel_reuse = int(reuse_actors) * max_num_parallel
    upper_bound = max(scheduled, len(trials)) - num_finished + num_parallel_reuse

    # The number of PGs should decrease when trials finish
    this.assertGreaterEqual(
        upper_bound,
        total_num_tracked,
        msg=f"Num tracked iter {iteration}",
    )

    # The number of actual placement groups should match this
    this.assertGreaterEqual(
        upper_bound,
        num_non_removed_pgs - num_removal_scheduled_pgs,
        msg=f"Num actual iter {iteration}",
    )
def _assertCleanup(self, trial_executor):
    """Assert that the executor's placement-group manager holds nothing.

    Verifies that the in-use, staging-future, staging and ready containers
    of ``trial_executor._pg_manager`` are all empty, and that
    ``placement_group_table()`` lists no placement group in a state other
    than ``"REMOVED"``.

    Args:
        trial_executor: Object exposing a ``_pg_manager`` attribute.
    """
    manager = trial_executor._pg_manager

    # Nothing may still be marked as in use or pending.
    self.assertFalse(manager._in_use_trials)
    self.assertFalse(manager._in_use_pgs)
    self.assertFalse(manager._staging_futures)

    # Every per-key bucket must be empty, even if the key itself remains.
    for staged in manager._staging.values():
        self.assertFalse(staged)
    for ready in manager._ready.values():
        self.assertFalse(ready)

    num_non_removed_pgs = sum(
        1
        for pg_info in placement_group_table().values()
        if pg_info["state"] != "REMOVED"
    )
    self.assertEqual(num_non_removed_pgs, 0)
def on_step_end(self, iteration, trials, **info):
    """Check that placement-group counts exactly match the trial state.

    Uses the free names ``trial_executor``, ``this`` and ``scheduled`` from
    the surrounding scope. Assertions are reported through ``this`` and
    tagged with ``iteration``.

    Args:
        iteration: Value interpolated into each assertion message.
        trials: Sized collection of objects exposing a ``status`` attribute.
        **info: Accepted and ignored.
    """
    finished_states = (Trial.TERMINATED, Trial.ERROR)
    num_finished = sum(1 for trial in trials if trial.status in finished_states)

    pg_manager = trial_executor._pg_manager

    # Placement groups the manager knows about, summed over every bucket.
    total_num_tracked = (
        sum(len(staged) for staged in pg_manager._staging.values())
        + sum(len(ready) for ready in pg_manager._ready.values())
        + len(pg_manager._in_use_pgs)
        + len(pg_manager._cached_pgs)
    )

    # Placement groups reported by the table that are not yet removed.
    num_non_removed_pgs = sum(
        1
        for pg_info in placement_group_table().values()
        if pg_info["state"] != "REMOVED"
    )
    num_removal_scheduled_pgs = len(pg_manager._pgs_for_removal)

    # All trials should be scheduled
    this.assertEqual(
        scheduled,
        min(scheduled, len(trials)),
        msg=f"Num trials iter {iteration}")

    expected_live = max(scheduled, len(trials)) - num_finished

    # The number of PGs should decrease when trials finish
    this.assertEqual(
        expected_live,
        total_num_tracked,
        msg=f"Num tracked iter {iteration}")

    # The number of actual placement groups should match this
    this.assertEqual(
        expected_live,
        num_non_removed_pgs - num_removal_scheduled_pgs,
        msg=f"Num actual iter {iteration}")