def run_task_workload(total_num_cpus, smoke):
    """Run task-based workload that doesn't require object reconstruction.

    Submits ``total_num_cpus * 2 * multiplier`` nested tasks (multiplier is 1
    in smoke mode, 75 otherwise), blocks until they have all finished, and
    then waits up to 60 seconds for every CPU in the cluster to be reported
    as available again.
    """

    @ray.remote(num_cpus=1, max_retries=-1)
    def task():
        # Burn some CPU by growing a string one random letter at a time.
        text = ""
        for _ in range(100000):
            text = text + random.choice(string.ascii_letters)
        # Return a zero-filled uint8 array of 50 * 1024 elements.
        payload_size_in_kb = 50
        return np.zeros(1024 * payload_size_in_kb, dtype=np.uint8)

    @ray.remote(num_cpus=1, max_retries=-1)
    def invoke_nested_task():
        time.sleep(0.8)
        inner_ref = task.remote()
        return ray.get(inner_ref)

    # For smoke mode, run less number of tasks
    multiplier = 1 if smoke else 75
    TOTAL_TASKS = int(total_num_cpus * 2 * multiplier)

    pb = ProgressBar("Chaos test", TOTAL_TASKS)
    pending_refs = []
    for _ in range(TOTAL_TASKS):
        pending_refs.append(invoke_nested_task.remote())
    pb.block_until_complete(pending_refs)
    pb.close()

    def all_cpus_released():
        total_cpus = ray.cluster_resources().get("CPU", 0)
        free_cpus = ray.available_resources().get("CPU", 0)
        return total_cpus == free_cpus

    # Consistency check.
    wait_for_condition(all_cpus_released, timeout=60)
def main():
    """Stress test that repeatedly creates, queries and kills actors.

    Until ``args.test_runtime`` seconds have elapsed, the driver loops over:

    - Create actors via ``start_actors(num_cpus, num_nodes)``.
    - For ``args.kill_interval_s`` seconds, keep fetching the metadata and
      the pi result from every actor.
    - Kill all actors.

    Afterwards the peak memory usage recorded by the monitor actor is printed
    and a result JSON is written to the path in ``TEST_OUTPUT_JSON``.
    """
    ray.init(address="auto")
    args, _ = parse_script_args()

    num_cpus = ray.cluster_resources()["CPU"]
    alive_nodes = [node for node in ray.nodes() if node["Alive"]]
    num_nodes = len(alive_nodes)
    print(f"Total number of actors: {num_cpus}, nodes: {num_nodes}")

    monitor_actor = monitor_memory_usage()

    test_started_at = time.time()
    while time.time() - test_started_at < args.test_runtime:
        # Step 1: Create actors and start computation loop.
        print("Create actors.")
        actors = start_actors(num_cpus, num_nodes)

        # Step 2: Get the pi result from actors.
        round_started_at = time.time()
        print("Start computation.")
        while time.time() - round_started_at < args.kill_interval_s:
            # Get the metadata.
            metadata_refs = [actor.get_metadata.remote() for actor in actors]
            ray.get(metadata_refs)

            # Get the result.
            pb = ProgressBar("Computing Pi", num_cpus)
            pi_refs = [actor.get_pi.remote() for actor in actors]
            pb.fetch_until_complete(pi_refs)
            pb.close()

        # Step 3: Kill actors.
        print("Kill all actors.")
        for actor in actors:
            ray.kill(actor)

    # Report the result.
    print("PASSED.")
    peak_info_ref = monitor_actor.get_peak_memory_info.remote()
    used_gb, usage = ray.get(peak_info_ref)
    print("Memory usage with failures.")
    print(f"Peak memory usage: {round(used_gb, 2)}GB")
    print(f"Peak memory usage per processes:\n {usage}")

    # Stop the memory monitor before writing the test output.
    ray.get(monitor_actor.stop_run.remote())

    output_path = os.environ["TEST_OUTPUT_JSON"]
    with open(output_path, "w") as f:
        f.write(json.dumps({"success": 0}))
def _execute_reduce_stage(
    self,
    output_num_blocks: int,
    schedule: _PushBasedShuffleTaskSchedule,
    reduce_ray_remote_args: Dict[str, Any],
    all_merge_results: List[List[ObjectRef]],
):
    """Submit one reduce task per output block and wait for their metadata.

    Args:
        output_num_blocks: Number of reduce tasks (and output blocks).
        schedule: Maps each reducer index to a merge task index and supplies
            the task options of that merge task.
        reduce_ray_remote_args: Extra options applied to every reduce task.
        all_merge_results: Indexed by merge task index; each entry is a list
            of lists of merge outputs. This method consumes it: the first
            remaining element of each inner list is popped per reducer.

    Returns:
        A ``(reduce_metadata, reduce_blocks)`` pair, where the metadata has
        been fetched and the blocks are still object references.
    """
    reduce_fn = cached_remote_fn(self.reduce)

    # Execute the final reduce stage.
    reduce_outputs = []
    for reducer_idx in range(output_num_blocks):
        merge_idx = schedule.get_merge_idx_for_reducer_idx(reducer_idx)
        # We also add the merge task arguments so that the reduce task
        # is colocated with its inputs.
        colocated_reduce = reduce_fn.options(
            **reduce_ray_remote_args,
            **schedule.get_merge_task_options(merge_idx),
            num_returns=2,
        )
        # Take the next unconsumed output from every list recorded for this
        # merge task; together they form the inputs of this reducer.
        reducer_inputs = [
            pending.pop(0) for pending in all_merge_results[merge_idx]
        ]
        reduce_outputs.append(
            colocated_reduce.remote(*self._reduce_args, *reducer_inputs)
        )

    # Every merge output must have been handed to exactly one reducer.
    for merge_idx, merge_results in enumerate(all_merge_results):
        leftovers = [pending for pending in merge_results if len(pending) != 0]
        assert not leftovers, (
            "Reduce stage did not process outputs from merge tasks at index: "
            f"{merge_idx}"
        )

    num_produced = len(reduce_outputs)
    assert (
        num_produced == output_num_blocks
    ), f"Expected {output_num_blocks} outputs, produced {num_produced}"

    reduce_bar = ProgressBar("Shuffle Reduce", total=output_num_blocks)
    reduce_blocks, metadata_refs = zip(*reduce_outputs)
    reduce_metadata = reduce_bar.fetch_until_complete(list(metadata_refs))
    reduce_bar.close()

    return reduce_metadata, reduce_blocks
def test_chaos_actor_retry(set_kill_interval):
    """Sanity-check that retried actor tasks all finish under chaos.

    Starts 16 restartable actors, submits 300 ``add`` calls to each of them,
    waits for every call to finish and prints how long that took.
    """

    # Chaos testing.
    @ray.remote(num_cpus=0.25, max_restarts=-1, max_task_retries=-1)
    class Actor:
        def __init__(self):
            self.letter_dict = set()

        def add(self, letter):
            self.letter_dict.add(letter)

    NUM_CPUS = 16
    TOTAL_TASKS = 300

    pb = ProgressBar("Chaos test sanity check", TOTAL_TASKS * NUM_CPUS)
    actors = [Actor.remote() for _ in range(NUM_CPUS)]
    # Submit actor by actor, TOTAL_TASKS calls each.
    results = [
        actor.add.remote(str(i)) for actor in actors for i in range(TOTAL_TASKS)
    ]

    started_at = time.time()
    pb.fetch_until_complete(results)
    runtime_with_failure = time.time() - started_at
    print(f"Runtime when there are many failures: {runtime_with_failure}")
    pb.close()
def sample_boundaries(
    blocks: List[ObjectRef[Block]], key: SortKeyT, num_reducers: int
) -> List[T]:
    """
    Return (num_reducers - 1) items in ascending order from the blocks that
    partition the domain into ranges with approximately equally many elements.

    Args:
        blocks: References to the blocks to sample from.
        key: Sort key. If it is a list it must contain at most one entry;
            the first element of that entry is used as the column to read.
        num_reducers: Number of output partitions.

    Returns:
        A list of ``num_reducers - 1`` boundary values. Every entry is
        ``None`` when there is nothing to sample from (no blocks at all, or
        only empty samples).

    Raises:
        ValueError: If ``key`` is a list with more than one entry.
    """
    # TODO(Clark): Support multiple boundary sampling keys.
    if isinstance(key, list) and len(key) > 1:
        raise ValueError("Multiple boundary sampling keys not supported.")

    # With no blocks there is nothing to sample, and the per-block sample
    # count below would divide by zero. Treat it like an empty dataset.
    if len(blocks) == 0:
        return [None] * (num_reducers - 1)

    n_samples = int(num_reducers * 10 / len(blocks))

    sample_block = cached_remote_fn(_sample_block)

    sample_results = [
        sample_block.remote(block, n_samples, key) for block in blocks
    ]
    sample_bar = ProgressBar("Sort Sample", len(sample_results))
    samples = sample_bar.fetch_until_complete(sample_results)
    sample_bar.close()
    # Drop the references so the sampled objects can be released.
    del sample_results
    samples = [s for s in samples if len(s) > 0]
    # The dataset is empty
    if len(samples) == 0:
        return [None] * (num_reducers - 1)

    # Concatenate all per-block samples into a single block.
    builder = DelegatingBlockBuilder()
    for sample in samples:
        builder.add_block(sample)
    samples = builder.build()
    column = key[0][0] if isinstance(key, list) else None
    sample_items = BlockAccessor.for_block(samples).to_numpy(column)
    sample_items = np.sort(sample_items)
    # num_reducers evenly spaced quantiles (including the minimum at q=0),
    # each snapped to an actual sample value.
    ret = [
        np.quantile(sample_items, q, interpolation="nearest")
        for q in np.linspace(0, 1, num_reducers)
    ]
    # The q=0 quantile is not a boundary between partitions; drop it.
    return ret[1:]
def test_chaos_task_retry(set_kill_interval):
    """Sanity-check that retried nested tasks all finish under chaos.

    Submits 100 outer tasks, each of which sleeps and then blocks on one
    CPU-burning inner task, waits for all of them and prints the elapsed time.
    """

    # Chaos testing.
    @ray.remote(max_retries=-1)
    def task():
        # Burn some CPU by growing a string one random letter at a time.
        text = ""
        for _ in range(100000):
            text = text + random.choice(string.ascii_letters)
        return

    @ray.remote(max_retries=-1)
    def invoke_nested_task():
        time.sleep(0.8)
        inner_ref = task.remote()
        return ray.get(inner_ref)

    TOTAL_TASKS = 100

    pb = ProgressBar("Chaos test sanity check", TOTAL_TASKS)
    pending_refs = []
    for _ in range(TOTAL_TASKS):
        pending_refs.append(invoke_nested_task.remote())

    started_at = time.time()
    pb.block_until_complete(pending_refs)
    runtime_with_failure = time.time() - started_at
    print(f"Runtime when there are many failures: {runtime_with_failure}")
    pb.close()
def execute(
    self,
    input_blocks: BlockList,
    output_num_blocks: int,
    clear_input_blocks: bool,
    *,
    map_ray_remote_args: Optional[Dict[str, Any]] = None,
    reduce_ray_remote_args: Optional[Dict[str, Any]] = None,
) -> Tuple[BlockList, Dict[str, List[BlockMetadata]]]:
    """Run a two-stage (map, then reduce) shuffle over ``input_blocks``.

    Every input block is passed to one map task that returns its metadata
    plus ``output_num_blocks`` outputs. Reduce task ``j`` then receives the
    ``j``-th output of every map task and returns a block and its metadata.

    Args:
        input_blocks: Blocks to shuffle.
        output_num_blocks: Number of reduce tasks / output blocks.
        clear_input_blocks: Whether to clear ``input_blocks`` once the map
            tasks have been submitted.
        map_ray_remote_args: Options applied to every map task.
        reduce_ray_remote_args: Options applied to every reduce task. A
            ``"SPREAD"`` scheduling strategy is added when none is given;
            the caller's dict is not modified.

    Returns:
        The output ``BlockList`` and a stats dict with the fetched metadata
        of the ``"map"`` and ``"reduce"`` stages.
    """
    map_inputs = input_blocks.get_blocks()
    input_num_blocks = len(map_inputs)

    map_options = {} if map_ray_remote_args is None else map_ray_remote_args
    reduce_options = (
        {} if reduce_ray_remote_args is None else reduce_ray_remote_args
    )
    if "scheduling_strategy" not in reduce_options:
        # Work on a copy so the caller's dict stays untouched.
        reduce_options = reduce_options.copy()
        reduce_options["scheduling_strategy"] = "SPREAD"

    shuffle_map = cached_remote_fn(self.map)
    shuffle_reduce = cached_remote_fn(self.reduce)

    map_bar = ProgressBar("Shuffle Map", total=input_num_blocks)

    map_task = shuffle_map.options(
        **map_options,
        num_returns=1 + output_num_blocks,
    )
    shuffle_map_out = []
    for idx, block in enumerate(map_inputs):
        shuffle_map_out.append(
            map_task.remote(idx, block, output_num_blocks, *self._map_args)
        )
    # Do not let the loop variable keep the last input block alive.
    if input_num_blocks:
        del idx, block

    # The first item returned is the BlockMetadata; split it off in place so
    # only the partition references remain in shuffle_map_out.
    shuffle_map_metadata = []
    for idx in range(len(shuffle_map_out)):
        metadata_ref, *partition_refs = shuffle_map_out[idx]
        shuffle_map_metadata.append(metadata_ref)
        shuffle_map_out[idx] = partition_refs
    if shuffle_map_out:
        del metadata_ref, partition_refs

    # Eagerly delete the input block references in order to eagerly release
    # the blocks' memory.
    del map_inputs
    if clear_input_blocks:
        input_blocks.clear()
    shuffle_map_metadata = map_bar.fetch_until_complete(shuffle_map_metadata)
    map_bar.close()

    reduce_bar = ProgressBar("Shuffle Reduce", total=output_num_blocks)
    reduce_task = shuffle_reduce.options(**reduce_options, num_returns=2)
    shuffle_reduce_out = []
    for j in range(output_num_blocks):
        # Reducer j consumes the j-th partition of every mapper.
        partition_inputs = [shuffle_map_out[i][j] for i in range(input_num_blocks)]
        shuffle_reduce_out.append(
            reduce_task.remote(*self._reduce_args, *partition_inputs)
        )
    if output_num_blocks > 0:
        del partition_inputs

    # Eagerly delete the map block references in order to eagerly release
    # the blocks' memory.
    del shuffle_map_out
    new_blocks, reduce_metadata_refs = zip(*shuffle_reduce_out)
    new_metadata = reduce_bar.fetch_until_complete(list(reduce_metadata_refs))
    reduce_bar.close()

    stats = {
        "map": shuffle_map_metadata,
        "reduce": new_metadata,
    }

    return BlockList(list(new_blocks), list(new_metadata)), stats
def _apply(
    self,
    fn: Any,
    remote_args: dict,
    block_list: BlockList,
    clear_input_blocks: bool,
    name: Optional[str] = None,
) -> BlockList:
    """Apply ``fn`` to every block of ``block_list`` using a pool of actors.

    Note: this is not part of the Dataset public API.

    The pool starts with ``self.min_size`` actors and may grow up to
    ``self.max_size``. Each actor has at most
    ``self.max_tasks_in_flight_per_actor`` block tasks outstanding.

    Args:
        fn: Forwarded to ``_map_block_split`` / ``_map_block_nosplit`` for
            every block.
        remote_args: Options passed to ``ray.remote`` when creating the
            actor class. NOTE: this dict is written to in place
            (``num_cpus`` when it is empty, and always
            ``scheduling_strategy``), so the caller's dict is modified.
        block_list: Input blocks and their metadata.
        clear_input_blocks: Whether to clear ``block_list`` after its
            contents have been read.
        name: Progress bar label; defaults to ``"map"`` and is title-cased.

    Returns:
        A ``BlockList`` with the output blocks in input order.
    """
    context = DatasetContext.get_current()

    blocks_in = block_list.get_blocks_with_metadata()

    # Early release block references.
    if clear_input_blocks:
        block_list.clear()

    orig_num_blocks = len(blocks_in)
    results = []
    if name is None:
        name = "map"
    name = name.title()
    map_bar = ProgressBar(name, total=orig_num_blocks)

    class BlockWorker:
        # Called once per actor; its result is used to detect that the
        # actor has started.
        def ready(self):
            return "ok"

        def map_block_split(self, block: Block,
                            input_files: List[str]) -> BlockPartition:
            return _map_block_split(block, fn, input_files)

        @ray.method(num_returns=2)
        def map_block_nosplit(
                self, block: Block,
                input_files: List[str]) -> Tuple[Block, BlockMetadata]:
            return _map_block_nosplit(block, fn, input_files)

    # Default to one CPU per actor only when the caller gave no options.
    if not remote_args:
        remote_args["num_cpus"] = 1

    # NOTE(review): this overwrites any scheduling strategy the caller put
    # in remote_args -- confirm that is intended.
    remote_args["scheduling_strategy"] = context.scheduling_strategy

    BlockWorker = ray.remote(**remote_args)(BlockWorker)

    workers = [BlockWorker.remote() for _ in range(self.min_size)]
    # Maps every outstanding object ref (a ready() probe or a block task) to
    # the actor it was submitted to.
    tasks = {w.ready.remote(): w for w in workers}
    tasks_in_flight = collections.defaultdict(int)
    # Block ref -> metadata ref (only filled when block splitting is off).
    metadata_mapping = {}
    # Block ref -> index of its input block, used to restore input order.
    block_indices = {}
    # Actors whose ready() probe has completed.
    ready_workers = set()

    while len(results) < orig_num_blocks:
        ready, _ = ray.wait(list(tasks.keys()),
                            timeout=0.01,
                            num_returns=1,
                            fetch_local=False)
        if not ready:
            # Nothing finished within the timeout: add one more actor if the
            # pool is below max_size and more than 80% of the actors created
            # so far are ready.
            if (len(workers) < self.max_size
                    and len(ready_workers) / len(workers) > 0.8):
                w = BlockWorker.remote()
                workers.append(w)
                tasks[w.ready.remote()] = w
                map_bar.set_description(
                    "Map Progress ({} actors {} pending)".format(
                        len(ready_workers),
                        len(workers) - len(ready_workers)))
            continue

        [obj_id] = ready
        worker = tasks.pop(obj_id)

        # Process task result.
        if worker in ready_workers:
            # The actor was already ready, so this ref is a finished block
            # task.
            results.append(obj_id)
            tasks_in_flight[worker] -= 1
            map_bar.update(1)
        else:
            # First ref seen for this actor: it is the ready() probe.
            ready_workers.add(worker)
            map_bar.set_description(
                "Map Progress ({} actors {} pending)".format(
                    len(ready_workers),
                    len(workers) - len(ready_workers)))

        # Schedule a new task.
        while (blocks_in and tasks_in_flight[worker] <
               self.max_tasks_in_flight_per_actor):
            # Blocks are taken from the end of the list.
            block, meta = blocks_in.pop()
            if context.block_splitting_enabled:
                ref = worker.map_block_split.remote(
                    block, meta.input_files)
            else:
                ref, meta_ref = worker.map_block_nosplit.remote(
                    block, meta.input_files)
                metadata_mapping[ref] = meta_ref
            tasks[ref] = worker
            # After popping from the end, the remaining length equals the
            # popped block's original position.
            block_indices[ref] = len(blocks_in)
            tasks_in_flight[worker] += 1

    map_bar.close()
    new_blocks, new_metadata = [], []
    # Put blocks in input order.
    results.sort(key=block_indices.get)
    if context.block_splitting_enabled:
        # Each result is iterated as (block, metadata) pairs.
        for result in ray.get(results):
            for block, metadata in result:
                new_blocks.append(block)
                new_metadata.append(metadata)
    else:
        for block in results:
            new_blocks.append(block)
            new_metadata.append(metadata_mapping[block])
        # Metadata was returned as separate refs; fetch it all at once.
        new_metadata = ray.get(new_metadata)
    return BlockList(new_blocks, new_metadata)
def execute(
    self,
    input_blocks: BlockList,
    output_num_blocks: int,
    clear_input_blocks: bool,
    *,
    map_ray_remote_args: Optional[Dict[str, Any]] = None,
    reduce_ray_remote_args: Optional[Dict[str, Any]] = None,
    merge_factor: int = 2,
) -> Tuple[BlockList, Dict[str, List[BlockMetadata]]]:
    """Run the push-based shuffle: pipelined map/merge rounds, then reduce.

    Args:
        input_blocks: Blocks to shuffle.
        output_num_blocks: Number of output blocks.
        clear_input_blocks: Whether to clear ``input_blocks`` up front.
        map_ray_remote_args: Options applied to every map task.
        reduce_ray_remote_args: Options applied to every reduce task; any
            ``scheduling_strategy`` entry is dropped (on a copy).
        merge_factor: Passed to ``_compute_shuffle_schedule``.

    Returns:
        The output ``BlockList`` and a stats dict holding the metadata of
        the ``"map"``, ``"merge"`` and ``"reduce"`` stages.
    """
    logger.info("Using experimental push-based shuffle.")
    # TODO(swang): For jobs whose reduce work is heavier than the map work,
    # we should support fractional merge factors.
    # TODO(swang): For large jobs, we should try to choose the merge factor
    # automatically, e.g., by running one test round of map and merge tasks
    # and comparing their run times.
    # TODO(swang): Add option to automatically reduce write amplification
    # during map-merge stage, by limiting how many partitions can be
    # processed concurrently.
    input_blocks_list = input_blocks.get_blocks()
    # Preemptively clear the blocks list since we will incrementally delete
    # the last remaining references as we submit the dependent map tasks
    # during the map-merge stage.
    if clear_input_blocks:
        input_blocks.clear()

    map_options = {} if map_ray_remote_args is None else map_ray_remote_args
    # The placement strategy for reduce tasks is overwritten to colocate
    # them with their inputs from the merge stage, so remove any
    # pre-specified scheduling strategy here.
    reduce_options = (
        {} if reduce_ray_remote_args is None else reduce_ray_remote_args
    ).copy()
    reduce_options.pop("scheduling_strategy", None)

    map_fn = self._map_partition
    merge_fn = self._merge

    def map_partition(*args, **kwargs):
        return map_fn(self.map, *args, **kwargs)

    def merge(*args, **kwargs):
        return merge_fn(self.reduce, *args, **kwargs)

    shuffle_map = cached_remote_fn(map_partition)
    shuffle_merge = cached_remote_fn(merge)

    def submit_map_task(arg):
        mapper_idx, block = arg
        # NOTE(swang): Results are shuffled between map and merge tasks, so
        # there is no advantage to colocating specific map and merge tasks.
        # Therefore, we do not specify a node affinity policy for map tasks
        # in case the caller or Ray has a better scheduling strategy, e.g.,
        # based on data locality.
        map_task = shuffle_map.options(
            **map_options,
            num_returns=1 + schedule.num_merge_tasks_per_round,
        )
        map_result = map_task.remote(
            mapper_idx,
            block,
            output_num_blocks,
            schedule,
            *self._map_args,
        )
        # The first return value is the metadata; the rest are the outputs.
        metadata_ref = map_result.pop(0)
        return metadata_ref, map_result

    def submit_merge_task(arg):
        merge_idx, map_results = arg
        num_merge_returns = schedule.get_num_reducers_per_merge_idx(merge_idx)
        merge_task = shuffle_merge.options(
            num_returns=1 + num_merge_returns,
            **schedule.get_merge_task_options(merge_idx),
        )
        merge_result = merge_task.remote(
            *map_results,
            reduce_args=self._reduce_args,
        )
        metadata_ref = merge_result.pop(0)
        return metadata_ref, merge_result

    # Compute all constants used for task scheduling.
    num_cpus_per_node_map = _get_num_cpus_per_node_map()
    schedule = self._compute_shuffle_schedule(
        num_cpus_per_node_map,
        len(input_blocks_list),
        merge_factor,
        output_num_blocks,
    )

    # ObjectRef results from the last round of tasks. Used to add
    # backpressure during pipelining of map and merge tasks.
    last_map_metadata_results = []
    last_merge_metadata_results = []
    # Final outputs from the map-merge stage.
    # This is a map from merge task index to a nested list of merge results
    # (ObjectRefs). Each merge task index corresponds to a partition of P
    # final reduce tasks.
    all_merge_results = [[] for _ in range(schedule.num_merge_tasks_per_round)]
    shuffle_map_metadata = []
    shuffle_merge_metadata = []
    map_bar = ProgressBar("Shuffle Map", position=0, total=len(input_blocks_list))

    # Execute the map-merge stage. This submits tasks in rounds of M map
    # tasks and N merge tasks each. Task execution between map and merge is
    # pipelined, so that while executing merge for one round of inputs, we
    # also execute the map tasks for the following round.
    input_blocks_list = list(enumerate(input_blocks_list))
    while input_blocks_list:
        # Execute one round of the map stage.
        # Pop from the inputs so that we can clear the memory ASAP.
        round_size = min(schedule.num_map_tasks_per_round, len(input_blocks_list))
        round_input_blocks = [input_blocks_list.pop(0) for _ in range(round_size)]

        map_stage_out = _execute_pipelined_stage(
            submit_map_task,
            last_map_metadata_results,
            round_input_blocks,
            progress_bar=map_bar,
        )
        prev_map_metadata, last_map_metadata_results, map_results = map_stage_out
        del map_stage_out
        shuffle_map_metadata += prev_map_metadata

        # Shuffle the map results for the merge tasks: merge task i takes the
        # i-th output of every map task in this round.
        merge_args = []
        for merge_idx in range(schedule.num_merge_tasks_per_round):
            merge_inputs = [map_result.pop(0) for map_result in map_results]
            merge_args.append((merge_idx, merge_inputs))
        # All map outputs must have been consumed.
        assert not any(map_results)

        # Execute one round of the merge stage.
        merge_stage_out = _execute_pipelined_stage(
            submit_merge_task,
            last_merge_metadata_results,
            merge_args,
        )
        prev_merge_metadata, last_merge_metadata_results, merge_results = (
            merge_stage_out
        )
        del merge_stage_out
        shuffle_merge_metadata += prev_merge_metadata
        for merge_idx, merge_result in enumerate(merge_results):
            all_merge_results[merge_idx].append(merge_result)
        del merge_results

    # Wait for last map and merge tasks to finish.
    prev_map_metadata = _execute_pipelined_stage(
        None, last_map_metadata_results, [], progress_bar=map_bar
    )[0]
    shuffle_map_metadata += prev_map_metadata
    map_bar.close()
    prev_merge_metadata = _execute_pipelined_stage(
        None, last_merge_metadata_results, []
    )[0]
    shuffle_merge_metadata += prev_merge_metadata

    # Execute and wait for the reduce stage.
    new_metadata, new_blocks = self._execute_reduce_stage(
        output_num_blocks, schedule, reduce_options, all_merge_results
    )

    stats = {
        "map": shuffle_map_metadata,
        "merge": shuffle_merge_metadata,
        "reduce": new_metadata,
    }

    return BlockList(list(new_blocks), list(new_metadata)), stats
def fast_repartition(blocks, num_blocks):
    """Repartition ``blocks`` into ``num_blocks`` blocks without a shuffle.

    The data is split at ``num_blocks - 1`` row indices, each non-empty split
    is coalesced into a single block by a reduce task, and the result is
    padded with empty blocks until it contains ``num_blocks`` blocks.

    Args:
        blocks: The input blocks, wrapped in an ``ExecutionPlan``.
        num_blocks: The number of output blocks.

    Returns:
        A ``(BlockList, stats)`` pair; the stats dict is always empty.
    """
    from ray.data.dataset import Dataset

    wrapped_ds = Dataset(
        ExecutionPlan(blocks, DatasetStats(stages={}, parent=None)), 0, lazy=False
    )
    # Compute the (n-1) indices needed for an equal split of the data.
    count = wrapped_ds.count()
    dataset_format = wrapped_ds._dataset_format()
    indices = []
    cur_idx = 0
    for _ in range(num_blocks - 1):
        cur_idx += count / num_blocks
        indices.append(int(cur_idx))
    assert len(indices) < num_blocks, (indices, num_blocks)
    if indices:
        splits = wrapped_ds.split_at_indices(indices)
    else:
        splits = [wrapped_ds]
    # TODO(ekl) include stats for the split tasks. We may also want to
    # consider combining the split and coalesce tasks as an optimization.

    # Coalesce each split into a single block.
    reduce_task = cached_remote_fn(_ShufflePartitionOp.reduce).options(num_returns=2)
    reduce_bar = ProgressBar("Repartition", position=0, total=len(splits))
    reduce_out = [
        reduce_task.remote(False, None, *s.get_internal_block_refs())
        for s in splits
        if s.num_blocks() > 0
    ]

    # Early-release memory.
    del splits, blocks, wrapped_ds

    # If every split was empty, no reduce task was submitted. Unpacking
    # zip(*[]) into two names would raise ValueError, so start from empty
    # lists and let the padding below produce all output blocks.
    new_blocks, new_metadata = [], []
    if reduce_out:
        block_refs, metadata_refs = zip(*reduce_out)
        new_blocks = list(block_refs)
        new_metadata = reduce_bar.fetch_until_complete(list(metadata_refs))
    reduce_bar.close()

    # Handle empty blocks.
    if len(new_blocks) < num_blocks:
        from ray.data._internal.arrow_block import ArrowBlockBuilder
        from ray.data._internal.pandas_block import PandasBlockBuilder
        from ray.data._internal.simple_block import SimpleBlockBuilder

        num_empties = num_blocks - len(new_blocks)
        # Build the empty block in the same format as the dataset.
        if dataset_format == "arrow":
            builder = ArrowBlockBuilder()
        elif dataset_format == "pandas":
            builder = PandasBlockBuilder()
        else:
            builder = SimpleBlockBuilder()
        empty_block = builder.build()
        empty_meta = BlockAccessor.for_block(empty_block).get_metadata(
            input_files=None, exec_stats=None
        )  # No stats for empty block.
        # Each padding block is stored separately; they all share one
        # metadata object.
        empty_blocks, empty_metadata = zip(
            *[(ray.put(empty_block), empty_meta) for _ in range(num_empties)]
        )
        new_blocks += empty_blocks
        new_metadata += empty_metadata

    return BlockList(new_blocks, new_metadata), {}
def _apply(
    self,
    block_fn: BlockTransform,
    remote_args: dict,
    block_list: BlockList,
    clear_input_blocks: bool,
    name: Optional[str] = None,
    fn: Optional[UDF] = None,
    fn_args: Optional[Iterable[Any]] = None,
    fn_kwargs: Optional[Dict[str, Any]] = None,
    fn_constructor_args: Optional[Iterable[Any]] = None,
    fn_constructor_kwargs: Optional[Dict[str, Any]] = None,
) -> BlockList:
    """Apply ``block_fn`` to every block of ``block_list`` using an actor pool.

    Note: this is not part of the Dataset public API.

    The pool starts with ``self.min_size`` actors and may grow up to
    ``self.max_size``. Each actor has at most
    ``self.max_tasks_in_flight_per_actor`` block tasks outstanding. If
    anything in the scheduling or collection phase raises an ``Exception``,
    all actors are killed (best effort) and the exception is re-raised.

    Args:
        block_fn: Block transform forwarded to ``_map_block_split`` /
            ``_map_block_nosplit``.
        remote_args: Options passed to ``ray.remote`` for the actor class.
            ``num_cpus`` defaults to 1 and ``scheduling_strategy`` to
            ``"SPREAD"`` (or the context's non-default strategy). The
            caller's dict is left unmodified.
        block_list: Input blocks and their metadata.
        clear_input_blocks: Whether to clear ``block_list`` after its
            contents have been read.
        name: Progress bar label; defaults to ``"map"`` and is title-cased.
        fn: The UDF. If it is a ``CallableClass`` it is instantiated once
            per actor with the constructor arguments.
        fn_args: Positional arguments forwarded with every block task.
        fn_kwargs: Keyword arguments forwarded with every block task.
        fn_constructor_args: Positional arguments for a ``CallableClass`` UDF.
        fn_constructor_kwargs: Keyword arguments for a ``CallableClass`` UDF.

    Returns:
        A ``BlockList`` with the output blocks in input order.
    """
    if fn_args is None:
        fn_args = tuple()
    if fn_kwargs is None:
        fn_kwargs = {}
    if fn_constructor_args is None:
        fn_constructor_args = tuple()
    if fn_constructor_kwargs is None:
        fn_constructor_kwargs = {}

    context = DatasetContext.get_current()

    blocks_in = block_list.get_blocks_with_metadata()

    # Early release block references.
    if clear_input_blocks:
        block_list.clear()

    orig_num_blocks = len(blocks_in)
    results = []
    if name is None:
        name = "map"
    name = name.title()
    map_bar = ProgressBar(name, total=orig_num_blocks)

    class BlockWorker:
        def __init__(
            self,
            *fn_constructor_args: Any,
            **fn_constructor_kwargs: Any,
        ):
            if not isinstance(fn, CallableClass):
                if fn_constructor_args or fn_constructor_kwargs:
                    raise ValueError(
                        "fn_constructor_{kw}args only valid for CallableClass "
                        f"UDFs, but got: {fn}"
                    )
                self.fn = fn
            else:
                # Instantiate the callable class once per actor.
                self.fn = fn(*fn_constructor_args, **fn_constructor_kwargs)

        # Called once per actor; its result is used to detect that the
        # actor has started.
        def ready(self):
            return "ok"

        def map_block_split(
            self,
            block: Block,
            input_files: List[str],
            *fn_args,
            **fn_kwargs,
        ) -> BlockPartition:
            return _map_block_split(
                block, block_fn, input_files, self.fn, *fn_args, **fn_kwargs
            )

        @ray.method(num_returns=2)
        def map_block_nosplit(
            self,
            block: Block,
            input_files: List[str],
            *fn_args,
            **fn_kwargs,
        ) -> Tuple[Block, BlockMetadata]:
            return _map_block_nosplit(
                block, block_fn, input_files, self.fn, *fn_args, **fn_kwargs
            )

    # Fill in defaults on a private copy so the caller's dict (which may be
    # reused for other calls) is never modified.
    remote_args = dict(remote_args)
    if "num_cpus" not in remote_args:
        remote_args["num_cpus"] = 1

    if "scheduling_strategy" not in remote_args:
        if context.scheduling_strategy == DEFAULT_SCHEDULING_STRATEGY:
            remote_args["scheduling_strategy"] = "SPREAD"
        else:
            remote_args["scheduling_strategy"] = context.scheduling_strategy

    BlockWorker = ray.remote(**remote_args)(BlockWorker)

    workers = [
        BlockWorker.remote(*fn_constructor_args, **fn_constructor_kwargs)
        for _ in range(self.min_size)
    ]
    # Maps every outstanding object ref (a ready() probe or a block task) to
    # the actor it was submitted to.
    tasks = {w.ready.remote(): w for w in workers}
    tasks_in_flight = collections.defaultdict(int)
    # Block ref -> metadata ref (only filled when block splitting is off).
    metadata_mapping = {}
    # Block ref -> index of its input block, used to restore input order.
    block_indices = {}
    # Actors whose ready() probe has completed.
    ready_workers = set()

    try:
        while len(results) < orig_num_blocks:
            ready, _ = ray.wait(
                list(tasks.keys()), timeout=0.01, num_returns=1, fetch_local=False
            )
            if not ready:
                # Nothing finished within the timeout: add one more actor if
                # the pool is below max_size and a large enough share of the
                # actors created so far are ready.
                if (
                    len(workers) < self.max_size
                    and len(ready_workers) / len(workers)
                    > self.ready_to_total_workers_ratio
                ):
                    w = BlockWorker.remote(
                        *fn_constructor_args, **fn_constructor_kwargs
                    )
                    workers.append(w)
                    tasks[w.ready.remote()] = w
                    map_bar.set_description(
                        "Map Progress ({} actors {} pending)".format(
                            len(ready_workers), len(workers) - len(ready_workers)
                        )
                    )
                continue

            [obj_id] = ready
            worker = tasks.pop(obj_id)

            # Process task result.
            if worker in ready_workers:
                # The actor was already ready, so this ref is a finished
                # block task.
                results.append(obj_id)
                tasks_in_flight[worker] -= 1
                map_bar.update(1)
            else:
                # First ref seen for this actor: it is the ready() probe.
                ready_workers.add(worker)
                map_bar.set_description(
                    "Map Progress ({} actors {} pending)".format(
                        len(ready_workers), len(workers) - len(ready_workers)
                    )
                )

            # Schedule a new task.
            while (
                blocks_in
                and tasks_in_flight[worker] < self.max_tasks_in_flight_per_actor
            ):
                # Blocks are taken from the end of the list.
                block, meta = blocks_in.pop()
                if context.block_splitting_enabled:
                    ref = worker.map_block_split.remote(
                        block,
                        meta.input_files,
                        *fn_args,
                        **fn_kwargs,
                    )
                else:
                    ref, meta_ref = worker.map_block_nosplit.remote(
                        block,
                        meta.input_files,
                        *fn_args,
                        **fn_kwargs,
                    )
                    metadata_mapping[ref] = meta_ref
                tasks[ref] = worker
                # After popping from the end, the remaining length equals
                # the popped block's original position.
                block_indices[ref] = len(blocks_in)
                tasks_in_flight[worker] += 1

        map_bar.close()
        self.num_workers += len(workers)
        new_blocks, new_metadata = [], []
        # Put blocks in input order.
        results.sort(key=block_indices.get)
        if context.block_splitting_enabled:
            # Each result is iterated as (block, metadata) pairs.
            for result in ray.get(results):
                for block, metadata in result:
                    new_blocks.append(block)
                    new_metadata.append(metadata)
        else:
            for block in results:
                new_blocks.append(block)
                new_metadata.append(metadata_mapping[block])
            # Metadata was returned as separate refs; fetch it all at once.
            new_metadata = ray.get(new_metadata)
        return BlockList(new_blocks, new_metadata)

    except Exception:
        # Best-effort cleanup: a failure while killing actors is logged and
        # must not hide the error that brought us here.
        try:
            for worker in workers:
                ray.kill(worker)
        except Exception as err:
            logger.exception(f"Error killing workers: {err}")
        # Re-raise the original failure. An interrupt raised during the
        # cleanup above propagates on its own instead of being replaced.
        raise
def run_actor_workload(total_num_cpus, smoke):
    """Run actor-based workload.

    Checks that unlimited actor restarts and unlimited task retries behave
    as expected: many report actors send sequence numbers to DB actors while
    failures are happening. If at-least-once execution is guaranteed upon
    failures, every sequence number must have reached a DB actor and the
    final assertion must hold.
    """

    @ray.remote(num_cpus=0)
    class DBActor:
        def __init__(self):
            self.letter_dict = set()

        def add(self, letter):
            self.letter_dict.add(letter)

        def get(self):
            return self.letter_dict

    @ray.remote(num_cpus=1, max_restarts=-1, max_task_retries=-1)
    class ReportActor:
        def __init__(self, db_actor):
            self.db_actor = db_actor

        def add(self, letter):
            # Block until the DB actor has recorded the value.
            ray.get(self.db_actor.add.remote(letter))

    NUM_CPUS = int(total_num_cpus)
    # For smoke mode, run less number of tasks
    multiplier = 1 if smoke else 2
    TOTAL_TASKS = int(300 * multiplier)

    # Request a sliver of the current node's "node:<ip>" resource for every
    # DB actor.
    current_node_ip = ray.worker.global_worker.node_ip_address
    node_resource = {f"node:{current_node_ip}": 0.001}
    db_actors = []
    for _ in range(NUM_CPUS):
        db_actors.append(DBActor.options(resources=node_resource).remote())

    pb = ProgressBar("Chaos test", TOTAL_TASKS * NUM_CPUS)
    # One report actor per DB actor.
    actors = [ReportActor.remote(db_actor) for db_actor in db_actors]

    # Every report actor sends TOTAL_TASKS consecutive sequence numbers.
    results = [
        actor.add.remote(str(actor_idx * TOTAL_TASKS + offset))
        for actor_idx, actor in enumerate(actors)
        for offset in range(TOTAL_TASKS)
    ]
    highest_reported_num = len(results)

    pb.fetch_until_complete(results)
    pb.close()
    for actor in actors:
        ray.kill(actor)

    def all_cpus_released():
        total_cpus = ray.cluster_resources().get("CPU", 0)
        free_cpus = ray.available_resources().get("CPU", 0)
        return total_cpus == free_cpus

    # Consistency check
    wait_for_condition(all_cpus_released, timeout=60)

    letter_set = set()
    for db_actor in db_actors:
        reported = ray.get(db_actor.get.remote())
        letter_set.update(reported)

    # Make sure the DB actor didn't lose any report.
    # If this assert fails, that means at least once actor task semantic
    # wasn't guaranteed.
    for seqno in range(highest_reported_num):
        assert str(seqno) in letter_set, seqno
def execute(
    self,
    input_blocks: BlockList,
    output_num_blocks: int,
    clear_input_blocks: bool,
    *,
    map_ray_remote_args: Optional[Dict[str, Any]] = None,
    reduce_ray_remote_args: Optional[Dict[str, Any]] = None,
    merge_factor: int = 2,
) -> Tuple[BlockList, Dict[str, List[BlockMetadata]]]:
    """Run the push-based shuffle using pipelined stage executors.

    The map and merge stages are driven in alternation, one step each, and
    the reduce stage is driven afterwards. The reduce outputs are sorted by
    the first element of each result tuple before being returned.

    Args:
        input_blocks: Blocks to shuffle.
        output_num_blocks: Number of output blocks; asserted against the
            number of reduce results.
        clear_input_blocks: Whether to clear ``input_blocks`` up front.
        map_ray_remote_args: Options applied to the map tasks.
        reduce_ray_remote_args: Options passed on to the reduce stage; any
            ``scheduling_strategy`` entry is removed from a copy first.
        merge_factor: Passed to ``_compute_shuffle_schedule``.

    Returns:
        The output ``BlockList`` and a stats dict holding the metadata
        collected for the ``"map"``, ``"merge"`` and ``"reduce"`` stages.
    """
    logger.info("Using experimental push-based shuffle.")
    # TODO(swang): For jobs whose reduce work is heavier than the map work,
    # we should support fractional merge factors.
    # TODO(swang): For large jobs, we should try to choose the merge factor
    # automatically, e.g., by running one test round of map and merge tasks
    # and comparing their run times.
    # TODO(swang): Add option to automatically reduce write amplification
    # during map-merge stage, by limiting how many partitions can be
    # processed concurrently.
    input_blocks_list = input_blocks.get_blocks()
    # Preemptively clear the blocks list since we will incrementally delete
    # the last remaining references as we submit the dependent map tasks
    # during the map-merge stage.
    if clear_input_blocks:
        input_blocks.clear()

    if map_ray_remote_args is None:
        map_ray_remote_args = {}
    if reduce_ray_remote_args is None:
        reduce_ray_remote_args = {}
    # The placement strategy for reduce tasks is overwritten to colocate
    # them with their inputs from the merge stage, so remove any
    # pre-specified scheduling strategy here.
    reduce_ray_remote_args = reduce_ray_remote_args.copy()
    reduce_ray_remote_args.pop("scheduling_strategy", None)

    # Compute all constants used for task scheduling.
    num_cpus_per_node_map = _get_num_cpus_per_node_map()
    stage = self._compute_shuffle_schedule(
        num_cpus_per_node_map,
        len(input_blocks_list),
        merge_factor,
        output_num_blocks,
    )

    map_fn = self._map_partition
    merge_fn = self._merge

    # Thin wrappers that bind this operator's map/reduce functions as the
    # first argument of the partition/merge helpers.
    def map_partition(*args, **kwargs):
        return map_fn(self.map, *args, **kwargs)

    def merge(*args, **kwargs):
        return merge_fn(self.reduce, *args, **kwargs)

    shuffle_map = cached_remote_fn(map_partition)
    # One return value more than there are merge tasks per round.
    shuffle_map = shuffle_map.options(
        **map_ray_remote_args,
        num_returns=1 + stage.num_merge_tasks_per_round,
    )

    map_stage_iter = _MapStageIterator(
        input_blocks_list,
        shuffle_map,
        [output_num_blocks, stage.merge_schedule, *self._map_args],
    )
    map_bar = ProgressBar("Shuffle Map", position=0, total=len(input_blocks_list))
    map_stage_executor = _PipelinedStageExecutor(
        map_stage_iter, stage.num_map_tasks_per_round, progress_bar=map_bar)

    shuffle_merge = cached_remote_fn(merge)
    # The merge iterator is constructed from the map iterator.
    merge_stage_iter = _MergeStageIterator(map_stage_iter, shuffle_merge, stage,
                                           self._reduce_args)
    merge_stage_executor = _PipelinedStageExecutor(
        merge_stage_iter,
        stage.num_merge_tasks_per_round,
        max_concurrent_rounds=2)

    # Execute the map-merge stage. This submits tasks in rounds of M map
    # tasks and N merge tasks each. Task execution between map and merge is
    # pipelined, so that while executing merge for one round of inputs, we
    # also execute the map tasks for the following round.
    map_done = False
    merge_done = False
    map_stage_metadata = []
    merge_stage_metadata = []
    # NOTE(review): both handlers `break`, so the loop stops as soon as
    # either executor is exhausted and the *_done flags are never read
    # again. Any metadata the other executor has not yet yielded is not
    # collected here -- confirm against _PipelinedStageExecutor that this
    # is intended.
    while not (map_done and merge_done):
        try:
            map_stage_metadata += next(map_stage_executor)
        except StopIteration:
            map_done = True
            break

        try:
            merge_stage_metadata += next(merge_stage_executor)
        except StopIteration:
            merge_done = True
            break

    map_bar.close()
    all_merge_results = merge_stage_iter.pop_merge_results()

    # Execute and wait for the reduce stage.
    reduce_bar = ProgressBar("Shuffle Reduce", total=output_num_blocks)
    shuffle_reduce = cached_remote_fn(self.reduce)
    reduce_stage_iter = _ReduceStageIterator(
        stage,
        shuffle_reduce,
        all_merge_results,
        reduce_ray_remote_args,
        self._reduce_args,
    )

    # By default the per-round limit handed to the executor is the number
    # of output blocks.
    max_reduce_tasks_in_flight = output_num_blocks
    ctx = DatasetContext.get_current()
    if ctx.pipeline_push_based_shuffle_reduce_tasks:
        # If pipelining is enabled, we should still try to utilize all
        # cores.
        max_reduce_tasks_in_flight = min(
            max_reduce_tasks_in_flight, sum(num_cpus_per_node_map.values()))

    reduce_stage_executor = _PipelinedStageExecutor(
        reduce_stage_iter,
        max_reduce_tasks_in_flight,
        max_concurrent_rounds=2,
        progress_bar=reduce_bar,
    )
    reduce_stage_metadata = []
    # Drain the reduce executor completely.
    while True:
        try:
            reduce_stage_metadata += next(reduce_stage_executor)
        except StopIteration:
            break

    new_blocks = reduce_stage_iter.pop_reduce_results()
    # Pair every reduce result with the metadata at the same position, then
    # order the triples by the first element of the result tuple
    # (presumably the output block index -- verify against
    # _ReduceStageIterator).
    sorted_blocks = [(block[0], block[1], reduce_stage_metadata[i])
                     for i, block in enumerate(new_blocks)]
    sorted_blocks.sort(key=lambda x: x[0])
    _, new_blocks, reduce_stage_metadata = zip(*sorted_blocks)
    del sorted_blocks

    assert (
        len(new_blocks) == output_num_blocks
    ), f"Expected {output_num_blocks} outputs, produced {len(new_blocks)}"
    reduce_bar.close()

    stats = {
        "map": map_stage_metadata,
        "merge": merge_stage_metadata,
        "reduce": reduce_stage_metadata,
    }

    return BlockList(list(new_blocks), list(reduce_stage_metadata)), stats