def sort_main():
    """Run a one-round distributed sort: one mapper per input partition,
    one reducer per key range, then write the output manifest.

    Reads partition locations from the input manifest, range-partitions
    each input part across ``N`` reducers, and gathers the reducer
    results into ``constants.OUTPUT_MANIFEST_FILE`` (unless
    ``args.skip_output`` is set).
    """
    N = args.num_parts
    partitions = _load_manifest(constants.INPUT_MANIFEST_FILE)
    boundaries = sortlib.get_boundaries(N)
    # mapper_results[m, r] holds the ObjectRef for mapper m's block
    # destined for reducer r.
    mapper_results = np.empty((N, N), dtype=object)
    for part_id, node, path in partitions:
        if args.skip_input:
            opt = {}
        else:
            # Pin the mapper near its input partition and reserve memory
            # for the partition plus intermediate output.
            opt = {
                "resources": {
                    f"node:{node}": 1 / args.num_parts
                },
                "memory": args.part_size * 2,
            }
        # BUG FIX: without num_returns=N the remote call yields a single
        # ObjectRef which numpy broadcasts across the whole row, so every
        # reducer would fetch the mapper's entire output instead of its
        # own block. N refs — one per reducer — is what the reduce loop
        # below expects.
        opt["num_returns"] = N
        mapper_results[part_id, :] = mapper.options(**opt).remote(
            boundaries, part_id, path)

    reducer_results = []
    for r in range(N):
        # The original if/else on args.skip_output had two identical
        # branches; collapsed to the single shared value.
        opt = {"memory": args.part_size * 2}
        blocks = mapper_results[:, r].tolist()
        ret = reducer.options(**opt).remote(r, *blocks)
        reducer_results.append(ret)

    reducer_results = ray.get(reducer_results)
    # BUG FIX: honor --skip_output. Previously the manifest was written
    # unconditionally even though the flag was (vacuously) checked above.
    if not args.skip_output:
        with open(constants.OUTPUT_MANIFEST_FILE, "w") as fout:
            writer = csv.writer(fout)
            writer.writerows(reducer_results)
def sort_main():
    """Distributed sort driver: fan out mapper tasks over the input
    partitions, reduce each key range, and record the results.

    Mapper placement/memory options are applied only when real input is
    read (``args.skip_input`` disables them); the output manifest is
    written only when ``args.skip_output`` is unset.
    """
    partitions = _load_manifest(constants.INPUT_MANIFEST_FILE)
    boundaries = sortlib.get_boundaries(args.num_reducers)

    # Grid of ObjectRefs: row = mapper, column = destination reducer.
    mapper_results = np.empty((args.num_mappers, args.num_reducers),
                              dtype=object)
    for part_id, node, path in partitions:
        if args.skip_input:
            task_opts = {}
        else:
            # Prefer the node holding the partition and reserve headroom
            # beyond the raw part size.
            task_opts = {
                "resources": {
                    f"node:{node}": 1 / args.num_mappers
                },
                "memory": args.input_part_size * 1.2,
            }
        # One return value per reducer so each column gets its own block.
        task_opts["num_returns"] = args.num_reducers
        mapper_results[part_id, :] = mapper.options(**task_opts).remote(
            boundaries, part_id, path)

    # One reducer per key range, each consuming its column of blocks.
    reducer_results = [
        reducer.options(memory=args.output_part_size * 1.0).remote(
            reduce_id, *mapper_results[:, reduce_id].tolist())
        for reduce_id in range(args.num_reducers)
    ]
    reducer_results = ray.get(reducer_results)

    if not args.skip_output:
        with open(constants.OUTPUT_MANIFEST_FILE, "w") as fout:
            writer = csv.writer(fout)
            writer.writerows(reducer_results)
def sort_main(args: Args):
    """Multi-round, two-stage distributed sort.

    Each round maps a batch of input partitions and immediately merges the
    mapped blocks per reducer; after all rounds, a final merge per reducer
    produces the sorted output. In-flight rounds are bounded by
    ``args.num_concurrent_rounds`` via ``ray.wait`` backpressure.
    """
    parts = _load_manifest(args, constants.INPUT_MANIFEST_FILE)
    assert len(parts) == args.num_mappers
    boundaries = sortlib.get_boundaries(args.num_reducers)
    mapper_opt = {
        "num_returns": args.num_reducers,
        "num_cpus": os.cpu_count() / args.num_concurrent_rounds,
    }  # Load balance across worker nodes by setting `num_cpus`.
    # merge_results[round, r] = ref to reducer r's merged block for that round.
    merge_results = np.empty((args.num_rounds, args.num_reducers),
                             dtype=object)
    part_id = 0
    # Placement groups keep each reducer's merge/final-merge tasks co-located;
    # they must stay alive until ray.get below completes.
    with worker_placement_groups(args) as pgs:
        for round in range(args.num_rounds):
            # Limit the number of in-flight rounds.
            # Block until enough earlier merge tasks finish that at most
            # `num_concurrent_rounds` rounds remain outstanding.
            num_extra_rounds = round - args.num_concurrent_rounds + 1
            if num_extra_rounds > 0:
                ray.wait(
                    [f for f in merge_results.flatten() if f is not None],
                    num_returns=num_extra_rounds * args.num_reducers,
                )
            # Submit map tasks.
            mapper_results = np.empty(
                (args.num_mappers_per_round, args.num_reducers), dtype=object)
            for _ in range(args.num_mappers_per_round):
                _, node, path = parts[part_id]
                # Row index within this round's batch of mappers.
                m = part_id % args.num_mappers_per_round
                mapper_results[m, :] = mapper.options(**mapper_opt).remote(
                    args, part_id, boundaries, path)
                part_id += 1
            # Submit merge tasks.
            # Each reducer r merges its column of this round's mapper blocks
            # on its own placement group.
            merge_results[round, :] = [
                merge_mapper_blocks.options(placement_group=pgs[r]).remote(
                    args, r, round, *mapper_results[:, r].tolist())
                for r in range(args.num_reducers)
            ]
            # Delete local references to mapper results.
            # Dropping the refs lets Ray reclaim mapper outputs once the
            # merge tasks have consumed them.
            mapper_results = None
        # Submit second-stage reduce tasks.
        reducer_results = [
            final_merge.options(placement_group=pgs[r]).remote(
                args, r, *merge_results[:, r].tolist())
            for r in range(args.num_reducers)
        ]
        reducer_results = ray.get(reducer_results)
    if not args.skip_output:
        with open(constants.OUTPUT_MANIFEST_FILE, "w") as fout:
            writer = csv.writer(fout)
            writer.writerows(reducer_results)
    # NOTE(review): internal API — may change between Ray versions.
    logging.info(ray.internal.internal_api.memory_summary(stats_only=True))