def test_deprecated_local_processes(self):
    with pytest.warns(UserWarning, match="local_processes"):
        executor = DaskExecutor(
            cluster_class="distributed.LocalCluster",
            client_kwargs={"set_as_default": True},
            local_processes=True,
        )
    assert executor.cluster_class == distributed.LocalCluster
    assert executor.cluster_kwargs == {
        "processes": True,
        "silence_logs": logging.CRITICAL,
    }
    assert executor.client_kwargs == {"set_as_default": True}

    # When not using a LocalCluster, `local_processes` warns, but isn't
    # added to the kwargs
    with pytest.warns(UserWarning, match="local_processes"):

        class TestCluster(object):
            pass

        executor = DaskExecutor(cluster_class=TestCluster, local_processes=True)

    assert executor.cluster_class == TestCluster
    assert executor.cluster_kwargs == {}
    assert executor.client_kwargs == {"set_as_default": False}
def test_local_cluster_adapt(self):
    adapt_kwargs = {"minimum": 1, "maximum": 1}
    called_with = None

    class MyCluster(distributed.LocalCluster):
        def adapt(self, **kwargs):
            nonlocal called_with
            called_with = kwargs
            super().adapt(**kwargs)

    executor = DaskExecutor(
        cluster_class=MyCluster,
        cluster_kwargs={"processes": False, "n_workers": 0},
        adapt_kwargs=adapt_kwargs,
    )
    assert executor.adapt_kwargs == adapt_kwargs
    with executor.start():
        res = executor.wait(executor.submit(lambda x: x + 1, 1))
        assert res == 2
    assert called_with == adapt_kwargs
def test_init_kwargs_are_passed_to_init(self, monkeypatch):
    client = MagicMock()
    monkeypatch.setattr(prefect.engine.executors.dask, "Client", client)
    executor = DaskExecutor(test_kwarg="test_value")
    with executor.start():
        pass
    assert client.called
    assert client.call_args[-1]["test_kwarg"] == "test_value"
def test_task_names_are_passed_to_submit(self, monkeypatch):
    client = MagicMock()
    monkeypatch.setattr(distributed, "Client", client)
    executor = DaskExecutor()
    with executor.start():
        with prefect.context(task_full_name="FISH!"):
            executor.submit(lambda: None)
    kwargs = client.return_value.__enter__.return_value.submit.call_args[1]
    assert kwargs["key"].startswith("FISH!")
def test_task_names_are_passed_to_map(self, monkeypatch):
    client = MagicMock()
    monkeypatch.setattr(prefect.engine.executors.dask, "Client", client)
    executor = DaskExecutor()
    with executor.start():
        with prefect.context(task_full_name="FISH![0]"):
            executor.map(lambda: None, [1, 2])
    kwargs = client.return_value.__enter__.return_value.map.call_args[1]
    assert kwargs["key"].startswith("FISH![0]")
def test_context_tags_are_passed_to_submit(self, monkeypatch):
    client = MagicMock()
    monkeypatch.setattr(distributed, "Client", client)
    executor = DaskExecutor()
    with executor.start():
        with prefect.context(task_tags=["dask-resource:GPU=1"]):
            executor.submit(lambda: None)
    kwargs = client.return_value.__enter__.return_value.submit.call_args[1]
    assert kwargs["resources"] == {"GPU": 1.0}
def test_context_tags_are_passed_to_map(self, monkeypatch):
    client = MagicMock()
    monkeypatch.setattr(prefect.engine.executors.dask, "Client", client)
    executor = DaskExecutor()
    with executor.start():
        with prefect.context(task_tags=["dask-resource:GPU=1"]):
            executor.map(lambda: None, [1, 2])
    kwargs = client.return_value.__enter__.return_value.map.call_args[1]
    assert kwargs["resources"] == {"GPU": 1.0}
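For context on the two resource-tag tests above: a `dask-resource:<name>=<value>` tag is turned into a Dask `resources` annotation, which only affects scheduling if at least one worker advertises a matching resource. The sketch below is not part of the test suite; it assumes that `LocalCluster` forwards extra worker keyword arguments such as `resources` (as recent `distributed` releases do).

# Hedged sketch, not from the original tests: start a one-worker cluster that
# advertises a "GPU" resource so tasks tagged "dask-resource:GPU=1" can run on it.
from distributed import LocalCluster
from prefect.engine.executors import DaskExecutor

cluster = LocalCluster(n_workers=1, threads_per_worker=1, resources={"GPU": 1})
executor = DaskExecutor(address=cluster.scheduler_address)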
def test_connect_to_running_cluster(self):
    with distributed.Client(processes=False, set_as_default=False) as client:
        executor = DaskExecutor(address=client.scheduler.address)
        assert executor.address == client.scheduler.address
        assert executor.cluster_class is None
        assert executor.cluster_kwargs is None
        assert executor.client_kwargs == {"set_as_default": False}
        with executor.start():
            res = executor.wait(executor.submit(lambda x: x + 1, 1))
            assert res == 2
def test_start_local_cluster(self):
    executor = DaskExecutor(cluster_kwargs={"processes": False})
    assert executor.cluster_class == distributed.LocalCluster
    assert executor.cluster_kwargs == {
        "processes": False,
        "silence_logs": logging.CRITICAL,
    }
    with executor.start():
        res = executor.wait(executor.submit(lambda x: x + 1, 1))
        assert res == 2
def test_prep_dask_kwargs(self):
    executor = DaskExecutor()
    kwargs = executor._prep_dask_kwargs(
        dict(task_name="FISH!", task_tags=["dask-resource:GPU=1"])
    )
    assert kwargs["key"].startswith("FISH!-")
    assert kwargs["resources"] == {"GPU": 1.0}

    kwargs = executor._prep_dask_kwargs(
        dict(task_name="FISH!", task_tags=["dask-resource:GPU=1"], task_index=1)
    )
    assert kwargs["key"].startswith("FISH!-1-")
def mthread():
    "Multi-threaded executor"
    with Client(
        processes=False, scheduler_port=0, dashboard_address=":0", n_workers=2
    ) as client:
        yield DaskExecutor(client.scheduler.address)
def run(src_dir, dst_dir, debug=False):
    src_dir = Parameter("src_dir", src_dir)

    # create destination
    create_dir(dst_dir)
    dst_dir = Parameter("dst_dir", dst_dir)

    with Flow("classify_pipeline") as flow:
        # load data
        h5_paths = find_src_files(src_dir, "h5")
        info = preload_array_info(h5_paths)
        prob_map = read_prob_map.map(h5_paths, unmapped(info))

        # classify
        label = classify.map(prob_map)

        # save
        tiff_paths = build_path.map(unmapped(dst_dir), h5_paths, unmapped("tif"))
        write_tiff.map(tiff_paths, label)

    if debug:
        flow.visualize()
    else:
        client = get_client()
        executor = DaskExecutor(address=client.scheduler.address)
        flow.run(executor=executor)
def main():
    with Flow(
        "Check listings", environment=LocalEnvironment(executor=DaskExecutor())
    ) as flow:
        city = Parameter("city")

        ## Extract
        # get the current listings
        listings = get_current_listings(city)
        # fetch the pages
        pages = fetch_pages(listings, city)

        ## Transform
        # parse the listings
        data = parse_listings(pages)

        # Load
        save_listings(data)

    # flow.storage = Docker(registry_url="bramevert/craig")
    # flow.run_config = DockerRun(
    #     env={"GOOGLE_APPLICATION_CREDENTIALS": "/home/app/craiglist-crawler-a7aff758fc9d.json"},
    #     image="craig:latest",
    #     labels=["bram-desktop"],
    # )

    # flow.register(project_name="Craiglist Crawler")
    # flow.run_agent()
    flow.run(city="vancouver")
def run_flow(self) -> None:
    """
    Run the flow from specified flow_file_path location using a Dask executor
    """
    try:
        from prefect.engine import get_default_flow_runner_class
        from prefect.engine.executors import DaskExecutor
        from dask_kubernetes import KubeCluster

        with open(path.join(path.dirname(__file__), "worker_pod.yaml")) as pod_file:
            worker_pod = yaml.safe_load(pod_file)
            worker_pod = self._populate_worker_pod_yaml(yaml_obj=worker_pod)

            cluster = KubeCluster.from_dict(
                worker_pod, namespace=prefect.context.get("namespace")
            )
            cluster.adapt(minimum=1, maximum=1)

            # Load serialized flow from file and run it with a DaskExecutor
            with open(
                prefect.context.get(
                    "flow_file_path", "/root/.prefect/flow_env.prefect"
                ),
                "rb",
            ) as f:
                flow = cloudpickle.load(f)

                executor = DaskExecutor(address=cluster.scheduler_address)
                runner_cls = get_default_flow_runner_class()
                runner_cls(flow=flow).run(executor=executor)

                sys.exit(0)  # attempt to force resource cleanup
    except Exception as exc:
        self.logger.error("Unexpected error raised during flow run: {}".format(exc))
        raise exc
def test_executor_logs_worker_events(self, caplog):
    caplog.set_level(logging.DEBUG, logger="prefect")
    with distributed.Client(
        n_workers=1, processes=False, set_as_default=False
    ) as client:
        executor = DaskExecutor(address=client.scheduler.address)
        with executor.start():
            client.cluster.scale(4)
            while len(client.scheduler_info()["workers"]) < 4:
                time.sleep(0.1)
            client.cluster.scale(1)
            while len(client.scheduler_info()["workers"]) > 1:
                time.sleep(0.1)

    assert any("Worker %s added" == rec.msg for rec in caplog.records)
    assert any("Worker %s removed" == rec.msg for rec in caplog.records)
def test_cluster_class_and_kwargs(self):
    pytest.importorskip("distributed.deploy.spec")
    executor = DaskExecutor(
        cluster_class="distributed.deploy.spec.SpecCluster",
        cluster_kwargs={"some_kwarg": "some_val"},
        client_kwargs={"set_as_default": True},
    )
    assert executor.cluster_class == distributed.deploy.spec.SpecCluster
    assert executor.cluster_kwargs == {"some_kwarg": "some_val"}
    assert executor.client_kwargs == {"set_as_default": True}

    class TestCluster(object):
        pass

    executor = DaskExecutor(cluster_class=TestCluster)
    assert executor.cluster_class == TestCluster
def run(src_dir, dst_dir, debug=False):
    src_dir = Parameter("src_dir", src_dir)

    # create destination
    create_dir(dst_dir)
    dst_dir = Parameter("dst_dir", dst_dir)

    with Flow("convert_pipeline") as flow:
        # load data
        tiff_paths = find_src_files(src_dir, "tif")
        info = preload_array_info(tiff_paths)
        raw_data = read_tiff.map(tiff_paths, unmapped(info))

        # save as zarr for faster access
        zarr_paths = build_path.map(unmapped(dst_dir), tiff_paths, unmapped("zarr"))
        zarr_paths = write_zarr.map(zarr_paths, raw_data, unmapped("raw"))

        # convert
        h5_paths = build_path.map(unmapped(dst_dir), zarr_paths, unmapped("h5"))
        zarr_to_h5.map(zarr_paths, h5_paths)

    if debug:
        flow.visualize()
    else:
        client = get_client()
        executor = DaskExecutor(address=client.scheduler.address)
        flow.run(executor=executor)
def run(src_dir, dst_dir, config_path: str, debug=False):
    src_dir = Parameter("src_dir", src_dir)

    # create destination
    create_dir(dst_dir)
    dst_dir = Parameter("dst_dir", dst_dir)

    # inference configuration
    config_path = Parameter("config_path", config_path)

    with Flow("inference_pipeline") as flow:
        # list tiles
        tiff_paths = find_src_files(src_dir, "h5")
        parted_tiff_paths = partition_path_list(tiff_paths, 5)

        prob_paths = infer.map(
            parted_tiff_paths, unmapped(config_path), unmapped(dst_dir)
        )
        prob_paths = combine_path_list(prob_paths)

    if debug:
        flow.visualize(filename="flow_debug")
    else:
        client = get_client()
        executor = DaskExecutor(address=client.scheduler.address)
        flow.run(executor=executor)
def test_deprecated_client_kwargs(self):
    with pytest.warns(UserWarning, match="client_kwargs"):
        executor = DaskExecutor(
            cluster_class="distributed.LocalCluster",
            set_as_default=True,
        )
    assert executor.cluster_kwargs == {"silence_logs": logging.CRITICAL}
    assert executor.client_kwargs == {"set_as_default": True}
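For comparison, the non-deprecated spelling used by the other tests in this section routes Client options through `client_kwargs` instead of passing `set_as_default` directly; a minimal sketch:

# Equivalent construction without the deprecated top-level kwarg:
# Client options are routed through client_kwargs.
executor = DaskExecutor(
    cluster_class="distributed.LocalCluster",
    client_kwargs={"set_as_default": True},
)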
def run_flow(self) -> None:
    """
    Run the flow from specified flow_file_path location using a Dask executor
    """
    from prefect.engine import get_default_flow_runner_class
    from prefect.engine.executors import DaskExecutor
    from dask_kubernetes import KubeCluster

    with open(path.join(path.dirname(__file__), "worker_pod.yaml")) as pod_file:
        worker_pod = yaml.safe_load(pod_file)
        worker_pod = self._populate_worker_pod_yaml(yaml_obj=worker_pod)

        cluster = KubeCluster.from_dict(worker_pod)
        cluster.adapt(minimum=1, maximum=1)

        # Load serialized flow from file and run it with a DaskExecutor
        with open(
            prefect.context.get(
                "flow_file_path", "/root/.prefect/flow_env.prefect"
            ),
            "rb",
        ) as f:
            flow = cloudpickle.load(f)

            executor = DaskExecutor(address=cluster.scheduler_address)
            runner_cls = get_default_flow_runner_class()
            runner_cls(flow=flow).run(executor=executor)
def mproc():
    "Multi-processing executor"
    with Client(
        processes=True, scheduler_port=0, dashboard_address=":0", n_workers=2
    ) as client:
        yield DaskExecutor(client.scheduler.address)
def mproc():
    "Multi-processing executor"
    with Client(processes=True) as client:
        yield DaskExecutor(client.scheduler.address, local_processes=True)
        try:
            client.shutdown()
        except:
            pass
def prepare_executor(executor_type, executor_address=None):
    """Instantiate a prefect executor"""
    if executor_type == "dask":
        if executor_address is not None:
            executor = DaskExecutor(executor_address)
        else:
            executor = DaskExecutor(local_processes=True)
    elif executor_type == "synchronous":
        executor = SynchronousExecutor()
    elif executor_type == "local":
        executor = LocalExecutor()
    else:
        # Should not happen if click parameters are done correctly, but
        # kept for completeness
        raise ValueError(f'Unknown executor type "{executor_type}".')
    return executor
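A hypothetical call site for `prepare_executor`, showing how the returned executor feeds into `flow.run`; the scheduler address and the `flow` object below are placeholders, not taken from the original project:

# Hypothetical usage sketch; the address and `flow` are illustrative only.
executor = prepare_executor("dask", executor_address="tcp://127.0.0.1:8786")
flow.run(executor=executor)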
def test_temporary_cluster_forcefully_cancels_pending_tasks(self, tmpdir):
    filename = tmpdir.join("signal")

    def slow():
        time.sleep(10)
        with open(filename, "w") as f:
            f.write("Got here")

    executor = DaskExecutor()
    with executor.start():
        start = time.time()
        fut = executor.submit(slow)  # noqa
        time.sleep(0.1)
    stop = time.time()

    # Cluster shutdown before task could complete
    assert stop - start < 5
    assert not os.path.exists(filename)
def mthread():
    "Multi-threaded executor"
    with Client(processes=False) as client:
        yield DaskExecutor(client.scheduler.address)
        try:
            client.shutdown()
        except:
            pass
def main(config=config):
    """Executes vektis.agb.flow in DaskExecutor."""
    executor = DaskExecutor(n_workers=8)
    flow.run(
        executor=executor,
        parameters={"gcp": config.gcp},
    )
def run(
    self,
    clean: bool = False,
    debug: bool = False,
    **kwargs,
):
    """
    Run a flow with your steps.

    Parameters
    ----------
    clean: bool
        Should the local staging directory be cleaned prior to this run.
        Default: False (Do not clean)
    debug: bool
        A debug flag for the developer to use to manipulate how much data runs,
        how it is processed, etc.
        Default: False (Do not debug)

    Notes
    -----
    Documentation on prefect:
    https://docs.prefect.io/core/

    Basic prefect example:
    https://docs.prefect.io/core/
    """
    # Initialize steps
    raw = steps.Raw()

    # Choose executor
    if debug:
        exe = LocalExecutor()
    else:
        # Set up connection to computation cluster
        cluster = LocalCluster()

        # Inform of Dask UI
        log.info(f"Cluster dashboard available at: {cluster.dashboard_link}")

        # Create dask executor
        exe = DaskExecutor(cluster.scheduler_address)

    # Configure your flow
    with Flow("{{ cookiecutter.project_slug }}") as flow:
        # If you want to clean the local staging directories pass clean
        # If you want to utilize some debugging functionality pass debug
        # If you don't utilize any of these, just pass the parameters you need.
        raw(
            clean=clean,
            debug=debug,
            **kwargs,  # Allows us to pass `--n {some integer}` or other params
        )

    # Run flow and get ending state
    state = flow.run(executor=exe)

    # Get and display any outputs you want to see on your local terminal
    log.info(raw.get_result(state, flow))
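A possible refinement of the non-debug branch above, sketched under the assumption that the same names (`LocalCluster`, `DaskExecutor`, `log`, and the configured `flow`) are in scope: wrapping the cluster in a context manager ties its lifetime to the flow run so the workers are shut down when the run finishes.

# Sketch only: same executor wiring as the template above, but the cluster is
# closed automatically once the flow run completes.
with LocalCluster() as cluster:
    log.info(f"Cluster dashboard available at: {cluster.dashboard_link}")
    exe = DaskExecutor(cluster.scheduler_address)
    state = flow.run(executor=exe)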
def mthread():
    "Multi-threaded executor using dask distributed"
    with Client(
        processes=False,
        scheduler_port=0,
        dashboard_address=":0",
        n_workers=1,
        threads_per_worker=2,
    ) as client:
        yield DaskExecutor(client.scheduler.address)
def execute(flow: Flow) -> state:
    """
    Returns:
        state: (state) state of league flow
    """
    with raise_on_exception():
        executor = DaskExecutor(address=os.getenv("WORKER_ADDRESS"))
        # Pass the executor explicitly; otherwise flow.run() falls back to the default
        # executor and the Dask cluster at WORKER_ADDRESS is never used.
        state = flow.run(executor=executor)
    return state
def mproc():
    "Multi-processing executor using dask distributed"
    with Client(
        processes=True,
        scheduler_port=0,
        dashboard_address=":0",
        n_workers=2,
        threads_per_worker=1,
    ) as client:
        yield DaskExecutor(client.scheduler.address)