def test_local_cluster_adapt(self):
    """Check that ``adapt_kwargs`` reach the cluster's ``adapt`` method.

    A ``LocalCluster`` subclass records the keyword arguments passed to
    ``adapt``; after one task has run through the executor the recorded
    arguments must equal the ones the executor was configured with.
    """
    expected_adapt = {"minimum": 1, "maximum": 1}
    seen_kwargs = None

    class RecordingCluster(distributed.LocalCluster):
        def adapt(self, **kwargs):
            # Remember what we were called with, then defer to the real method
            nonlocal seen_kwargs
            seen_kwargs = kwargs
            super().adapt(**kwargs)

    executor = DaskExecutor(
        cluster_class=RecordingCluster,
        cluster_kwargs={"processes": False, "n_workers": 0},
        adapt_kwargs=expected_adapt,
    )
    assert executor.adapt_kwargs == expected_adapt

    with executor.start():
        future = executor.submit(lambda x: x + 1, 1)
        outcome = executor.wait(future)
        assert outcome == 2

    assert seen_kwargs == expected_adapt
def test_start_local_cluster(self, caplog):
    """Start a temporary local cluster and check the messages it logs.

    The executor is expected to fall back to ``distributed.LocalCluster``
    and to add ``silence_logs=logging.CRITICAL`` to the cluster kwargs.
    The dashboard message is only required when ``bokeh`` can be imported.
    """
    executor = DaskExecutor(cluster_kwargs={"processes": False})
    expected_kwargs = {
        "processes": False,
        "silence_logs": logging.CRITICAL,
    }
    assert executor.cluster_class == distributed.LocalCluster
    assert executor.cluster_kwargs == expected_kwargs

    with executor.start():
        future = executor.submit(lambda x: x + 1, 1)
        outcome = executor.wait(future)
        assert outcome == 2

    def was_logged(fragment):
        # True when any captured record's message contains `fragment`
        return any(fragment in rec.message for rec in caplog.records)

    assert was_logged("Creating a new Dask cluster")

    try:
        import bokeh  # noqa
    except Exception:
        # If bokeh isn't installed, no dashboard will be started
        dashboard_expected = False
    else:
        dashboard_expected = True

    if dashboard_expected:
        assert was_logged("The Dask dashboard is available at")
def test_executor_disables_watch_worker_events_with_false(self):
    """With ``watch_worker_status=False`` no worker-event watcher task is set."""
    client_options = dict(n_workers=1, processes=False, set_as_default=False)
    with distributed.Client(**client_options) as client:
        scheduler_address = client.scheduler.address
        executor = DaskExecutor(
            address=scheduler_address,
            watch_worker_status=False,
        )
        with executor.start():
            # The flag is stored as given and no watcher task exists
            assert executor.watch_worker_status is False
            assert executor._watch_dask_events_task is None
def test_start_local_cluster(self):
    """Start a temporary local cluster and run a single task on it.

    The executor is expected to fall back to ``distributed.LocalCluster``
    and to add ``silence_logs=logging.CRITICAL`` to the cluster kwargs.
    """
    executor = DaskExecutor(cluster_kwargs={"processes": False})
    expected_kwargs = {
        "processes": False,
        "silence_logs": logging.CRITICAL,
    }
    assert executor.cluster_class == distributed.LocalCluster
    assert executor.cluster_kwargs == expected_kwargs

    with executor.start():
        future = executor.submit(lambda x: x + 1, 1)
        outcome = executor.wait(future)
        assert outcome == 2
def test_connect_to_running_cluster(self):
    """Point the executor at an already-running scheduler and run a task.

    When an address is supplied the executor should hold no cluster class
    or cluster kwargs, and its client kwargs should be
    ``{"set_as_default": False}``.
    """
    with distributed.Client(processes=False, set_as_default=False) as client:
        scheduler_address = client.scheduler.address
        executor = DaskExecutor(address=scheduler_address)

        assert executor.address == client.scheduler.address
        assert executor.cluster_class is None
        assert executor.cluster_kwargs is None
        assert executor.client_kwargs == {"set_as_default": False}

        with executor.start():
            future = executor.submit(lambda x: x + 1, 1)
            outcome = executor.wait(future)
            assert outcome == 2
def test_prep_dask_kwargs(self):
    """Check how task metadata is translated into Dask submit kwargs.

    The task name (and the task index, when given) must prefix the Dask
    key, and a ``dask-resource:GPU=1`` tag must become ``{"GPU": 1.0}``.
    """
    executor = DaskExecutor()

    # Without a task index the key starts with the bare task name
    plain = executor._prep_dask_kwargs(
        {"task_name": "FISH!", "task_tags": ["dask-resource:GPU=1"]}
    )
    assert plain["key"].startswith("FISH!-")
    assert plain["resources"] == {"GPU": 1.0}

    # With a task index the index is embedded right after the name
    indexed = executor._prep_dask_kwargs(
        {
            "task_name": "FISH!",
            "task_tags": ["dask-resource:GPU=1"],
            "task_index": 1,
        }
    )
    assert indexed["key"].startswith("FISH!-1-")
def test_executor_enables_watch_worker_events_with_true(self):
    """With ``watch_worker_status=True`` a worker-event watcher task is set."""
    client_options = dict(n_workers=1, processes=False, set_as_default=False)
    with distributed.Client(**client_options) as client:
        scheduler_address = client.scheduler.address
        executor = DaskExecutor(
            address=scheduler_address,
            watch_worker_status=True,
            adapt_kwargs={"maximum": 4},
        )
        with executor.start():
            # The flag is stored as given and a watcher task exists
            assert executor.watch_worker_status is True
            assert executor._watch_dask_events_task is not None
def test_performance_report(self):
    """Check what ``performance_report`` returns for missing and real files.

    With no path, or a path that cannot be read, the property must be an
    empty string rather than raise; with a readable file it must return
    the file's contents.
    """
    # should not error
    unconfigured = DaskExecutor()
    assert unconfigured.performance_report == ""

    unreadable = DaskExecutor(performance_report_path="not a readable path")
    assert unreadable.performance_report == ""

    with tempfile.TemporaryDirectory() as report_dir:
        report_path = f"{report_dir}/report.html"
        with open(report_path, "w") as fp:
            fp.write("very advanced report")

        readable = DaskExecutor(performance_report_path=report_path)
        assert readable.performance_report == "very advanced report"
def test_disable_cancellation_event(self, disabled):
    """Tasks run whether or not the cancellation event is disabled.

    While the executor is started, ``_should_run_event`` must be ``None``
    when ``disabled`` is truthy and set otherwise. ``disabled`` is
    presumably supplied by a parametrize decorator outside this view.
    """
    executor = DaskExecutor(
        cluster_kwargs={"processes": False},
        disable_cancellation_event=disabled,
    )
    with executor.start():
        # Can run futures either way
        future = executor.submit(lambda x: x + 1, 1)
        outcome = executor.wait(future)
        assert outcome == 2

        if not disabled:
            assert executor._should_run_event is not None
        else:
            assert executor._should_run_event is None
def _get_executor(custom_port_range, name="local"):
    """Build the executor for the backend called ``name``.

    A free port is looked up in ``custom_port_range`` first, for every
    backend; only the ``"lsf"`` and ``"pbs"`` configurations pass it on
    as the scheduler port.

    Args:
        custom_port_range: forwarded to ``find_available_port`` as
            ``custom_range``.
        name: one of ``"local"``, ``"lsf"`` or ``"pbs"``.

    Returns:
        A ``LocalDaskExecutor`` for ``"local"``, otherwise a
        ``DaskExecutor`` configured for the matching dask_jobqueue cluster.

    Raises:
        ValueError: if ``name`` is not a known backend.
    """
    # See https://github.com/equinor/ert/pull/2757#discussion_r794368854
    _, port, sock = find_available_port(custom_range=custom_port_range)
    sock.close()  # do this explicitly, not relying on GC

    if name == "local":
        return LocalDaskExecutor(silence_logs="debug")

    if name == "lsf":
        # Replaces the job-submission hook on LSFJob for the whole process
        LSFJob._submit_job = _eq_submit_job
        lsf_options = {
            "queue": "mr",
            "project": None,
            "cores": 1,
            "memory": "1GB",
            "use_stdin": True,
            "n_workers": 2,
            "silence_logs": "debug",
            "scheduler_options": {"port": port},
        }
        return DaskExecutor(
            cluster_class="dask_jobqueue.LSFCluster",
            cluster_kwargs=lsf_options,
            debug=True,
        )

    if name == "pbs":
        pbs_options = {
            "n_workers": 10,
            "queue": "normal",
            "project": "ERT-TEST",
            "local_directory": "$TMPDIR",
            "cores": 1,
            "memory": "32gb",
            "resource_spec": "select=1:ncpus=1:mem=32gb",
            "scheduler_options": {"port": port},
            "extra": ["--worker-port", "51820:51840"],
        }
        return DaskExecutor(
            cluster_class="dask_jobqueue.PBSCluster",
            cluster_kwargs=pbs_options,
            debug=True,
        )

    raise ValueError(f"Unknown executor name {name}")
def test_connect_to_running_cluster(self, caplog):
    """Connect to an existing scheduler and check the connection log line.

    When an address is supplied the executor should hold no cluster class
    or cluster kwargs, its client kwargs should be
    ``{"set_as_default": False}``, and a message naming the scheduler
    address must be logged.
    """
    with distributed.Client(processes=False, set_as_default=False) as client:
        address = client.scheduler.address
        executor = DaskExecutor(address=address)

        assert executor.address == address
        assert executor.cluster_class is None
        assert executor.cluster_kwargs is None
        assert executor.client_kwargs == {"set_as_default": False}

        with executor.start():
            future = executor.submit(lambda x: x + 1, 1)
            outcome = executor.wait(future)
            assert outcome == 2

    expected_line = f"Connecting to an existing Dask cluster at {address}"
    matching = (expected_line in rec.message for rec in caplog.records)
    assert any(matching)
def test_executor_logs_worker_events(self, caplog):
    """Scaling the cluster up and down must log worker add/remove events.

    The cluster is scaled to four workers and back to one, polling the
    scheduler until each target is reached, and the captured records are
    then checked for the unformatted log templates.
    """
    caplog.set_level(logging.DEBUG, logger="prefect")

    client_options = dict(n_workers=1, processes=False, set_as_default=False)
    with distributed.Client(**client_options) as client:

        def worker_count():
            # Number of workers currently known to the scheduler
            return len(client.scheduler_info()["workers"])

        executor = DaskExecutor(address=client.scheduler.address)
        with executor.start():
            client.cluster.scale(4)
            while worker_count() < 4:
                time.sleep(0.1)

            client.cluster.scale(1)
            while worker_count() > 1:
                time.sleep(0.1)

    records = caplog.records
    assert any("Worker %s added" == rec.msg for rec in records)
    assert any("Worker %s removed" == rec.msg for rec in records)
def test_cluster_class_and_kwargs(self):
    """Cluster classes may be given as a dotted import path or as a class.

    A dotted path must be resolved to the class it names, while the
    cluster and client kwargs are stored unchanged; a class object must
    be stored as-is.
    """
    pytest.importorskip("distributed.deploy.spec")

    from_path = DaskExecutor(
        cluster_class="distributed.deploy.spec.SpecCluster",
        cluster_kwargs={"some_kwarg": "some_val"},
        client_kwargs={"set_as_default": True},
    )
    assert from_path.cluster_class == distributed.deploy.spec.SpecCluster
    assert from_path.cluster_kwargs == {"some_kwarg": "some_val"}
    assert from_path.client_kwargs == {"set_as_default": True}

    class PlaceholderCluster:
        pass

    from_class = DaskExecutor(cluster_class=PlaceholderCluster)
    assert from_class.cluster_class == PlaceholderCluster
def run_flow(
        symbols: list,
        tick_type: str,
        start_date: str,
        end_date: str = None,  # None means "yesterday", resolved per call
        n_workers: int = (cpu_count(logical=False)),
        threads_per_worker: int = 8,
        processes: bool = False):
    """Run the flow returned by ``get_flow()`` on a temporary Dask cluster.

    Args:
        symbols: list of symbols passed to the flow.
        tick_type: passed to the flow unchanged.
        start_date: passed to the flow unchanged.
        end_date: passed to the flow; when omitted (``None``) it defaults
            to yesterday in the local time zone, in ISO format. The value
            is computed on every call rather than once at import time, so
            a long-running process does not keep using a stale date.
        n_workers: number of Dask workers for the cluster.
        threads_per_worker: threads per Dask worker.
        processes: forwarded to the cluster as its ``processes`` option.

    Returns:
        The state object returned by ``flow.run``.

    Raises:
        ValueError: if ``symbols`` is not a list.
    """
    # isinstance also accepts list subclasses, unlike an exact type comparison
    if not isinstance(symbols, list):
        raise ValueError('symbols expects a list type')

    if end_date is None:
        end_date = (dt.today() - timedelta(days=1)).isoformat()  # yesterday local tz

    flow = get_flow()

    executor = DaskExecutor(
        cluster_kwargs={
            'n_workers': n_workers,
            'processes': processes,
            'threads_per_worker': threads_per_worker,
        })

    flow_state = flow.run(
        executor=executor,
        symbols=symbols,
        tick_type=tick_type,
        start_date=start_date,
        end_date=end_date,
    )
    return flow_state
def test_temporary_cluster_forcefully_cancels_pending_tasks(self, tmpdir):
    """Leaving a temporary cluster must not wait for a still-running task.

    A task that sleeps for ten seconds before writing a marker file is
    submitted; exiting the executor context shortly afterwards has to
    return within five seconds of the start and leave no marker file.
    """
    marker_path = tmpdir.join("signal")

    def slow():
        time.sleep(10)
        with open(marker_path, "w") as handle:
            handle.write("Got here")

    executor = DaskExecutor()
    with executor.start():
        began = time.time()
        pending_future = executor.submit(slow)  # noqa
        time.sleep(0.1)
    ended = time.time()

    # Cluster shutdown before task could complete
    elapsed = ended - began
    assert elapsed < 5
    assert not os.path.exists(marker_path)
def _get_executor(name="local"): if name == "local": cluster_kwargs = { "silence_logs": "debug", "scheduler_options": { "port": find_open_port() }, } return LocalDaskExecutor(**cluster_kwargs) elif name == "lsf": LSFJob._submit_job = _eq_submit_job cluster_kwargs = { "queue": "mr", "project": None, "cores": 1, "memory": "1GB", "use_stdin": True, "n_workers": 2, "silence_logs": "debug", "scheduler_options": { "port": find_open_port() }, } return DaskExecutor( cluster_class="dask_jobqueue.LSFCluster", cluster_kwargs=cluster_kwargs, debug=True, ) elif name == "pbs": cluster_kwargs = { "n_workers": 10, "queue": "normal", "project": "ERT-TEST", "local_directory": "$TMPDIR", "cores": 4, "memory": "16GB", "resource_spec": "select=1:ncpus=4:mem=16GB", } return DaskExecutor( cluster_class="dask_jobqueue.PBSCluster", cluster_kwargs=cluster_kwargs, debug=True, ) else: raise ValueError(f"Unknown executor name {name}")
def mproc():
    """Yield a multi-processing executor backed by dask distributed.

    A client with two single-threaded worker processes is started on
    automatically chosen scheduler and dashboard ports, and stays open
    for as long as the generator is suspended at the ``yield``.
    """
    client_options = dict(
        processes=True,
        scheduler_port=0,
        dashboard_address=":0",
        n_workers=2,
        threads_per_worker=1,
    )
    with Client(**client_options) as client:
        scheduler_address = client.scheduler.address
        yield DaskExecutor(scheduler_address)
def mthread():
    """Yield a multi-threaded executor backed by dask distributed.

    A client with one in-process worker running two threads is started on
    automatically chosen scheduler and dashboard ports, and stays open
    for as long as the generator is suspended at the ``yield``.
    """
    client_options = dict(
        processes=False,
        scheduler_port=0,
        dashboard_address=":0",
        n_workers=1,
        threads_per_worker=2,
    )
    with Client(**client_options) as client:
        scheduler_address = client.scheduler.address
        yield DaskExecutor(scheduler_address)
def test_executor_logs_worker_events(self, caplog):
    """Worker events are watched by default and logged as workers come and go.

    With ``watch_worker_status`` left unset the executor must still create
    a watcher task. The cluster is then scaled to four workers and back
    to one, polling the scheduler until each target is reached, and the
    captured records are checked for add/remove messages.
    """
    caplog.set_level(logging.DEBUG, logger="prefect")

    client_options = dict(n_workers=1, processes=False, set_as_default=False)
    with distributed.Client(**client_options) as client:

        def worker_count():
            # Number of workers currently known to the scheduler
            return len(client.scheduler_info()["workers"])

        executor = DaskExecutor(address=client.scheduler.address)
        with executor.start():
            assert executor.watch_worker_status is None
            assert executor._watch_dask_events_task is not None

            time.sleep(0.1)

            client.cluster.scale(4)
            while worker_count() < 4:
                time.sleep(0.1)

            client.cluster.scale(1)
            while worker_count() > 1:
                time.sleep(0.1)

    records = caplog.records
    assert any(re.match("Worker .+ added", rec.msg) for rec in records)
    assert any(re.match("Worker .+ removed", rec.msg) for rec in records)
def test_exit_early_with_external_or_inproc_cluster_waits_for_pending_futures(
        self, kind, monkeypatch):
    """Exiting the executor early lets running work finish but not pending work.

    A uniquely named environment variable tracks progress: the running
    task sets it to ``"changed"`` and a dependent task would set it to
    ``"changed more"``. After leaving the executor context the value must
    be ``"changed"``, and the executor's client, futures and should-run
    event must all have been reset to ``None``.

    ``kind`` selects an externally managed client (``"external"``) or an
    in-process temporary cluster (``"inproc"``); it is presumably supplied
    by a parametrize decorator outside this view.
    """
    env_name = "TESTING_%s" % uuid.uuid4().hex
    monkeypatch.setenv(env_name, "initial")

    def slow():
        time.sleep(0.5)
        os.environ[env_name] = "changed"

    def pending(x):
        # This function shouldn't ever start, since it's pending when the
        # shutdown signal is received
        os.environ[env_name] = "changed more"

    if kind == "inproc":
        executor = DaskExecutor(cluster_kwargs={"processes": False})
        with executor.start():
            running = executor.submit(slow)
            blocked = executor.submit(pending, running)  # noqa
            time.sleep(0.2)
        assert os.environ[env_name] == "changed"
    elif kind == "external":
        with distributed.Client(processes=False, set_as_default=False) as client:
            executor = DaskExecutor(address=client.scheduler.address)
            with executor.start():
                running = executor.submit(slow)
                blocked = executor.submit(pending, running)  # noqa
                time.sleep(0.2)
            assert os.environ[env_name] == "changed"

    # Leaving the context must reset the executor's runtime state
    assert executor.client is None
    assert executor._futures is None
    assert executor._should_run_event is None
def test_mapreduce_wordcount():
    """Run the word-count flow through Dask distributed and check its output.

    The flow is executed against the scheduler at ``DASK_SCHEDULER_ADDR``
    (a Kubernetes-hosted deployment, according to the original author's
    note). The run must succeed and the ``reducer`` task's result must
    contain the expected three most frequent tokens.
    """
    url = ('https://raw.githubusercontent.com/KTH/ci-hackathon/master/'
           'installations/ci-poetry/supercollider_src/poet10/poem.txt')

    dask_executor = DaskExecutor(address=DASK_SCHEDULER_ADDR)
    state = mapreduce_wordcount.run(url=url, executor=dask_executor)

    reducer_task = mapreduce_wordcount.get_tasks('reducer')[0]
    counts = state.result[reducer_task].result

    # Get top 3 tokens
    ranked_by_count = sorted(counts, key=lambda pair: pair[1])
    top_three = ranked_by_count[-3:]
    expected_top_tokens = [('a', 4), ('and', 4), ('the', 5)]

    assert state.is_successful()
    assert top_three == expected_top_tokens
def execute(rec):
    """Compile ``rec`` to a plan and run it with the parametrized backend.

    ``request``, ``ExecutorClass`` and ``dask_cluster`` are read from the
    enclosing scope, which is outside this view.

    For the ``"dask"`` parameter a Dask client is opened for the duration
    of the run; it is closed in a ``finally`` clause so it is released
    even when plan execution raises. For ``"prefect-dask"`` the plan is
    run with a Prefect ``DaskExecutor`` pointed at the cluster's scheduler;
    every other parameter goes through ``ex.execute_plan``.

    Args:
        rec: object exposing ``to_pipelines()``.
    """
    ex = ExecutorClass()
    pipeline = rec.to_pipelines()
    plan = ex.pipelines_to_plan(pipeline)

    client = Client(dask_cluster) if request.param == "dask" else None
    try:
        if request.param == "prefect-dask":
            from prefect.executors import DaskExecutor

            prefect_executor = DaskExecutor(
                address=dask_cluster.scheduler_address)
            plan.run(executor=prefect_executor)
        else:
            ex.execute_plan(plan)
    finally:
        # Close the client on failure as well as on success
        if client is not None:
            client.close()
def run(self, flow: "Flow") -> None:
    """
    Execute ``flow`` on a temporary dask-kubernetes cluster.

    The worker pod definition is taken from ``self._worker_spec`` when it
    is set, otherwise from the ``worker_pod.yaml`` file next to this
    module. The cluster is put in adaptive mode between
    ``self.min_workers`` and ``self.max_workers``, and the flow is run
    with the default flow runner on a ``DaskExecutor`` connected to it.

    Args:
        - flow (Flow): the flow to run.

    Raises:
        - Exception: any error raised during setup or the flow run is
            logged and then re-raised.
    """
    # Call on_start callback if specified
    if self.on_start:
        self.on_start()

    try:
        from prefect.engine import get_default_flow_runner_class
        from prefect.executors import DaskExecutor
        from dask_kubernetes import KubeCluster

        if not self._worker_spec:
            # No custom spec supplied: fall back to the bundled pod template
            default_spec_path = path.join(path.dirname(__file__), "worker_pod.yaml")
            with open(default_spec_path) as pod_file:
                loaded_spec = yaml.safe_load(pod_file)
                worker_pod = self._populate_worker_pod_yaml(yaml_obj=loaded_spec)
        else:
            worker_pod = self._populate_worker_spec_yaml(yaml_obj=self._worker_spec)

        namespace = prefect.context.get("namespace")
        cluster = KubeCluster.from_dict(worker_pod, namespace=namespace)
        cluster.adapt(minimum=self.min_workers, maximum=self.max_workers)

        dask_executor = DaskExecutor(address=cluster.scheduler_address)
        flow_runner = get_default_flow_runner_class()(flow=flow)
        flow_runner.run(executor=dask_executor)
    except Exception as exc:
        self.logger.exception(
            "Unexpected error raised during flow run: {}".format(exc)
        )
        raise exc
    finally:
        # Call on_exit callback if specified
        if self.on_exit:
            self.on_exit()
def executor(self) -> Executor:
    """Build a ``DaskExecutor`` that creates an adaptive ``KubeCluster``.

    The worker pod spec is assembled from this object's image, memory,
    CPU and thread settings plus the generated environment, and the
    worker container is additionally started with
    ``--resources TASKSLOTS=1``.

    Returns:
        A ``DaskExecutor`` whose cluster factory creates a ``KubeCluster``
        from that pod spec, scaled adaptively between ``self._adapt_min``
        and ``self._adapt_max`` workers.
    """
    pod_spec = make_pod_spec(
        image=self._image,
        memory_limit=self._pod_memory_limit,
        memory_request=self._pod_memory_request,
        threads_per_worker=self._pod_threads_per_worker,
        cpu_limit=self._pod_cpu_limit,
        cpu_request=self._pod_cpu_request,
        env=self._generate_env(),
    )

    worker_container = pod_spec.spec.containers[0]
    worker_container.args.extend(["--resources", "TASKSLOTS=1"])

    adapt_limits = {
        "minimum": self._adapt_min,
        "maximum": self._adapt_max,
    }
    # The cluster is only created when the executor calls this factory
    return DaskExecutor(
        cluster_class=lambda: KubeCluster(pod_spec, deploy_mode=self._deploy_mode),
        adapt_kwargs=adapt_limits,
    )
def execute(  # type: ignore
    self, flow: "Flow", **kwargs: Any  # type: ignore
) -> None:
    """
    Execute a flow run on a dask-cloudprovider cluster.

    Steps, in order:

    1. If an ``on_execute`` callable was configured, fetch the flow run's
       parameters and let the callable adjust ``self._provider_kwargs``.
    2. If no ``image`` is set in ``self._provider_kwargs``, look up the
       flow's storage and use its Docker image.
    3. Create the Dask cluster and run the flow on it with the default
       flow runner.

    Failures in steps 1 and 2 are logged at info level and otherwise
    ignored; failures during the flow run are logged and re-raised.

    Args:
        - flow (Flow): the Flow object
        - **kwargs (Any): Unused
    """
    # Cached so the image lookup below can reuse the run info fetched here
    flow_run_info = None
    flow_run_id = prefect.context.get("flow_run_id")
    if self._on_execute:
        # If an on_execute Callable has been provided, retrieve the flow run parameters
        # and then allow the Callable a chance to update _provider_kwargs. This allows
        # better sizing of the cluster resources based on parameters for this Flow run.
        try:
            client = Client()
            flow_run_info = client.get_flow_run_info(flow_run_id)
            parameters = flow_run_info.parameters or {}  # type: ignore
            self._on_execute(parameters, self._provider_kwargs)
        except Exception as exc:
            # Best effort: a failed lookup or callback must not abort the run
            self.logger.info(
                "Failed to retrieve flow run info with error: {}".format(repr(exc))
            )
    if "image" not in self._provider_kwargs or not self._provider_kwargs.get(
        "image"
    ):
        # If image is not specified, use the Flow's image so that dependencies are
        # identical on all containers: Flow runner, Dask scheduler, and Dask workers
        flow_id = prefect.context.get("flow_id")
        try:
            client = Client()
            if not flow_id:  # We've observed cases where flow_id is None
                # Fall back to the flow id recorded on the flow run
                if not flow_run_info:
                    flow_run_info = client.get_flow_run_info(flow_run_id)
                flow_id = flow_run_info.flow_id
            flow_info = client.graphql(
                """query { flow(where: {id: {_eq: "%s"}}) { storage } }"""
                % flow_id
            )
            storage_info = flow_info["data"]["flow"][0]["storage"]
            # Image reference in the form <registry_url>/<image_name>:<image_tag>
            image = "{}/{}:{}".format(
                storage_info["registry_url"],
                storage_info["image_name"],
                storage_info["image_tag"],
            )
            self.logger.info(
                "Using Flow's Docker image for Dask scheduler & workers: {}".format(
                    image
                )
            )
            self._provider_kwargs["image"] = image
        except Exception as exc:
            # Best effort: continue without an image if the lookup fails
            self.logger.info(
                "Failed to retrieve flow info with error: {}".format(repr(exc))
            )

    self._create_dask_cluster()

    # The next line reads executor_kwargs["address"]; presumably set by
    # _create_dask_cluster above - TODO confirm
    self.logger.info(
        "Executing on dynamically created Dask Cluster with scheduler address: {}".format(
            self.executor_kwargs["address"]
        )
    )
    # Call on_start callback if specified
    if self.on_start:
        self.on_start()

    try:
        from prefect.engine import get_default_flow_runner_class
        from prefect.executors import DaskExecutor

        runner_cls = get_default_flow_runner_class()
        runner_cls(flow=flow).run(executor=DaskExecutor(**self.executor_kwargs))
    except Exception as exc:
        self.logger.exception(
            "Unexpected error raised during flow run: {}".format(exc)
        )
        raise
    finally:
        # Call on_exit callback if specified, whether or not the run failed
        if self.on_exit:
            self.on_exit()
def run_sigla_pipeline(master_spreadsheet_id: str,
                       google_api_credentials_path: str,
                       db_connection_url: str):
    """
    Run the SIGLA ETL pipeline

    The flow cleans the database, extracts and transforms the
    spreadsheets, then loads institutional data followed by composite
    data. It runs on a local Dask cluster that is shut down again once
    the flow run has finished, whether it succeeded or raised.

    Parameters
    ----------
    master_spreadsheet_id: str
        The master spreadsheet id.
    google_api_credentials_path: str
        The path to Google API credentials file needed to read Google Sheets.
    db_connection_url: str
        The DB's connection url str.

    Raises
    ------
    PrefectFlowFailure
        If the flow run ends in a failed state.
    """
    log.info("Finished pipeline set up, start running pipeline")
    log.info("=" * 80)

    # Spawn local dask cluster; the context manager closes it when the run
    # is over so scheduler and worker resources are not leaked
    with LocalCluster() as cluster:
        # Log the dashboard link
        log.info(f"Dashboard available at: {cluster.dashboard_link}")

        # Setup workflow
        with Flow("SIGLA Data Pipeline") as flow:
            # Delete all documents from db
            clean_up_task = _clean_up(db_connection_url)

            # Get spreadsheet ids
            spreadsheet_ids = _get_spreadsheet_ids(master_spreadsheet_id,
                                                   google_api_credentials_path)

            # Extract sheets data.
            # Get back list of list of SheetData
            spreadsheets_data = _extract.map(
                spreadsheet_ids,
                unmapped(google_api_credentials_path),
                upstream_tasks=[unmapped(clean_up_task)],
            )

            # Transform list of SheetData into FormattedSheetData
            formatted_spreadsheets_data = _transform.map(
                flatten(spreadsheets_data))

            # Create institution filter
            gs_institution_filter = _create_filter_task([
                gs_format.standard_institution,
                gs_format.multiple_sigla_answer_variable,
            ])

            # Filter to list of institutional formatted sheet data
            gs_institutions_data = gs_institution_filter(
                formatted_spreadsheets_data)

            # Create composite filter
            gs_composite_filter = _create_filter_task([
                gs_format.composite_variable,
                gs_format.institution_and_composite_variable,
            ])

            # Filter to list of composite formatted sheet data
            gs_composites_data = gs_composite_filter(formatted_spreadsheets_data)

            # Load institutional data
            load_institutions_data_task = _load_institutions_data.map(
                gs_institutions_data, unmapped(db_connection_url))

            # Load composite data, only after the institutional load
            load_composites_data_task = _load_composites_data.map(
                gs_composites_data,
                unmapped(db_connection_url),
                upstream_tasks=[unmapped(load_institutions_data_task)],
            )

            # Log spreadsheets that were loaded
            _log_spreadsheets(spreadsheets_data,
                              upstream_tasks=[load_composites_data_task])

        # Run the flow
        state = flow.run(executor=DaskExecutor(cluster.scheduler_address))

    if state.is_failed():
        raise PrefectFlowFailure(ErrorInfo({"flow_name": flow.name}))
def run_external_link_checker(
    google_api_credentials_path: str,
    master_spreadsheet_id: Optional[str] = None,
    spreadsheet_ids_str: Optional[str] = None,
):
    """
    Run the external link checker.

    If a list of spreadsheet ids are provided, run the external link checker
    against the list of spreadsheet ids, instead of the spreadsheet ids
    gathered from the master spreadsheet.

    The flow runs on a local Dask cluster that is shut down again once
    the flow run has finished. Links that were flagged with an error are
    written, one row per referencing sheet cell, to the tab-delimited file
    ``external_links.csv`` in the current working directory.

    Parameters
    ----------
    google_api_credentials_path: str
        The path to Google API credentials file needed to read Google Sheets.
    master_spreadsheet_id: Optional[str]
        The master spreadsheet id.
    spreadsheet_ids_str: Optional[str]
        The list spreadsheet ids, delimited by comma.

    Raises
    ------
    PrefectFlowFailure
        If the flow run ends in a failed state.
    """
    log.info("Finished external link checker set up, start checking external link.")
    log.info("=" * 80)

    # Spawn local dask cluster; the context manager closes it when the run
    # is over so scheduler and worker resources are not leaked
    with LocalCluster() as cluster:
        # Log the dashboard link
        log.info(f"Dashboard available at: {cluster.dashboard_link}")

        # Setup workflow
        with Flow("Check external links") as flow:
            # Get spreadsheet ids
            spreadsheet_ids = _get_spreadsheet_ids(
                master_spreadsheet_id, google_api_credentials_path, spreadsheet_ids_str
            )

            # Extract sheets data.
            # Get back list of list of SheetData
            spreadsheets_data = _extract.map(
                spreadsheet_ids,
                unmapped(google_api_credentials_path),
            )

            # Extract links from list of SheetData
            # Get back list of list of URLData
            links_data = _extract_external_links.map(flatten(spreadsheets_data))

            # Unique the url data
            unique_links_data = _unique_external_links(flatten(links_data))

            # Check external links
            _check_external_link.map(unique_links_data)

        # Run the flow
        state = flow.run(executor=DaskExecutor(cluster.scheduler_address))

    if state.is_failed():
        raise PrefectFlowFailure(ErrorInfo({"flow_name": flow.name}))

    # Get the list of CheckedURL
    checked_links = state.result[flow.get_tasks(name="_check_external_link")[0]].result
    log.info("=" * 80)

    # Get error links, expanded to one entry per cell that references them
    gs_cells = [
        GoogleSheetCell(
            spreadsheet_title=cell.spreadsheet_title,
            sheet_title=cell.sheet_title,
            row_index=cell.row_index,
            col_index=cell.col_index,
            url=error_link.url_data.url,
            msg=error_link.msg,
        )
        for error_link in checked_links
        if error_link.has_error
        for cell in error_link.url_data.cells
    ]

    sorted_gs_cells = sorted(
        gs_cells,
        key=lambda x: (
            x.spreadsheet_title,
            x.sheet_title,
            x.row_index,
            x.col_index,
            x.url,
        ),
    )

    # Write error links to a csv file.
    # newline="" is what the csv module requires of files it writes to;
    # without it, platforms that translate "\n" emit blank rows.
    with open("external_links.csv", mode="w", newline="") as csv_file:
        fieldnames = ["spreadsheet_title", "sheet_title", "cell", "url", "reason"]
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter="\t")
        writer.writeheader()
        for gs_cell in sorted_gs_cells:
            writer.writerow(
                {
                    "spreadsheet_title": gs_cell.spreadsheet_title,
                    "sheet_title": gs_cell.sheet_title,
                    "cell": convert_rowcol_to_A1_name(
                        gs_cell.row_index, gs_cell.col_index
                    ),
                    "url": gs_cell.url,
                    "reason": f"{gs_cell.msg}",
                }
            )
    log.info("Finished writing external links csv file")
def test_cant_specify_both_address_and_cluster_class(self):
    """Passing both an address and a cluster class must raise ``ValueError``."""
    conflicting_options = {
        "address": "localhost:8787",
        "cluster_class": distributed.LocalCluster,
    }
    with pytest.raises(ValueError):
        DaskExecutor(**conflicting_options)
def test_debug_is_converted_to_silence_logs(self, debug):
    """``debug`` selects the ``silence_logs`` level put in the cluster kwargs.

    A truthy ``debug`` must give ``logging.WARNING``, anything else
    ``logging.CRITICAL``. ``debug`` is presumably supplied by a
    parametrize decorator outside this view.
    """
    if debug:
        expected_level = logging.WARNING
    else:
        expected_level = logging.CRITICAL

    executor = DaskExecutor(debug=debug)
    actual_level = executor.cluster_kwargs["silence_logs"]
    assert actual_level == expected_level
if self._worker_spec: worker_pod = self._worker_spec worker_pod = self._populate_worker_spec_yaml(yaml_obj=worker_pod) else: with open( path.join(path.dirname(__file__), "worker_pod.yaml") ) as pod_file: worker_pod = yaml.safe_load(pod_file) worker_pod = self._populate_worker_pod_yaml(yaml_obj=worker_pod) cluster = KubeCluster.from_dict( worker_pod, namespace=prefect.context.get("namespace") ) cluster.adapt(minimum=self.min_workers, maximum=self.max_workers) executor = DaskExecutor(address=cluster.scheduler_address) runner_cls = get_default_flow_runner_class() runner_cls(flow=flow).run(executor=executor) except Exception as exc: self.logger.exception( "Unexpected error raised during flow run: {}".format(exc) ) raise exc finally: # Call on_exit callback if specified if self.on_exit: self.on_exit() def _extra_loggers(self) -> str: """ Set dask-kubernetes related loggers for debugging and providing more