def test_simple_map(monkeypatch):
    flow_run_id = str(uuid.uuid4())
    task_run_id_1 = str(uuid.uuid4())

    with prefect.Flow(name="test", result_handler=JSONResultHandler()) as flow:
        t1 = plus_one.map([0, 1, 2])

    client = MockedCloudClient(
        flow_runs=[FlowRun(id=flow_run_id)],
        task_runs=[
            TaskRun(id=task_run_id_1, task_slug=flow.slugs[t1], flow_run_id=flow_run_id)
        ]
        + [
            TaskRun(id=str(uuid.uuid4()), task_slug=flow.slugs[t], flow_run_id=flow_run_id)
            for t in flow.tasks
            if t is not t1
        ],
        monkeypatch=monkeypatch,
    )

    with prefect.context(flow_run_id=flow_run_id):
        state = CloudFlowRunner(flow=flow).run(
            return_tasks=flow.tasks, executor=LocalExecutor()
        )

    assert state.is_successful()
    assert client.flow_runs[flow_run_id].state.is_successful()
    assert client.task_runs[task_run_id_1].state.is_mapped()

    # there should be a total of 4 task runs corresponding to the mapped task
    assert (
        len([tr for tr in client.task_runs.values() if tr.task_slug == flow.slugs[t1]])
        == 4
    )
def run(
    self,
    clean: bool = False,
    debug: bool = False,
    **kwargs,
):
    """
    Run a flow with your steps.

    Parameters
    ----------
    clean: bool
        Should the local staging directory be cleaned prior to this run.
        Default: False (Do not clean)
    debug: bool
        A debug flag for the developer to use to manipulate how much data runs,
        how it is processed, etc.
        Default: False (Do not debug)

    Notes
    -----
    Documentation on prefect: https://docs.prefect.io/core/
    Basic prefect example: https://docs.prefect.io/core/
    """
    # Initialize steps
    raw = steps.Raw()

    # Choose executor
    if debug:
        exe = LocalExecutor()
    else:
        # Set up connection to computation cluster
        cluster = LocalCluster()

        # Inform of Dask UI
        log.info(f"Cluster dashboard available at: {cluster.dashboard_link}")

        # Create dask executor
        exe = DaskExecutor(cluster.scheduler_address)

    # Configure your flow
    with Flow("{{ cookiecutter.project_slug }}") as flow:
        # If you want to clean the local staging directories pass clean
        # If you want to utilize some debugging functionality pass debug
        # If you don't utilize any of these, just pass the parameters you need.
        raw(
            clean=clean,
            debug=debug,
            **kwargs,  # Allows us to pass `--n {some integer}` or other params
        )

    # Run flow and get ending state
    state = flow.run(executor=exe)

    # Get and display any outputs you want to see on your local terminal
    log.info(raw.get_result(state, flow))
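# Hedged invocation sketch for the `run` method above. `Pipeline` is a
# hypothetical stand-in for whatever class this cookiecutter template
# generates; the flags map directly to the documented parameters.
pipeline = Pipeline()  # hypothetical class that owns `run`
pipeline.run(clean=True, debug=True)  # clean staging, then run with LocalExecutor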
def test_running_state_finishes(self):
    flow = Flow(name="test", tasks=[Task()])
    new_state = FlowRunner(flow=flow).get_flow_run_state(
        state=Running(),
        task_states={},
        task_contexts={},
        return_tasks=set(),
        task_runner_state_handlers=[],
        executor=LocalExecutor(),
    )
    assert new_state.is_successful()
def test_determine_final_state_preserves_running_states_when_tasks_still_running(
    self,
):
    task = Task()
    flow = Flow(name="test", tasks=[task])
    old_state = Running()
    new_state = FlowRunner(flow=flow).get_flow_run_state(
        state=old_state,
        task_states={task: Retrying(start_time=pendulum.now("utc").add(days=1))},
        task_contexts={},
        return_tasks=set(),
        task_runner_state_handlers=[],
        executor=LocalExecutor(),
    )
    assert new_state is old_state
def test_determine_final_state_has_final_say(self):
    class MyFlowRunner(FlowRunner):
        def determine_final_state(self, *args, **kwargs):
            return Failed("Very specific error message")

    flow = Flow(name="test", tasks=[Task()])
    new_state = MyFlowRunner(flow=flow).get_flow_run_state(
        state=Running(),
        task_states={},
        task_contexts={},
        return_tasks=set(),
        task_runner_state_handlers=[],
        executor=LocalExecutor(),
    )
    assert new_state.is_failed()
    assert new_state.message == "Very specific error message"
def prepare_executor(executor_type, executor_address=None):
    """Instantiate a prefect executor"""
    if executor_type == "dask":
        if executor_address is not None:
            executor = DaskExecutor(executor_address)
        else:
            executor = DaskExecutor(local_processes=True)
    elif executor_type == "synchronous":
        executor = SynchronousExecutor()
    elif executor_type == "local":
        executor = LocalExecutor()
    else:
        # Should not happen if click parameters are done correctly, but
        # kept for completeness
        raise ValueError(f'Unknown executor type "{executor_type}".')
    return executor
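# A minimal usage sketch for prepare_executor. The Dask scheduler address is a
# hypothetical placeholder; "local" exercises the LocalExecutor branch.
from prefect import Flow, task


@task
def say_hello():
    return "hello"


with Flow("executor-demo") as demo_flow:
    say_hello()

# Local, in-process execution
demo_flow.run(executor=prepare_executor("local"))

# Against an already-running Dask scheduler (address is illustrative):
# demo_flow.run(executor=prepare_executor("dask", "tcp://127.0.0.1:8786"))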
def test_can_queue_successfully_and_run(monkeypatch):
    @prefect.task
    def return_one():
        return 1

    with prefect.Flow("test-queues-work!") as flow:
        t1 = return_one()

    flow_run_id = str(uuid.uuid4())
    task_run_id_1 = str(uuid.uuid4())
    client = QueueingMockCloudClient(
        flow_runs=[FlowRun(id=flow_run_id)],
        task_runs=[
            TaskRun(
                id=task_run_id_1, task_slug=flow.slugs[t1], flow_run_id=flow_run_id
            ),
        ]
        + [
            TaskRun(
                id=str(uuid.uuid4()), task_slug=flow.slugs[t], flow_run_id=flow_run_id
            )
            for t in flow.tasks
            if t not in [t1]
        ],
        monkeypatch=monkeypatch,
        num_times_in_queue=6,
    )

    with prefect.context(flow_run_id=flow_run_id):
        run_state = CloudFlowRunner(flow=flow).run(
            executor=LocalExecutor(), return_tasks=flow.tasks
        )

    assert run_state.is_successful()
    # Pending -> Running -> Queued (num_times_in_queue times) -> Success.
    # Two of the `set_flow_run_state` calls come from Pending -> Running and
    # Running -> Success; the rest come from Running -> Queued or
    # Queued -> Queued transitions.
    assert client.call_count["set_flow_run_state"] == 2 + client.num_times_in_queue
def test_prefect_executors(train_data, grid_search, parallel_columns):
    try:
        from prefect.engine.executors import DaskExecutor
        from prefect.engine.executors import LocalDaskExecutor
        from prefect.engine.executors import LocalExecutor
        from dask.distributed import Client
    except ImportError:
        print("`prefect` not installed, skipping the test...")
    else:
        client = Client()

        executors = {
            "dask_already_running": DaskExecutor(address=client.scheduler.address),
            "local": LocalExecutor(),
            "local_dask": LocalDaskExecutor(),
            # this spins up a local Dask cluster when the flow runs,
            # just to check the interface
            "dask_create_on_call": DaskExecutor(),
        }

        for executor_name, executor in executors.items():
            flow, state = run_model_selection(
                df=train_data,
                grid_search=grid_search,
                target_col_name="Quantity",
                frequency="D",
                partition_columns=["Product"],
                parallel_over_columns=parallel_columns,
                include_rules=None,
                exclude_rules=None,
                country_code_column="Holidays_code",
                output_path="",
                persist_cv_results=False,
                persist_cv_data=False,
                persist_model_reprs=False,
                persist_best_model=False,
                persist_partition=False,
                persist_model_selector_results=False,
                visualize_success=False,
                executor=executor,
            )
            assert state.is_successful()

            results = select_model_general(
                df=train_data,
                grid_search=grid_search,
                target_col_name="Quantity",
                frequency="D",
                partition_columns=["Product"],
                parallel_over_columns=parallel_columns,
                executor=executor,
                include_rules=None,
                exclude_rules=None,
                country_code_column="Holidays_code",
                output_path="",
                persist_cv_results=False,
                persist_cv_data=False,
                persist_model_reprs=False,
                persist_best_model=False,
                persist_partition=False,
                persist_model_selector_results=False,
            )

            assert len(results) == len(
                train_data[parallel_columns + ["Product"]].drop_duplicates()
            )
            assert isinstance(results[0], ModelSelectorResult)

            if executor_name == "dask_already_running":
                client.shutdown()

        if client.status != "closed":
            client.shutdown()
def run(
    self,
    dataset: str,
    include_raw: bool = False,
    batch_size: Optional[int] = None,
    distributed: bool = False,
    n_workers: int = 10,
    worker_cpu: int = 8,
    worker_mem: str = "120GB",
    overwrite: bool = False,
    debug: bool = False,
    **kwargs,
):
    """
    Run a flow with your steps.

    Parameters
    ----------
    dataset: str
        The dataset to use for the pipeline.
    include_raw: bool
        A boolean option to determine if the raw data should be included in the
        Quilt package.
        Default: False (Do not include the raw data)
    batch_size: Optional[int]
        An optional batch size to provide to each step for processing their items.
        Default: None (auto batch size depending on CPU / threads available)
    distributed: bool
        A boolean option to determine if the jobs should be distributed to a SLURM
        cluster when possible.
        Default: False (Do not distribute)
    n_workers: int
        Number of workers to request (when distributed is enabled).
        Default: 10
    worker_cpu: int
        Number of cores to provide per worker (when distributed is enabled).
        Default: 8
    worker_mem: str
        Amount of memory to provide per worker (when distributed is enabled).
        Default: 120GB
    overwrite: bool
        If this pipeline has already partially or completely run, should it
        overwrite the previous files or not.
        Default: False (Do not overwrite or regenerate files)
    debug: bool
        A debug flag for the developer to use to manipulate how much data runs,
        how it is processed, etc. Additionally, if debug is True, any mapped
        operation will run on threads instead of processes.
        Default: False (Do not debug)
    """
    # Initialize steps
    raw = steps.Raw()
    standardize_fov_array = steps.StandardizeFOVArray()
    single_cell_features = steps.SingleCellFeatures()
    single_cell_images = steps.SingleCellImages()
    diagnostic_sheets = steps.DiagnosticSheets()

    # Cluster / distributed defaults
    distributed_executor_address = None

    # Choose executor
    if debug:
        exe = LocalExecutor()
        log.info("Debug flagged. Will use threads instead of Dask.")
    else:
        if distributed:
            # Create or get log dir
            # Do not include ms
            log_dir_name = datetime.now().isoformat().split(".")[0]
            log_dir = Path(f".dask_logs/{log_dir_name}").expanduser()
            # Log dir settings
            log_dir.mkdir(parents=True, exist_ok=True)

            # Create cluster
            log.info("Creating SLURMCluster")
            cluster = SLURMCluster(
                cores=worker_cpu,
                memory=worker_mem,
                queue="aics_cpu_general",
                walltime="9-23:00:00",
                local_directory=str(log_dir),
                log_directory=str(log_dir),
            )

            # Spawn workers
            cluster.scale(jobs=n_workers)
            log.info("Created SLURMCluster")

            # Use the port from the created connector to set executor address
            distributed_executor_address = cluster.scheduler_address

            # Only auto batch size if it is not None
            if batch_size is None:
                # Batch size is n_workers * worker_cpu * 0.75
                # We could just do n_workers * worker_cpu but 3/4 of that is safer
                batch_size = int(n_workers * worker_cpu * 0.75)

            # Log dashboard URI
            log.info(f"Dask dashboard available at: {cluster.dashboard_link}")
        else:
            # Create local cluster
            log.info("Creating LocalCluster")
            cluster = LocalCluster()
            log.info("Created LocalCluster")

            # Set distributed_executor_address
            distributed_executor_address = cluster.scheduler_address

            # Log dashboard URI
            log.info(f"Dask dashboard available at: {cluster.dashboard_link}")

        # Use dask cluster
        exe = DaskExecutor(distributed_executor_address)

    # Configure your flow
    with Flow("actk") as flow:
        if include_raw:
            dataset = raw(dataset, **kwargs)

        standardized_fov_paths_dataset = standardize_fov_array(
            dataset=dataset,
            distributed_executor_address=distributed_executor_address,
            batch_size=batch_size,
            overwrite=overwrite,
            debug=debug,
            # Allows us to pass `--desired_pixel_sizes [{float},{float},{float}]`
            **kwargs,
        )

        single_cell_features_dataset = single_cell_features(
            dataset=standardized_fov_paths_dataset,
            distributed_executor_address=distributed_executor_address,
            batch_size=batch_size,
            overwrite=overwrite,
            debug=debug,
            # Allows us to pass `--cell_ceiling_adjustment {int}`
            **kwargs,
        )

        single_cell_images_dataset = single_cell_images(
            dataset=single_cell_features_dataset,
            distributed_executor_address=distributed_executor_address,
            batch_size=batch_size,
            overwrite=overwrite,
            debug=debug,
            # Allows us to pass `--cell_ceiling_adjustment {int}`
            **kwargs,
        )

        diagnostic_sheets(
            dataset=single_cell_images_dataset,
            distributed_executor_address=distributed_executor_address,
            overwrite=overwrite,
            # Allows us to pass `--metadata {str}`,
            # `--feature {str}'`
            **kwargs,
        )

    # Run flow and get ending state, log duration
    start = datetime.now()
    state = flow.run(executor=exe)
    duration = datetime.now() - start
    log.info(
        f"Total duration of pipeline: "
        f"{duration.seconds // 60 // 60}:"
        f"{duration.seconds // 60 % 60}:"  # minutes within the hour
        f"{duration.seconds % 60}"
    )

    # Get and display any outputs you want to see on your local terminal
    log.info(single_cell_images_dataset.get_result(state, flow))
def test_deep_map_with_a_retry(monkeypatch):
    """
    Creates a situation in which a deeply-mapped Flow encounters a one-time
    error in one of the middle layers. Running the flow a second time should
    resolve the error.

    DOES NOT WORK WITH DASK EXECUTORS because of the need for shared state on
    second run
    """
    flow_run_id = str(uuid.uuid4())
    task_run_id_1 = str(uuid.uuid4())
    task_run_id_2 = str(uuid.uuid4())
    task_run_id_3 = str(uuid.uuid4())

    with prefect.Flow(name="test", result_handler=JSONResultHandler()) as flow:
        t1 = plus_one.map([-1, 0, 1])
        t2 = invert_fail_once.map(t1)
        t3 = plus_one.map(t2)

    t2.max_retries = 1
    t2.retry_delay = datetime.timedelta(seconds=100)

    monkeypatch.setattr("requests.Session", MagicMock())
    monkeypatch.setattr("requests.post", MagicMock())

    client = MockedCloudClient(
        flow_runs=[FlowRun(id=flow_run_id)],
        task_runs=[
            TaskRun(id=task_run_id_1, task_slug=t1.slug, flow_run_id=flow_run_id),
            TaskRun(id=task_run_id_2, task_slug=t2.slug, flow_run_id=flow_run_id),
            TaskRun(id=task_run_id_3, task_slug=t3.slug, flow_run_id=flow_run_id),
        ]
        + [
            TaskRun(id=str(uuid.uuid4()), task_slug=t.slug, flow_run_id=flow_run_id)
            for t in flow.tasks
            if t not in [t1, t2, t3]
        ],
        monkeypatch=monkeypatch,
    )

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    assert client.flow_runs[flow_run_id].state.is_running()
    assert client.task_runs[task_run_id_1].state.is_mapped()
    assert client.task_runs[task_run_id_2].state.is_mapped()
    assert client.task_runs[task_run_id_3].state.is_mapped()

    # there should be a total of 4 task runs corresponding to each mapped task
    for t in [t1, t2, t3]:
        assert (
            len([tr for tr in client.task_runs.values() if tr.task_slug == t.slug])
            == 4
        )

    # t2's first child task should be retrying
    t2_0 = next(
        tr
        for tr in client.task_runs.values()
        if tr.task_slug == t2.slug and tr.map_index == 0
    )
    assert isinstance(t2_0.state, Retrying)

    # t3's first child task should be pending
    t3_0 = next(
        tr
        for tr in client.task_runs.values()
        if tr.task_slug == t3.slug and tr.map_index == 0
    )
    assert t3_0.state.is_pending()

    # RUN A SECOND TIME with an artificially updated start time
    failed_id = [
        t_id
        for t_id, tr in client.task_runs.items()
        if tr.task_slug == t2.slug and tr.map_index == 0
    ].pop()
    client.task_runs[failed_id].state.start_time = pendulum.now("UTC")

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    # t2's first child task should be successful
    t2_0 = next(
        tr
        for tr in client.task_runs.values()
        if tr.task_slug == t2.slug and tr.map_index == 0
    )
    assert t2_0.state.is_successful()

    # t3's first child task should be successful
    t3_0 = next(
        tr
        for tr in client.task_runs.values()
        if tr.task_slug == t3.slug and tr.map_index == 0
    )
    assert t3_0.state.is_successful()
def test_non_keyed_states_are_hydrated_correctly_with_retries(monkeypatch, tmpdir):
    """
    Ensures that retries longer than 10 minutes properly "hydrate" upstream states
    so that mapped tasks retry correctly - for mapped tasks, even non-data
    dependencies can affect the number of children spawned.
    """

    @prefect.task
    def return_list():
        return [1, 2, 3]

    @prefect.task(max_retries=1, retry_delay=datetime.timedelta(minutes=20))
    def fail_once():
        if prefect.context.get("task_run_count", 0) < 2:
            raise SyntaxError("bad")
        else:
            return 100

    flow_run_id = str(uuid.uuid4())
    task_run_id_1 = str(uuid.uuid4())
    task_run_id_2 = str(uuid.uuid4())

    with prefect.Flow(name="test-retries", result=LocalResult(dir=tmpdir)) as flow:
        t1 = fail_once.map(upstream_tasks=[return_list])

    monkeypatch.setattr("requests.Session", MagicMock())
    monkeypatch.setattr("requests.post", MagicMock())

    client = MockedCloudClient(
        flow_runs=[FlowRun(id=flow_run_id)],
        task_runs=[
            TaskRun(
                id=task_run_id_1, task_slug=flow.slugs[t1], flow_run_id=flow_run_id
            ),
            TaskRun(
                id=task_run_id_2,
                task_slug=flow.slugs[return_list],
                flow_run_id=flow_run_id,
            ),
        ]
        + [
            TaskRun(
                id=str(uuid.uuid4()), task_slug=flow.slugs[t], flow_run_id=flow_run_id
            )
            for t in flow.tasks
            if t not in [t1, return_list]
        ],
        monkeypatch=monkeypatch,
    )

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    assert client.flow_runs[flow_run_id].state.is_running()
    assert client.task_runs[task_run_id_1].state.is_mapped()
    assert client.task_runs[task_run_id_2].state.is_successful()

    # there should be a total of 4 task runs corresponding to the mapped task
    assert (
        len([tr for tr in client.task_runs.values() if tr.task_slug == flow.slugs[t1]])
        == 4
    )

    # all of t1's child tasks should be retrying
    assert all(
        isinstance(tr.state, Retrying)
        for tr in client.task_runs.values()
        if tr.task_slug == flow.slugs[t1] and tr.map_index != -1
    )

    # RUN A SECOND TIME with an artificially updated start time
    # and remove all in-memory data
    for idx, tr in client.task_runs.items():
        if tr.task_slug == flow.slugs[t1] and tr.map_index != -1:
            tr.state.start_time = pendulum.now("UTC")

    for idx, tr in client.task_runs.items():
        tr.state._result.value = None

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    assert (
        len([tr for tr in client.task_runs.values() if tr.task_slug == flow.slugs[t1]])
        == 4
    )
    assert all(tr.state.is_successful() for tr in client.task_runs.values())
def test_is_pickleable_after_start(self):
    e = LocalExecutor()
    with e.start():
        post = cloudpickle.loads(cloudpickle.dumps(e))
        assert isinstance(post, LocalExecutor)
def main():
    p = argparse.ArgumentParser(prog="process", description="Process the FOV pipeline")
    p.add_argument(
        "-s",
        "--save_dir",
        action="store",
        default=Path("./results/"),
        help="Save directory for results",
    )
    p.add_argument(
        "--dataset",
        type=str,
        default="quilt",
        help='Which dataset to use; currently can be "quilt" or "labkey"',
    )
    p.add_argument(
        "--n_fovs",
        type=int,
        default=100,
        help="Number of FOVs per cell line to use.",
    )
    p.add_argument(
        "--overwrite",
        type=utils.str2bool,
        default=False,
        help="Overwrite saved results",
    )
    p.add_argument(
        "--use_current_results",
        type=utils.str2bool,
        default=False,
        help=(
            "Don't do any processing, just make figures. "
            "Set this to True so you don't overwrite your stuff."
        ),
    )

    # distributed stuff
    p.add_argument(
        "--distributed",
        type=utils.str2bool,
        default=False,
        help="Use Prefect/Dask to do distributed compute.",
    )
    p.add_argument(
        "--port",
        type=int,
        default=99999,
        help="Port over which to communicate with the Dask scheduler.",
    )

    args = p.parse_args()
    args = vars(args)

    distributed = args.pop("distributed")
    port = args.pop("port")

    # For distributed instructions see:
    # https://github.com/AllenCellModeling/fov_processing_pipeline/blob/master/docs/distributed_instructions.md
    if distributed:
        from prefect.engine.executors import DaskExecutor

        executor = DaskExecutor(address=f"tcp://localhost:{port}")
    else:
        executor = LocalExecutor()

    args["executor"] = executor

    process(**args)
def run(
    self,
    clean: bool = False,
    debug: bool = False,
    **kwargs,
):
    """
    Run a flow with your steps.

    Parameters
    ----------
    clean: bool
        Should the local staging directory be cleaned prior to this run.
        Default: False (Do not clean)
    debug: bool
        A debug flag for the developer to use to manipulate how much data runs,
        how it is processed, etc.
        Default: False (Do not debug)

    Notes
    -----
    Documentation on prefect: https://docs.prefect.io/core/
    Basic prefect example: https://docs.prefect.io/core/
    """
    # Initialize steps
    select_data = steps.SelectData()
    compute_cell_metrics = steps.ComputeCellMetrics()
    gather_test_visualize = steps.GatherTestVisualize()

    # Choose executor
    if debug:
        exe = LocalExecutor()
        # No cluster when debugging; initialize the values the flow reads
        # below so the debug path does not raise a NameError
        distributed_executor_address = None
        batch_size = None
    else:
        # Create local cluster
        log.info("Creating LocalCluster")
        current_mem_gb = psutil.virtual_memory().available / 2 ** 30
        n_workers = int(current_mem_gb // 4)
        cluster = LocalCluster(n_workers=n_workers)
        log.info("Created LocalCluster")

        # Set distributed_executor_address
        distributed_executor_address = cluster.scheduler_address

        # Batch size on local cluster
        batch_size = int(psutil.cpu_count() // n_workers)

        # Log dashboard URI
        log.info(f"Dask dashboard available at: {cluster.dashboard_link}")

        # Use dask cluster
        exe = DaskExecutor(distributed_executor_address)

    # Configure your flow
    with Flow("polar_express") as flow:
        # If you want to clean the local staging directories pass clean
        # If you want to utilize some debugging functionality pass debug
        # If you don't utilize any of these, just pass the parameters you need.

        # step 1: select cells and store in annotation file
        selected_cells_manifest = select_data(
            clean=clean,
            debug=debug,
            distributed_executor_address=distributed_executor_address,
            batch_size=batch_size,
            **kwargs,  # Allows us to pass `--n {some integer}` or other params
        )

        # step 2: compute metrics for each of the cells
        cell_metrics_manifest = compute_cell_metrics(
            selected_cells_manifest,
            clean=clean,
            debug=debug,
            distributed_executor_address=distributed_executor_address,
            batch_size=batch_size,
            **kwargs,
        )

        # step 3: gather the computed metrics and create visualizations
        gather_test_visualize(
            cell_metrics_manifest,
            clean=clean,
            debug=debug,
            **kwargs,
        )

    # Run flow and get ending state
    state = flow.run(executor=exe)

    # Get and display any outputs you want to see on your local terminal
    log.info(select_data.get_result(state, flow))
    log.info(compute_cell_metrics.get_result(state, flow))
    log.info(gather_test_visualize.get_result(state, flow))
    load = postgres.load_datafile.map(datafile=downloads)

    # commit new data to database and clean up
    complete = postgres.complete_load()

    # make sure prep runs before load
    flow.add_edge(upstream_task=prep, downstream_task=load)
    # make sure load runs before complete
    flow.add_edge(upstream_task=load, downstream_task=complete)


if __name__ == "__main__":
    logger = prefect.context.get("logger")

    dask = prefect.config.dask
    mode = prefect.config.mode
    reset_db = prefect.config.reset_db
    all_datasets = dict(prefect.config.socrata.datasets)
    years = list(prefect.config.data.years)

    # use only year datasets if in full mode, otherwise use all w/ since
    if mode == "full":
        run_datasets = dict((k, all_datasets[k]) for k in years)
    else:
        run_datasets = all_datasets

    logger.info(
        f"Starting \"{mode}\" flow for {', '.join(run_datasets.keys())}"
        f" {'and resetting db' if reset_db else ''}"
    )

    state = flow.run(
        datasets=list(run_datasets.values()),
        executor=LocalDaskExecutor() if dask else LocalExecutor(),
    )
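# Hedged sketch: the config keys read above (prefect.config.dask, .mode,
# .reset_db, ...) are this project's own user-config values. In Prefect 0.x
# such values can live in ~/.prefect/config.toml or be supplied as
# PREFECT__-prefixed environment variables; shown illustratively only.
import os

os.environ["PREFECT__MODE"] = "full"   # hypothetical custom key: year datasets only
os.environ["PREFECT__DASK"] = "false"  # hypothetical custom key: LocalExecutor

import prefect  # user config is resolved when prefect is imported

print(prefect.config.mode, prefect.config.dask)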
def run(
    self,
    distributed: bool = False,
    overwrite: bool = False,
    debug: bool = False,
    **kwargs,
):
    """
    Run a flow with your steps.

    Parameters
    ----------
    distributed: bool
        A boolean option to determine if the jobs should be distributed to a SLURM
        cluster when possible.
        Default: False (Do not distribute)
    overwrite: bool
        If this pipeline has already partially or completely run, should it
        overwrite the previous files or not.
        Default: False (Do not overwrite or regenerate files)
    debug: bool
        A debug flag for the developer to use to manipulate how much data runs,
        how it is processed, etc. Additionally, if debug is True, any mapped
        operation will run on threads instead of processes.
        Default: False (Do not debug)

    Notes
    -----
    Documentation on prefect: https://docs.prefect.io/core/
    Basic prefect example: https://docs.prefect.io/core/
    """
    # Initialize steps
    validate_dataset = steps.ValidateDataset()
    prep_analysis_sc = steps.PrepAnalysisSingleCellDs()
    # run_mito_class = steps.MitoClass()
    # merge_data_for_cfe = steps.MergeDataset()

    # Choose executor
    if debug:
        exe = LocalExecutor()
        distributed_executor_address = None
        log.info("Debug flagged. Will use threads instead of Dask.")
    else:
        if distributed:
            # Create or get log dir
            # Do not include ms
            log_dir_name = datetime.now().isoformat().split(".")[0]
            log_dir = Path(f".dask_logs/{log_dir_name}").expanduser()
            # Log dir settings
            log_dir.mkdir(parents=True, exist_ok=True)

            # Create cluster
            log.info("Creating SLURMCluster")
            cluster = SLURMCluster(
                cores=1,
                memory="15GB",
                queue="aics_gpu_general",
                walltime="10:00:00",
                local_directory=str(log_dir),
                log_directory=str(log_dir),
            )

            # Spawn workers
            cluster.scale(180)
            log.info("Created SLURMCluster")

            # Use the port from the created connector to set executor address
            distributed_executor_address = cluster.scheduler_address

            # Log dashboard URI
            log.info(f"Dask dashboard available at: {cluster.dashboard_link}")
        else:
            # Create local cluster
            log.info("Creating LocalCluster")
            cluster = LocalCluster()
            log.info("Created LocalCluster")

            # Set distributed_executor_address
            distributed_executor_address = cluster.scheduler_address

            # Log dashboard URI
            log.info(f"Dask dashboard available at: {cluster.dashboard_link}")

        # Use dask cluster
        exe = DaskExecutor(distributed_executor_address)

    # Configure your flow
    with Flow("cvapipe") as flow:
        # Allows us to pass `--raw_dataset {some path}`
        validated_data_path = validate_dataset(**kwargs)

        prep_analysis_sc(
            dataset=validated_data_path,
            distributed_executor_address=distributed_executor_address,
            **kwargs,
        )

        # mitotic classifier was implemented with plt.
        # PLT has its own distributed handler, which is not quite
        # compatible with prefect + dask
        """
        cell_data_with_annotation = run_mito_class(
            dataset=single_cell_data_path,
            **kwargs,
        )

        cell_data_cfe = merge_data_for_cfe(
            dataset_with_annotation=cell_data_with_annotation,
            dataset_from_labkey=validated_data_path,
            **kwargs,
        )
        """

    # Run flow and get ending state
    state = flow.run(executor=exe)

    # Get and display any outputs you want to see on your local terminal
    log.info(validate_dataset.get_result(state, flow))
def process(
    save_dir: Path,
    overwrite: bool,
    use_current_results: bool,
    n_fovs: int = 100,
    dataset: str = "quilt",
    executor=LocalExecutor(),
):
    """
    Dask/Prefect distributed command for running pipeline
    """
    save_dir = str(save_dir.resolve())

    log.info("Saving in {}".format(save_dir))

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # This is the main function
    with Flow("FOV_processing_pipeline") as flow:
        # for every FOV, do the processing steps

        ###########
        # load data
        ###########
        data = wrappers.save_load_data(
            save_dir, n_fovs=n_fovs, overwrite=overwrite, dataset=dataset
        )

        # we have to unpack this way because of Prefect-reasons
        cell_data = data[0]
        fov_data = data[1]

        ###########
        # get all of the save paths
        ###########
        paths = wrappers.get_save_paths(save_dir, fov_data)

        # we have to unpack this way because of Prefect-reasons
        summary_path = paths[0]
        stats_paths = paths[1]
        proj_paths = paths[2]

        ###########
        # Summary Table
        ###########
        wrappers.cell_data_to_summary_table(cell_data, summary_path)

        ###########
        # The per-fov map step
        ###########
        fov_rows = wrappers.get_data_rows(fov_data)

        if not use_current_results:
            process_fov_row_map = wrappers.process_fov_row.map(
                fov_row=fov_rows,
                stats_path=stats_paths,
                proj_path=proj_paths,
                overwrite=unmapped(overwrite),
            )
            upstream_tasks = [process_fov_row_map]
        else:
            upstream_tasks = None

        ###########
        # Load relevant data as a reduce step
        ###########
        df_stats = wrappers.load_stats(
            fov_data, stats_paths, upstream_tasks=upstream_tasks
        )

        ###########
        # QC data based on previous thresholds, etc
        ###########
        df_stats_qc = wrappers.qc_stats(df_stats, save_dir)

        if not use_current_results:
            ###########
            # Make Plots
            ###########
            wrappers.stats2plots(
                df_stats_qc, parent_dir=save_dir, upstream_tasks=[df_stats_qc]
            )

            ###########
            # Make diagnostic images
            ###########
            wrappers.im2diagnostics(
                fov_data, proj_paths, parent_dir=save_dir, upstream_tasks=[df_stats]
            )

        ###########
        # Do data splits for the data that survived QC
        ###########
        splits_dict = wrappers.data_splits(
            df_stats_qc, parent_dir=save_dir, upstream_tasks=[df_stats_qc]
        )

    state = flow.run(executor=executor)

    fov_data = state.result[flow.get_tasks(name="save_load_data")[0]].result[1]
    df_stats = state.result[flow.get_tasks(name="load_stats")[0]].result
    splits_dict = state.result[flow.get_tasks(name="data_splits")[0]].result

    log.info("Done!")

    return fov_data, df_stats, splits_dict
    # Scrape the website
    tsx_imb_df = get_tsx_moc_imb(tsx_url)

    # Get the connection string from prefect cloud
    conn_str = PrefectSecret("moc_pgdb_conn")

    # Partition the df into n_conn chunks
    tsx_imb_df_lst = partition_df(tsx_imb_df, n_conn)

    df_shape = df_to_db.map(
        tsx_imb_df_lst, tbl_name=unmapped(imb_tbl_nm), conn_str=unmapped(conn_str)
    )


if __name__ == "__main__":
    # Inputs
    tsx_url = "https://api.tmxmoney.com/mocimbalance/en/TSX/moc.html"
    backup_url = "https://web.archive.org/web/20200414202757/https://api.tmxmoney.com/mocimbalance/en/TSX/moc.html"

    # Script
    from prefect.engine.executors import LocalExecutor

    tsx_imb_fl.visualize()
    fl_state = tsx_imb_fl.run(
        parameters=dict(tsx_url=backup_url, n_conn=4),
        executor=LocalExecutor(),
    )
    tsx_imb_fl.visualize(flow_state=fl_state)
def local():
    "Local, immediate execution executor"
    yield LocalExecutor()
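# Usage sketch, assuming `local` above is registered as a pytest fixture
# (e.g. decorated with @pytest.fixture, which is not shown in the snippet).
def test_local_fixture_submits_inline(local):
    # LocalExecutor executes submitted callables immediately, in-process
    assert local.submit(lambda x: x + 1, 1) == 2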
def test_wait(self):
    """LocalExecutor's wait() method just returns its input"""
    assert LocalExecutor().wait(1) == 1
    assert LocalExecutor().wait(prefect) is prefect
def test_submit(self):
    """LocalExecutor directly executes the function"""
    assert LocalExecutor().submit(lambda: 1) == 1
    assert LocalExecutor().submit(lambda x: x, 1) == 1
    assert LocalExecutor().submit(lambda x: x, x=1) == 1
    assert LocalExecutor().submit(lambda: prefect) is prefect
def run(
    self,
    distributed: bool = False,
    clean: bool = False,
    debug: bool = False,
    structs: list = ["Nuc"],
    flow_viz: bool = False,
    **kwargs,
):
    """
    Run a flow with your steps.

    Parameters
    ----------
    distributed: bool
        A boolean option to determine if the jobs should be distributed to a
        remote cluster when possible.
        Default: False (Do not distribute)
    clean: bool
        Should the local staging directory be cleaned prior to this run.
        Default: False (Do not clean)
    debug: bool
        A debug flag for the developer to use to manipulate how much data runs,
        how it is processed, etc.
        Default: False (Do not debug)
    structs: list
        List of structure data to run pipeline on. Currently, only 'Nuc'
        (nuclear membrane) and 'Cell' (cell membrane) are supported.
    flow_viz: bool
        Make flow chart to visualize pipeline - requires conda install of
        graphviz.

    Notes
    -----
    Documentation on prefect: https://docs.prefect.io/core/
    Basic prefect example: https://docs.prefect.io/core/
    """
    # Initialize steps
    if "Nuc" in structs:
        loaddata_nuc = steps.LoadData()
        shparam_nuc = steps.Shparam(step_name="shparam_nuc")
        avgshape_nuc = steps.Avgshape(step_name="avgshape_nuc")
        nma_nuc = steps.Nma(step_name="nma_nuc")
    if "Cell" in structs:
        single_cell = steps.Singlecell(step_name="single_cell")
        shparam_cell = steps.Shparam(step_name="shparam_cell")
        avgshape_cell = steps.Avgshape(step_name="avgshape_cell")
        nma_cell = steps.Nma(step_name="nma_cell")
    if "Nuc" in structs and "Cell" in structs:
        compare_nuc_cell = steps.CompareNucCell()

    # Choose executor
    if debug:
        exe = LocalExecutor()
        distributed_executor_address = None
        log.info("Debug flagged. Will use threads instead of Dask.")
    else:
        if distributed:
            # Create or get log dir
            # Do not include ms
            log_dir_name = datetime.now().isoformat().split(".")[0]
            log_dir = Path(f".dask_logs/{log_dir_name}").expanduser()
            # Log dir settings
            log_dir.mkdir(parents=True, exist_ok=True)

            # Configure dask config
            dask.config.set(
                {
                    "scheduler.work-stealing": False,
                    "logging.distributed.worker": "info",
                }
            )

            # Create cluster
            log.info("Creating SLURMCluster")
            cluster = SLURMCluster(
                cores=4,
                memory="20GB",
                queue="aics_cpu_general",
                walltime="10:00:00",
                local_directory=str(log_dir),
                log_directory=str(log_dir),
            )
            log.info("Created SLURMCluster")

            # Scale cluster
            cluster.scale(60)

            # Use the port from the created connector to set executor address
            distributed_executor_address = cluster.scheduler_address

            # Log dashboard URI
            log.info(f"Dask dashboard available at: {cluster.dashboard_link}")
        else:
            # Create local cluster
            log.info("Creating LocalCluster")
            cluster = LocalCluster()
            log.info("Created LocalCluster")

            # Set distributed_executor_address
            distributed_executor_address = cluster.scheduler_address

            # Log dashboard URI
            log.info(f"Dask dashboard available at: {cluster.dashboard_link}")

        # Use dask cluster
        exe = DaskExecutor(distributed_executor_address)

    try:
        # Configure your flow
        with Flow("mti_nma") as flow:
            # If your step utilizes dask pass the executor address
            # If you want to clean the local staging directories pass clean
            # If you want to utilize some debugging functionality pass debug
            # If you don't utilize any of these, just pass the parameters you need.
            if "Nuc" in structs:
                struct = "Nuc"
                ld_nuc_df = loaddata_nuc(
                    distributed_executor_address=distributed_executor_address,
                    clean=clean,
                    debug=debug,
                    struct=struct,
                    **kwargs,
                )
                sh_nuc_df = shparam_nuc(
                    sc_df=ld_nuc_df,
                    distributed_executor_address=distributed_executor_address,
                    clean=clean,
                    debug=debug,
                    struct=struct,
                    **kwargs,
                )
                avg_nuc_df = avgshape_nuc(
                    sh_df=sh_nuc_df,
                    distributed_executor_address=distributed_executor_address,
                    clean=clean,
                    debug=debug,
                    struct=struct,
                    **kwargs,
                )
                nma_nuc_df = nma_nuc(
                    avg_df=avg_nuc_df,
                    distributed_executor_address=distributed_executor_address,
                    clean=clean,
                    debug=debug,
                    struct=struct,
                    **kwargs,
                )

            if "Cell" in structs:
                struct = "Cell"
                sc_cell_df = single_cell(
                    distributed_executor_address=distributed_executor_address,
                    clean=clean,
                    debug=debug,
                    struct=struct,
                    **kwargs,
                )
                sh_cell_df = shparam_cell(
                    sc_df=sc_cell_df,
                    distributed_executor_address=distributed_executor_address,
                    clean=clean,
                    debug=debug,
                    struct=struct,
                    **kwargs,
                )
                avg_cell_df = avgshape_cell(
                    sh_df=sh_cell_df,
                    distributed_executor_address=distributed_executor_address,
                    clean=clean,
                    debug=debug,
                    struct=struct,
                    **kwargs,
                )
                nma_cell_df = nma_cell(
                    avg_df=avg_cell_df,
                    distributed_executor_address=distributed_executor_address,
                    clean=clean,
                    debug=debug,
                    struct=struct,
                    **kwargs,
                )

            # If nucleus and cell membrane were analyzed, draw comparison plot
            if "Nuc" in structs and "Cell" in structs:
                compare_nuc_cell(nma_nuc_df, nma_cell_df)

        # Run flow, get ending state, and visualize pipeline
        flow.run(executor=exe)

        # Create pipeline visualization if flag is True
        # Note:
        # Flag False by default as a required package is not pip-installable
        # To use this feature, first `conda install graphviz`
        if flow_viz:
            flow.visualize()

    # Catch any error and kill the remote dask cluster
    except Exception as err:
        log.error(f"Something went wrong during pipeline run: {err}")
def test_is_pickleable(self):
    e = LocalExecutor()
    post = cloudpickle.loads(cloudpickle.dumps(e))
    assert isinstance(post, LocalExecutor)
@task
def transform(df):
    insert_loc = 0
    for col_name in df.columns[0:4]:
        df.insert(
            insert_loc + 1,
            col_name + "_mean",
            df.groupby("species")[col_name].transform(np.mean),
        )
        df.insert(
            insert_loc + 2,
            col_name + "_stdev",
            df.groupby("species")[col_name].transform(np.std),
        )
        insert_loc += 3
    return df


@task
def load(df):
    conn = create_engine(
        "postgresql://*****:*****@redshift-cluster-1.cbcap9uylzfk.us-east-2.redshift.amazonaws.com:5439/dev"
    )
    df.to_sql("iris_data", conn, index=False)


with Flow("ETL") as flow:
    e = extract()
    t = transform(e)
    l = load(t)

state = flow.run(executor=LocalExecutor())
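# Variant sketch (an assumption, not part of the original snippet): read the
# Redshift connection string from an environment variable instead of
# hard-coding credentials. REDSHIFT_CONN_STR is a hypothetical variable name.
import os

from prefect import task
from sqlalchemy import create_engine


@task
def load_from_env(df):
    conn = create_engine(os.environ["REDSHIFT_CONN_STR"])
    df.to_sql("iris_data", conn, index=False)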
def generate_movies(
    img: Union[str, Path],
    distributed_executor_port: Optional[Union[str, int]] = None,
    save_path: Optional[Union[str, Path]] = None,
    operating_dim: str = Dimensions.Time,
    overwrite: bool = False,
    fps: int = 12,
    quality: int = 6,
    save_format: str = "mp4",
    save_workflow: bool = False,
    normalization_func: Callable = single_channel_percentile_norm,
    normalization_kwargs: Dict[str, Any] = {},
    projection_func: Callable = single_channel_max_project,
    projection_kwargs: Dict[str, Any] = {},
    S: Optional[Union[int, slice]] = None,
    C: Optional[Union[int, slice]] = None,
    B: Union[int, slice] = 0,
) -> Path:
    """
    Generate a movie for every scene and channel pair found in a file through an
    operating dimension.

    Parameters
    ----------
    img: Union[str, Path]
        Path to a CZI file to read and generate movies for.
    distributed_executor_port: Optional[Union[str, int]]
        If provided, a port to use for connecting to the distributed scheduler.
        All image computation and workflow tasks will be distributed using Dask.
        Default: None
    save_path: Optional[Union[str, Path]]
        A specific path to save the generated movies to.
        Default: A directory named after the provided file.
    operating_dim: str
        Which dimension to operate through for each frame of the movie.
        Default: Dimensions.Time ("T")
    overwrite: bool
        Should existing files found under the same directory name be overwritten.
        Default: False
    fps: int
        Frames per second of each produced movie.
        Default: 12
    quality: int
        ImageIO's compression system. 0 is high compression, 10 is no compression.
        Default: 6
    save_format: str
        Which movie format should be used for each produced file.
        Default: mp4
        Available: mov, avi, mpg, mpeg, mp4, mkv, wmv
    save_workflow: bool
        Optionally, save a PNG and PDF of the workflow that ran. If this is set
        to True, be sure you have installed graphviz and added its executable to
        your PATH.
        Default: False
    normalization_func: Callable
        A function to normalize the entire movie data prior to projection.
        Default: timelapse_tools.normalization.single_channel_percentile_norm
    normalization_kwargs: Dict[str, Any]
        Any extra arguments to pass to the normalization function.
        Default: {}
    projection_func: Callable
        A function to project the data at each frame of the movie.
        Default: timelapse_tools.projection.single_channel_max_project
    projection_kwargs: Dict[str, Any]
        Any extra arguments to pass to the projection function.
        Default: {}
    S: Optional[Union[int, slice]]
        A specific integer or slice to use for selecting down the scenes to
        process.
        Default: None (process all scenes)
    C: Optional[Union[int, slice]]
        A specific integer or slice to use for selecting down the channels to
        process.
        Default: None (process all channels)
    B: Union[int, slice]
        A specific integer or slice to use for selecting down the B dimension to
        process.
        Default: 0

    Returns
    -------
    save_path: Path
        The path to the produced scene-channel pairings of movies.
    """
    if distributed_executor_port:
        from prefect.engine.executors import DaskExecutor

        executor = DaskExecutor(
            address=f"tcp://localhost:{distributed_executor_port}"
        )
    else:
        from prefect.engine.executors import LocalExecutor

        executor = LocalExecutor()

    # Run all processing through prefect + dask for better
    # parallelization and task optimization
    with Flow("czi_to_mp4_conversion") as flow:
        # Convert img to Path
        img = Path(img).expanduser().resolve(strict=True)

        # Determine save path
        save_path = _get_save_path(
            save_path=save_path, overwrite=overwrite, fname=img.with_suffix("").name
        )

        # Setup and check image and operating dimension provided
        img_details = _img_prep(
            img=img,
            operating_dim=operating_dim,
            # Don't run if save path checking failed
            upstream_tasks=[save_path],
        )

        # Select scene data
        img_details = _select_dimension(
            img=img_details[0],
            dims=img_details[1],
            dim_name=Dimensions.Scene,
            dim_indicies_selected=S,
        )

        # Select channel data
        img_details = _select_dimension(
            img=img_details[0],
            dims=img_details[1],
            dim_name=Dimensions.Channel,
            dim_indicies_selected=C,
        )

        # Select 'B' data
        img_details = _select_dimension(
            img=img_details[0],
            dims=img_details[1],
            dim_name=Dimensions.B,
            dim_indicies_selected=B,
        )

        # Generate all the index sets we will need to process
        getitem_indicies = _generate_getitem_indicies(
            img_shape=_get_image_shape(img_details[0]), dims=img_details[1]
        )

        # Generate all the movie selections
        to_process = _generate_process_list(
            img=img_details[0], getitem_indicies=getitem_indicies
        )

        # Generate a list of dictionaries that map dimension to selected data
        selected_indices = _generate_selected_dims_list(
            dims=img_details[1], getitem_indicies=getitem_indicies
        )

        # Generate movies for each
        _generate_movie.map(
            data=to_process,
            selected_indices=selected_indices,
            dims=unmapped(img_details[1]),
            operating_dim=unmapped(operating_dim),
            save_path=unmapped(save_path),
            fps=unmapped(fps),
            save_format=unmapped(save_format),
            normalization_func=unmapped(normalization_func),
            normalization_kwargs=unmapped(normalization_kwargs),
            projection_func=unmapped(projection_func),
            projection_kwargs=unmapped(projection_kwargs),
        )

    # Run the flow
    state = flow.run(executor=executor)

    # Get resulting path
    save_path = state.result[flow.get_tasks(name="_get_save_path")[0]].result

    # Save the flow viz to the same save_path
    if save_workflow:
        flow.visualize(filename=str(save_path / "workflow.png"))

    return save_path
def test_states_are_hydrated_correctly_with_retries(monkeypatch, tmpdir):
    """
    Ensures that retries longer than 10 minutes properly "hydrate" upstream states
    so that mapped tasks retry correctly.
    """
    flow_run_id = str(uuid.uuid4())
    task_run_id_1 = str(uuid.uuid4())
    task_run_id_2 = str(uuid.uuid4())

    with prefect.Flow(name="test-retries", result=LocalResult(dir=tmpdir)) as flow:
        t1 = plus_one.map([-1, 0, 1])
        t2 = invert_fail_once.map(t1)

    t2.max_retries = 1
    t2.retry_delay = datetime.timedelta(minutes=100)

    monkeypatch.setattr("requests.Session", MagicMock())
    monkeypatch.setattr("requests.post", MagicMock())

    client = MockedCloudClient(
        flow_runs=[FlowRun(id=flow_run_id)],
        task_runs=[
            TaskRun(
                id=task_run_id_1, task_slug=flow.slugs[t1], flow_run_id=flow_run_id
            ),
            TaskRun(
                id=task_run_id_2, task_slug=flow.slugs[t2], flow_run_id=flow_run_id
            ),
        ]
        + [
            TaskRun(
                id=str(uuid.uuid4()), task_slug=flow.slugs[t], flow_run_id=flow_run_id
            )
            for t in flow.tasks
            if t not in [t1, t2]
        ],
        monkeypatch=monkeypatch,
    )

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    assert client.flow_runs[flow_run_id].state.is_running()
    assert client.task_runs[task_run_id_1].state.is_mapped()
    assert client.task_runs[task_run_id_2].state.is_mapped()

    # there should be a total of 4 task runs corresponding to each mapped task
    for t in [t1, t2]:
        assert (
            len(
                [
                    tr
                    for tr in client.task_runs.values()
                    if tr.task_slug == flow.slugs[t]
                ]
            )
            == 4
        )

    # t2's first child task should be retrying
    t2_0 = next(
        tr
        for tr in client.task_runs.values()
        if tr.task_slug == flow.slugs[t2] and tr.map_index == 0
    )
    assert isinstance(t2_0.state, Retrying)

    # RUN A SECOND TIME with an artificially updated start time
    # and remove all in-memory data
    failed_id = [
        t_id
        for t_id, tr in client.task_runs.items()
        if tr.task_slug == flow.slugs[t2] and tr.map_index == 0
    ].pop()
    client.task_runs[failed_id].state.start_time = pendulum.now("UTC")

    for idx, tr in client.task_runs.items():
        tr.state._result.value = None

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    # t2's first child task should be successful
    t2_0 = next(
        tr
        for tr in client.task_runs.values()
        if tr.task_slug == flow.slugs[t2] and tr.map_index == 0
    )
    assert t2_0.state.is_successful()