def main():
    SOURCE = os.path.abspath('/Users/kenny.ells/data/dubai_his_arg.20191226')
    TARGET = os.path.abspath('tmp')
    FILES = sorted([os.path.join(SOURCE, f) for f in os.listdir(SOURCE)])
    FILES = FILES[:5]

    if not os.path.exists(TARGET):
        os.mkdir(TARGET)

    with Flow('plotting') as flow:
        plot.plot_roms.map(ncfile=FILES,
                           target=unmapped(TARGET),
                           varname=unmapped('temp'))

    # When calling dask-scheduler and dask-worker at the command line
    # address_tcp = 'tcp://10.90.69.73:8786'
    # print(address_tcp)
    # executor = DaskExecutor(address=address_tcp, local_processes=True, debug=True)

    # Defining the client in the script
    # client = Client()
    # print(client)
    # address_tcp = client.scheduler.address
    # executor = DaskExecutor(address=address_tcp)
    # flow.run(executor=executor)

    # Ignoring Dask
    flow.run()
def flow(self):
    if len(self.targets) == 1:
        target = self.targets[0]
    else:
        raise ValueError(
            "Zarr target requires self.targets be a length one list")

    with Flow(self.name,
              storage=self.storage,
              environment=self.environment) as _flow:
        # download to cache
        nc_sources = download.map(
            self.sources,
            cache_location=unmapped(self.cache_location),
        )
        # convert cached netcdf data to zarr
        cached_sources = nc2zarr.map(
            nc_sources,
            cache_location=unmapped(self.cache_location),
        )
        # combine all datasets into a single zarr archive
        combine_and_write(cached_sources, target)

    return _flow
def build(year: int, league_id: int, cookies: dict) -> Flow:
    """
    Flow builder with relevant tasks (increase modularity and abstraction)

    Args:
        year: (int) - year in which to make requests
        league_id: (int) - league id in which to make requests
        cookies: (dict) - auth cookies

    Returns:
        flow: (Flow) flow to be executed
    """
    with Flow("league_flow") as flow:
        year = Parameter("year")
        league_id = Parameter("league_id")
        cookies = Parameter("cookies")

        req = url_generator(year=year, league_id=league_id)
        meta = fetch_league_meta(base_url=req, cookies=cookies)
        fetch_team_meta.map(
            base_url=unmapped(req),
            team_id=meta["team_ids"],
            cookies=unmapped(cookies),
        )
    return flow
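A minimal usage sketch for the builder above (not from the original source): because the inputs are re-bound to `Parameter` tasks inside the flow, the concrete values are supplied at run time through `flow.run(parameters=...)`; the year, league id, and cookie values shown here are placeholders.

league_flow = build(year=2020, league_id=123456, cookies={"session": "placeholder"})
state = league_flow.run(parameters={
    "year": 2020,
    "league_id": 123456,
    "cookies": {"session": "placeholder"},
})
assert state.is_successful()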
def run(src_dir, dst_dir, config_path: str, debug=False):
    src_dir = Parameter("src_dir", src_dir)

    # create destination
    create_dir(dst_dir)
    dst_dir = Parameter("dst_dir", dst_dir)

    # number of workers
    config_path = Parameter("config_path", config_path)

    with Flow("inference_pipeline") as flow:
        # list tiles
        tiff_paths = find_src_files(src_dir, "h5")
        parted_tiff_paths = partition_path_list(tiff_paths, 5)

        prob_paths = infer.map(parted_tiff_paths,
                               unmapped(config_path),
                               unmapped(dst_dir))
        prob_paths = combine_path_list(prob_paths)

    if debug:
        flow.visualize(filename="flow_debug")
    else:
        client = get_client()
        executor = DaskExecutor(address=client.scheduler.address)
        flow.run(executor=executor)
def run(src_dir, dst_dir, debug=False):
    src_dir = Parameter("src_dir", src_dir)

    # create destination
    create_dir(dst_dir)
    dst_dir = Parameter("dst_dir", dst_dir)

    with Flow("classify_pipeline") as flow:
        # load data
        h5_paths = find_src_files(src_dir, "h5")
        info = preload_array_info(h5_paths)
        prob_map = read_prob_map.map(h5_paths, unmapped(info))

        # classify
        label = classify.map(prob_map)

        # save
        tiff_paths = build_path.map(unmapped(dst_dir), h5_paths, unmapped("tif"))
        write_tiff.map(tiff_paths, label)

    if debug:
        flow.visualize()
    else:
        client = get_client()
        executor = DaskExecutor(address=client.scheduler.address)
        flow.run(executor=executor)
def flow(self):
    with Flow(self.name) as _flow:
        sources = source_url.map(self.days)
        nc_sources = download.map(
            sources, cache_location=unmapped(self.cache_location))
        chunked = chunk(nc_sources, size=self.files_per_chunk)
        writes = combine_and_write.map(chunked,
                                       unmapped(self.target_location),
                                       unmapped(self.concat_dim))
        consolidate_metadata(writes, self.target_location)
    return _flow
def _make_flow(specs: Iterable[CopySpec]) -> prefect.Flow:
    with prefect.Flow("Rechunker") as flow:
        # iterate over different arrays in the group
        for spec in specs:
            copy_tasks = []
            # iterate over the different stages of the array copying
            for (source, target, chunks) in split_into_direct_copies(spec):
                keys = list(chunk_keys(source.shape, chunks))
                copy_task = _copy_chunk.map(prefect.unmapped(source),
                                            prefect.unmapped(target),
                                            keys)
                copy_tasks.append(copy_task)
            # create dependence between stages
            for n in range(len(copy_tasks) - 1):
                copy_tasks[n + 1].set_upstream(copy_tasks[n])
    return flow
def main():
    with Flow("combine-purpleair-sensors") as flow:
        environment = Parameter("environment", default="staging")
        start = DateTimeParameter("start")
        interval_hour = Parameter("interval_hour", default=1)
        end = DateTimeParameter("end_inclusive")
        dts = datetime_range(start, interval_hour, end)

        client = create_purpleair_archive_client(environment)
        maybe_all_sensors_processed = extract_warehouse_purpleair_processed.map(
            dt=dts, purpleair_client=unmapped(client))
        all_sensors_processed = filter_failed(maybe_all_sensors_processed)
        combined_sensors = combine_sensors(all_sensors_processed)

        blob_client = create_combined_sensors_blob_client(environment)
        load_combined_sensors(combined_sensors, blob_client)

    # Registers flow to server, which we can then deploy and run in background agents.
    # flow.register(project_name="caqi-flows")

    # Immediately executes without agents
    from datetime import datetime
    flow.run(start=datetime(2020, 11, 23, 7),
             end_inclusive=datetime(2020, 11, 25, 8))
def to_prefect(self):
    """Compile the recipe to a Prefect.Flow object."""
    from prefect import Flow, task, unmapped

    has_cache_inputs = getattr(self, "cache_inputs", False)
    if has_cache_inputs:
        cache_input_task = task(self.cache_input, name="cache_input")
    prepare_target_task = task(self.prepare_target, name="prepare_target")
    store_chunk_task = task(self.store_chunk, name="store_chunk")
    finalize_target_task = task(self.finalize_target, name="finalize_target")

    with Flow("pangeo-forge-recipe") as flow:
        if has_cache_inputs:
            cache_task = cache_input_task.map(
                input_key=list(self.iter_inputs()))
            upstream_tasks = [cache_task]
        else:
            upstream_tasks = []
        prepare_task = prepare_target_task(upstream_tasks=upstream_tasks)
        store_task = store_chunk_task.map(
            chunk_key=list(self.iter_chunks()),
            upstream_tasks=[unmapped(prepare_task)],
        )
        _ = finalize_target_task(upstream_tasks=[store_task])

    return flow
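A usage sketch for the compiled flow (assumed, not part of the recipe class): `recipe` stands in for an instance of the class that defines `to_prefect`, the scheduler address is a placeholder, and the executor import path matches the one used elsewhere in these snippets.

from prefect.engine.executors import DaskExecutor

flow = recipe.to_prefect()

# Serial, in-process execution
flow.run()

# Or distribute the mapped cache_input/store_chunk tasks over an existing
# Dask scheduler (address is a placeholder)
flow.run(executor=DaskExecutor(address="tcp://127.0.0.1:8786"))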
def test_map_in_local_flow_run(self, MockFlowView, MockClient):
    MockFlowView.from_id.return_value.flow_id = "flow-id"

    with prefect.Flow("test") as flow:
        create_flow_run.map(
            flow_id=prefect.unmapped("flow-id"), labels=["a", "b", "c"]
        )

    flow.run()

    assert MockClient().create_flow_run.call_count == 3

    seen_idempotency_keys = set()
    for i, (expected_label, call) in enumerate(
        zip(["a", "b", "c"], MockClient().create_flow_run.call_args_list)
    ):
        # Label is mapped over
        _, kwargs = call
        assert kwargs["labels"] == expected_label

        # Idempotency keys are unique
        assert kwargs["idempotency_key"] not in seen_idempotency_keys
        seen_idempotency_keys.add(kwargs["idempotency_key"])

        # Idempotency keys include map index
        assert kwargs["idempotency_key"].endswith(f"-{i}")
def _build(self, *, base_url=DEFAULT_BASE_URL, form_id=None, **kwargs):
    fetch = FetchResponses(
        base_url=base_url,
        form_id=form_id,
        force=True,  # this task should always run!
    )
    save = SaveResponse(form_id=form_id)
    get_token = GetItem(name='GetResponseID')
    get_user_hash = GetUserHash()
    add_protocol_metadata = AddStaticMetadata(
        new_meta={
            'protocol': {
                'name': 'vr-questionnaire',
                'extra': {
                    'form_id': form_id,
                },
            }
        })
    add_user_metadata = AddDynamicMetadata(key=('omind', 'user_hash'))
    report = Report()
    notify = SlackTask(
        preamble='Download of typeform responses finished.\nTask report:')

    with self:
        responses = fetch()
        response_id = get_token.map(task_result=responses,
                                    key=unmapped('response_id'))
        user_hash = get_user_hash.map(response=responses)
        files = save.map(response=responses, response_id=response_id)
        files_with_protocol = add_protocol_metadata.map(file=files)
        files_with_hash = add_user_metadata.map(file=files_with_protocol,
                                                value=user_hash)
        message = report(files=files_with_hash)
        notify(message=message)
def test_read_vault_secret(mocker):  # noqa: F811
    mocker.patch.object(vault_secrets, 'open')
    mocker.patch.object(vault_secrets.hvac, 'Client')

    with Flow("test") as f:
        secret_val = vault_secrets.VaultKVSecret(
            path="warehouses/test_platform/test_secret", version=2)
        get_val(unmapped(secret_val))

    state = f.run()
    assert state.is_successful()
def get_task_kwargs(op, ref, maps):
    new_kwargs = {}
    for k, v in op.get('kwargs', {}).items():
        if isinstance(v, str) and v.startswith(':'):
            v = ref[v[1:]]
        else:
            v = Constant(v)
        if k not in maps:
            v = unmapped(v)
        new_kwargs[k] = v
    return new_kwargs
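A self-contained sketch of how `get_task_kwargs` might be driven (the `op`/`ref` layout, the demo tasks, and the flow below are assumptions inferred from the function body): string values beginning with `:` are resolved against `ref`, every other value becomes a `Constant`, and any keyword not listed in `maps` is wrapped in `unmapped` so it is broadcast to each mapped child.

from prefect import Flow, task


@task
def list_files():
    # placeholder task standing in for a real file-discovery step
    return ["a.nc", "b.nc"]


@task
def process_file(paths, suffix):
    # placeholder mapped task; receives one path per mapped child
    return paths + suffix


with Flow("get-task-kwargs-demo") as demo_flow:
    files = list_files()
    ref = {"list_files": files}  # results of earlier operations
    op = {"kwargs": {"paths": ":list_files", "suffix": ".nc"}}
    kwargs = get_task_kwargs(op, ref, maps={"paths"})
    # kwargs["paths"] is the upstream task result (mapped over),
    # kwargs["suffix"] is unmapped(Constant(".nc"))
    process_file.map(**kwargs)

demo_flow.run()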
def main():
    with Flow("reprocess-purpleair") as flow:
        environment = Parameter("environment", default="staging")
        start = DateTimeParameter("start")
        interval_hour = Parameter("interval_hour", default=1)
        end = DateTimeParameter("end_inclusive")
        dts = datetime_range(start, interval_hour, end)

        client = create_purpleair_archive_client(environment)
        all_sensors_raw = extract_warehouse_purpleair.map(
            dt=dts, purpleair_client=unmapped(client))
        all_sensors_processed = transform_all_sensors_raw.map(all_sensors_raw)

        blob_client = create_hour_blob_client.map(
            environment=unmapped(environment), dt=dts)
        load_all_sensors_processed.map(all_sensors_processed, blob_client)

    # Registers flow to server, which we can then deploy and run in background agents.
    flow.register(project_name="caqi-flows")
def pipelines_to_plan(self, pipelines: ParallelPipelines) -> Flow:
    with Flow("rechunker") as flow:
        for pipeline in pipelines:
            upstream_tasks = []  # type: List[task]
            for stage in pipeline.stages:
                stage_task = task(stage.function, name=stage.name)
                if stage.mappable is not None:
                    stage_task_called = stage_task.map(
                        list(stage.mappable),  # prefect doesn't accept a generator
                        config=unmapped(pipeline.config),
                        upstream_tasks=[unmapped(t) for t in upstream_tasks],
                    )
                else:
                    stage_task_called = stage_task(
                        config=pipeline.config,
                        upstream_tasks=upstream_tasks)
                upstream_tasks = [stage_task_called]
    return flow
def run(src_dir, dst_dir, debug=False):
    src_dir = Parameter("src_dir", src_dir)

    # create destination
    create_dir(dst_dir)
    dst_dir = Parameter("dst_dir", dst_dir)

    with Flow("convert_pipeline") as flow:
        # load data
        tiff_paths = find_src_files(src_dir, "tif")
        info = preload_array_info(tiff_paths)
        raw_data = read_tiff.map(tiff_paths, unmapped(info))

        # save as zarr for faster access
        zarr_paths = build_path.map(unmapped(dst_dir), tiff_paths, unmapped("zarr"))
        zarr_paths = write_zarr.map(zarr_paths, raw_data, unmapped("raw"))

        # convert
        h5_paths = build_path.map(unmapped(dst_dir), zarr_paths, unmapped("h5"))
        zarr_to_h5.map(zarr_paths, h5_paths)

    if debug:
        flow.visualize()
    else:
        client = get_client()
        executor = DaskExecutor(address=client.scheduler.address)
        flow.run(executor=executor)
def get_flow():
    with Flow(name='backfill-flow') as flow:
        start_date = Parameter('start_date', default='2020-01-01')
        end_date = Parameter('end_date', default='2020-02-01')
        tick_type = Parameter('tick_type', default='trades')
        symbols = Parameter('symbols', default=['GLD'])
        symbol_date_list = get_remaining_symbol_dates(start_date, end_date,
                                                      symbols, tick_type)
        backfill_date_task_result = backfill_date_task.map(
            symbol_date=symbol_date_list, tick_type=unmapped(tick_type))
    return flow
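A usage sketch (assumed): every input to the backfill flow is a `Parameter` with a default, so a run can override any of them through `parameters`; the dates and symbols below are illustrative.

flow = get_flow()
flow.run(parameters={
    "start_date": "2020-01-01",
    "end_date": "2020-01-31",
    "tick_type": "trades",
    "symbols": ["GLD", "SPY"],
})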
def flow(self):
    with Flow(self.name) as flow:
        # Map the `source_url` task over each day. This returns a mapped output,
        # a list of string URLs. See
        # https://docs.prefect.io/core/concepts/mapping.html#prefect-approach
        # for more. We'll have one output URL per day.
        sources = source_url.map(self.days)

        # Map the `download` task (provided by prefect) to download the raw data
        # into a cache.
        # Mapped outputs (sources) can be fed straight into another Task.map call.
        # If an input is just a regular argument that's not a mapping, it must
        # be wrapped in `prefect.unmapped`.
        # https://docs.prefect.io/core/concepts/mapping.html#unmapped-inputs
        # nc_sources will be a list of cached URLs, one per input day.
        nc_sources = download.map(
            sources, cache_location=unmapped(self.cache_location))

        # The individual files would be a bit too small for analysis. We'll use
        # pangeo_forge.utils.chunk to batch them up. We can pass mapped outputs
        # like nc_sources directly to `chunk`.
        chunked = pangeo_forge.utils.chunk(nc_sources, size=5)

        # Combine all the chunked inputs and write them to their final destination.
        writes = combine_and_write.map(
            chunked,
            unmapped(self.target_location),
            append_dim=unmapped("time"),
            concat_dim=unmapped("time"),
        )

        # Consolidate the metadata for the final dataset.
        consolidate_metadata(self.target_location, writes=writes)

    return flow
def _build(self, *, base_url=DEFAULT_BASE_URL, form_id=None, **kwargs):
    required_families = dict(
        iguazu=None,
        omind=None,
        protocol=None,
        standard=None,
    )
    families = kwargs.get('families', {}) or {}  # Could be None by default args
    for name in required_families:
        families.setdefault(name, required_families[name])
    kwargs['families'] = families

    # When the query is set by kwargs, leave the query and dialect as they
    # come. Otherwise, set to the default defined just above
    if not kwargs.get('query', None):
        kwargs['query'] = self.DEFAULT_QUERY
        kwargs['dialect'] = 'postgresql_json'

    # First part of this flow: obtain a dataset of files
    dataset_flow = GenericDatasetFlow(**kwargs)
    json_files = dataset_flow.terminal_tasks().pop()
    self.update(dataset_flow)

    create_flow_metadata = CreateFlowMetadata(flow_name=self.REGISTRY_NAME)
    read_json = LoadJSON()
    read_form = GetForm(form_id=form_id, base_url=base_url)
    extract_scores = ExtractScores(
        output_hdf5_key='/iguazu/features/typeform/subject',
    )
    # TODO: propagate metadata when the branch that has that task is merged
    propagate_metadata = PropagateMetadata(
        propagate_families=['omind', 'protocol'])
    update_flow_metadata = UpdateFlowMetadata(flow_name=self.REGISTRY_NAME)

    with self:
        create_noresult = create_flow_metadata.map(parent=json_files)
        form = read_form()
        responses = read_json.map(file=json_files,
                                  upstream_tasks=[create_noresult])
        scores = extract_scores.map(parent=json_files,
                                    response=responses,
                                    form=unmapped(form))
        scores_with_metadata = propagate_metadata.map(parent=json_files,
                                                      child=scores)
        _ = update_flow_metadata.map(parent=json_files,
                                     child=scores_with_metadata)
def to_prefect(self):
    """Compile the recipe to a Prefect.Flow object."""
    from prefect import Flow, task, unmapped

    # TODO: allow recipes to customize which stages to run
    cache_input_task = task(self.cache_input, name="cache_input")
    prepare_target_task = task(self.prepare_target, name="prepare_target")
    store_chunk_task = task(self.store_chunk, name="store_chunk")
    finalize_target_task = task(self.finalize_target, name="finalize_target")

    with Flow("pangeo-forge-recipe") as flow:
        cache_task = cache_input_task.map(input_key=list(self.iter_inputs()))
        upstream_tasks = [cache_task]
        prepare_task = prepare_target_task(upstream_tasks=upstream_tasks)
        store_task = store_chunk_task.map(
            chunk_key=list(self.iter_chunks()),
            upstream_tasks=[unmapped(prepare_task)],
        )
        _ = finalize_target_task(upstream_tasks=[store_task])

    return flow
def flow(self):
    with Flow(self.name,
              storage=self.storage,
              environment=self.environment) as _flow:
        # download to cache
        nc_sources = download.map(
            self.sources,
            cache_location=unmapped(self.cache_location),
        )

        first = True
        write_tasks = []
        for source_group in chunked_iterable(nc_sources, self.files_per_chunk):
            write_task = combine_and_write(source_group,
                                           self.target_location,
                                           self.concat_dim,
                                           first=first)
            write_tasks.append(write_task)
            first = False
        cm = consolidate_metadata(self.target_location)

    return _flow
"Meta Data": meta_data, "Time Series (15min)": data }) @task def persist_data_in_influx(injector: Injector, av_response: InterdayResponseModel, secrets: Dict[str, str]): influx_v2_client = injector.get(InfluxDBClient) influx_v2_client.write_api(SYNCHRONOUS).write( secrets['INFLUX_V2_BUCKET'], record=interday_response_model_to_points(av_response)) schedule = IntervalSchedule(interval=timedelta(hours=24)) with Flow("scrap-stock", schedule) as flow: injector = create_secret_injector_task() token_renewal_result = renew_token_task(injector) secrets = fetch_secret_task('common', 'kv', injector) stocks = Parameter("stocks", default=["GOOGL", "MSFT"]) av_response = scrap_stock.map(stocks, secrets=unmapped(secrets)) persist_data_in_influx.map(injector=unmapped(injector), av_response=av_response, secrets=unmapped(secrets)) flow.storage = GitHub(repo="piokra/prefect-tutorial", path="scrap_stock.py") flow.run()
def generate_movies(
    img: Union[str, Path],
    distributed_executor_port: Optional[Union[str, int]] = None,
    save_path: Optional[Union[str, Path]] = None,
    operating_dim: str = Dimensions.Time,
    overwrite: bool = False,
    fps: int = 12,
    quality: int = 6,
    save_format: str = "mp4",
    save_workflow: bool = False,
    normalization_func: Callable = single_channel_percentile_norm,
    normalization_kwargs: Dict[str, Any] = {},
    projection_func: Callable = single_channel_max_project,
    projection_kwargs: Dict[str, Any] = {},
    S: Optional[Union[int, slice]] = None,
    C: Optional[Union[int, slice]] = None,
    B: Union[int, slice] = 0,
) -> Path:
    """
    Generate a movie for every scene and channel pair found in a file through an
    operating dimension.

    Parameters
    ----------
    img: Union[str, Path]
        Path to a CZI file to read and generate movies for.
    distributed_executor_port: Optional[Union[str, int]]
        If provided, a port to use for connecting to the distributed scheduler.
        All image computation and workflow tasks will be distributed using Dask.
        Default: None
    save_path: Optional[Union[str, Path]]
        A specific path to save the generated movies to.
        Default: a directory named after the provided file.
    operating_dim: str
        Which dimension to operate through for each frame of the movie.
        Default: Dimensions.Time ("T")
    overwrite: bool
        Should existing files found under the same directory name be overwritten.
        Default: False
    fps: int
        Frames per second of each produced movie.
        Default: 12
    quality: int
        ImageIO's compression system. 0 is high compression, 10 is no compression.
        Default: 6
    save_format: str
        Which movie format should be used for each produced file.
        Default: mp4
        Available: mov, avi, mpg, mpeg, mp4, mkv, wmv
    save_workflow: bool
        Optionally, save a PNG and PDF of the workflow that ran. If this is set
        to True, be sure you have installed graphviz and added its executable to
        your PATH.
        Default: False
    normalization_func: Callable
        A function to normalize the entire movie data prior to projection.
        Default: timelapse_tools.normalization.single_channel_percentile_norm
    normalization_kwargs: Dict[str, Any]
        Any extra arguments to pass to the normalization function.
        Default: {}
    projection_func: Callable
        A function to project the data at each frame of the movie.
        Default: timelapse_tools.projection.single_channel_max_project
    projection_kwargs: Dict[str, Any]
        Any extra arguments to pass to the projection function.
        Default: {}
    S: Optional[Union[int, slice]]
        A specific integer or slice to use for selecting down the scenes to process.
        Default: None (process all scenes)
    C: Optional[Union[int, slice]]
        A specific integer or slice to use for selecting down the channels to process.
        Default: None (process all channels)
    B: Union[int, slice]
        A specific integer or slice to use for selecting down the channels to process.
        Default: 0

    Returns
    -------
    save_path: Path
        The path to the produced scene-channel pairings of movies.
""" if distributed_executor_port: from prefect.engine.executors import DaskExecutor executor = DaskExecutor( address=f"tcp://localhost:{distributed_executor_port}") else: from prefect.engine.executors import LocalExecutor executor = LocalExecutor() # Run all processing through prefect + dask for better # parallelization and task optimization with Flow("czi_to_mp4_conversion") as flow: # Convert img to Path img = Path(img).expanduser().resolve(strict=True) # Determine save path save_path = _get_save_path(save_path=save_path, overwrite=overwrite, fname=img.with_suffix("").name) # Setup and check image and operating dimension provided img_details = _img_prep( img=img, operating_dim=operating_dim, # Don't run if save path checking failed upstream_tasks=[save_path], ) # Select scene data img_details = _select_dimension( img=img_details[0], dims=img_details[1], dim_name=Dimensions.Scene, dim_indicies_selected=S, ) # Select channel data img_details = _select_dimension( img=img_details[0], dims=img_details[1], dim_name=Dimensions.Channel, dim_indicies_selected=C, ) # Select 'B' data img_details = _select_dimension( img=img_details[0], dims=img_details[1], dim_name=Dimensions.B, dim_indicies_selected=B, ) # Generate all the indicie sets we will need to process getitem_indicies = _generate_getitem_indicies( img_shape=_get_image_shape(img_details[0]), dims=img_details[1]) # Generate all the movie selections to_process = _generate_process_list(img=img_details[0], getitem_indicies=getitem_indicies) # Generate a list of dictionaries that map dimension to selected data selected_indices = _generate_selected_dims_list( dims=img_details[1], getitem_indicies=getitem_indicies) # Generate movies for each _generate_movie.map( data=to_process, selected_indices=selected_indices, dims=unmapped(img_details[1]), operating_dim=unmapped(operating_dim), save_path=unmapped(save_path), fps=unmapped(fps), save_format=unmapped(save_format), normalization_func=unmapped(normalization_func), normalization_kwargs=unmapped(normalization_kwargs), projection_func=unmapped(projection_func), projection_kwargs=unmapped(projection_kwargs), ) # Run the flow state = flow.run(executor=executor) # Get resulting path save_path = state.result[flow.get_tasks(name="_get_save_path")[0]].result # Save the flow viz to the same save_path if save_workflow: flow.visualize(filename=str(save_path / "workflow.png")) return save_path
        result_handler=GCSResultHandler(bucket='prefect_results')) as flow:
    _url = Parameter("url", default='http://www.insidethex.co.uk/')
    _bypass = Parameter("bypass", default=False, required=False)
    _db_file = Parameter("db_file", default='xfiles_db.sqlite', required=False)

    # scrape the website
    _home_page = retrieve_url(_url)
    _episodes = create_episode_list(base_url=_url,
                                    main_html=_home_page,
                                    bypass=_bypass)
    _episode = retrieve_url.map(_episodes)
    _dialogue = scrape_dialogue.map(_episode)

    # insert into SQLite table
    _db = create_db(filename=_db_file)
    _final = insert_episode.map(episode=_dialogue, tbl=unmapped(_db))


if __name__ == '__main__':
    # debug the local execution of the flow
    import sys
    import argparse
    from prefect.utilities.debug import raise_on_exception

    # get any CLI arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--visualize', required=False, default=False)
    parser.add_argument('--deploy', required=False, default=False)
    p = parser.parse_args(sys.argv[1:])

    if p.visualize:
        # view the DAG
    },
    python_dependencies=[
        "python-dotenv",
        "boto3",
        "botocore",
    ],
    ignore_healthchecks=True,
    # only an extreme poweruser should use this ^
)

run_config = DockerRun(
    env={"sample_key": "sample_value"},
    labels=["docker"],
)

with Flow(
    "Upload to S3", storage=storage, run_config=run_config
) as flow:
    files_to_download = Parameter(
        name="File List",
        default=["data/test_data.csv", "data/user_data.csv", "data/event_data.csv"]
    )
    conn = connect_to_s3()
    upload_to_s3.map(
        s3_client=unmapped(conn),
        file_path=create_filepath.map(files_to_download)
    )

# flow.run()
flow.register(project_name="AWS")
def create_event_index_pipeline(
    config: EventIndexPipelineConfig,
    n_grams: int = 1,
    store_local: bool = False,
) -> Flow:
    """
    Create the Prefect Flow object to preview, run, or visualize for indexing
    all events in the database.

    Parameters
    ----------
    config: EventIndexPipelineConfig
        Configuration options for the pipeline.
    n_grams: int
        N number of terms to act as a unique entity. Default: 1
    store_local: bool
        Should the generated index be stored locally to disk or uploaded to
        database. Storing the local index is useful for testing search result
        rankings with the `search_cdp_events` bin script.
        Default: False (store to database)

    Returns
    -------
    flow: Flow
        The constructed CDP Event Index Pipeline as a Prefect Flow.
    """
    with Flow("CDP Event Index Pipeline") as flow:
        # Ensure stopwords are downloaded
        # Do this once to ensure that we don't enter a race condition
        # with multiple workers trying to download / read overtop one another
        # later on.
        try:
            from nltk.corpus import stopwords

            stopwords.words("english")
        except LookupError:
            import nltk

            nltk.download("stopwords")
            log.info("Downloaded nltk stopwords")
            from nltk.corpus import stopwords

            stopwords.words("english")

        # Get all transcripts
        all_transcripts = get_transcripts(
            credentials_file=config.google_credentials_file)

        # Select highest confidence transcript for each session
        selected_transcripts = get_highest_confidence_transcript_for_each_session(
            transcripts=all_transcripts)

        # Get all transcripts for each event (multi-session events)
        event_transcripts = get_transcripts_per_event(
            transcripts=selected_transcripts)

        # Read all transcripts for each event and generate grams
        all_event_transcript_n_grams = read_transcripts_and_generate_grams.map(
            event_transcripts=event_transcripts,
            n_grams=unmapped(n_grams),
            credentials_file=unmapped(config.google_credentials_file),
        )

        # Convert to dataframe for tfidf calc
        all_events_n_grams = convert_all_n_grams_to_dataframe(
            all_events_n_grams=all_event_transcript_n_grams,
        )

        # Weighted n grams by tfidf
        scored_n_grams = compute_tfidf(
            n_grams=all_events_n_grams,
            datetime_weighting_days_decay=config.datetime_weighting_days_decay,
        )

        # Route to local storage task or remote bulk upload
        if store_local:
            store_local_index(n_grams_df=scored_n_grams, n_grams=n_grams)

        # Route to remote database storage
        else:
            chunked_scored_n_grams = chunk_n_grams(scored_n_grams)
            store_n_gram_chunk.map(
                n_gram_chunk=chunked_scored_n_grams,
                credentials_file=unmapped(config.google_credentials_file),
            )

    return flow
with Flow(name="Test-Get-Imbalances", result=result_h) as tsx_imb_fl:
    tsx_url = Parameter("tsx_url",
                        default="https://api.tmxmoney.com/mocimbalance/en/TSX/moc.html")
    imb_tbl_nm = Parameter("imb_tbl_nm", default="moc_tst")
    n_conn = Parameter("n_conn", default=1)

    # Scrape the website
    tsx_imb_df = get_tsx_moc_imb(tsx_url)

    # Get the connection string from prefect cloud
    conn_str = PrefectSecret("moc_pgdb_conn")

    # Partition the df to spread the inserts across n_conn connections
    tsx_imb_df_lst = partition_df(tsx_imb_df, n_conn)

    df_shape = df_to_db.map(tsx_imb_df_lst,
                            tbl_name=unmapped(imb_tbl_nm),
                            conn_str=unmapped(conn_str))


if __name__ == "__main__":

    # Inputs
    tsx_url = 'https://api.tmxmoney.com/mocimbalance/en/TSX/moc.html'
    backup_url = "https://web.archive.org/web/20200414202757/https://api.tmxmoney.com/mocimbalance/en/TSX/moc.html"

    # Script
    from prefect.engine.executors import LocalExecutor

    tsx_imb_fl.visualize()

    fl_state = tsx_imb_fl.run(
        parameters=dict(
            tsx_url=backup_url,
            n_conn=4
def _define_model_selection_flow():
    """Define flow that runs model selection.

    Specifically data filtering, partitioning and model selection
    and optional persistence on a given dataset.

    Returns
    -------
    prefect.Flow
    """
    from prefect import task, Flow, Parameter, unmapped

    with Flow("model selection") as flow:
        df = Parameter("data")
        grid_search = Parameter("grid_search")
        target_col_name = Parameter("target_col_name")
        country_code_column = Parameter("country_code_column")
        include_rules = Parameter("include_rules")
        exclude_rules = Parameter("exclude_rules")
        parallel_over_columns = Parameter("parallel_over_columns")
        partition_columns = Parameter("partition_columns")
        frequency = Parameter("frequency")
        output_path = Parameter("output_path")
        persist_cv_data = Parameter("persist_cv_data")
        persist_cv_results = Parameter("persist_cv_results")
        persist_model_reprs = Parameter("persist_model_reprs")
        persist_best_model = Parameter("persist_best_model")
        persist_partition = Parameter("persist_partition")
        persist_model_selector_results = Parameter(
            "persist_model_selector_results")

        df_filtered = task(filter_data)(df=df,
                                        include_rules=include_rules,
                                        exclude_rules=exclude_rules)

        partitions = task(partition_data)(df=df_filtered,
                                          partition_by=parallel_over_columns)

        parallel_over_dicts, partition_dfs = partitions["labels"], partitions["data"]

        train_data = task(prepare_data_for_training).map(
            df=partition_dfs,
            frequency=unmapped(frequency),
            partition_columns=unmapped(partition_columns),
            parallel_over_columns=unmapped(parallel_over_columns),
            country_code_column=unmapped(country_code_column),
        )

        results = task(select_model).map(
            df=train_data,
            target_col_name=unmapped(target_col_name),
            grid_search=unmapped(grid_search),
            partition_columns=unmapped(partition_columns),
            parallel_over_dict=parallel_over_dicts,
            frequency=unmapped(frequency),
            country_code_column=unmapped(country_code_column),
        )

        write_ok = task(persist_experts_in_physical_partition).map(
            results=results,
            folder_path=unmapped(output_path),
            persist_cv_results=unmapped(persist_cv_results),
            persist_cv_data=unmapped(persist_cv_data),
            persist_model_reprs=unmapped(persist_model_reprs),
            persist_best_model=unmapped(persist_best_model),
            persist_partition=unmapped(persist_partition),
            persist_model_selector_results=unmapped(
                persist_model_selector_results),
        )

    flow.set_reference_tasks([write_ok])
    return flow
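A usage sketch (assumed): every input to the model-selection flow is a `Parameter`, so a run is driven entirely by the `parameters` dict; `df` and `grid_search` stand for a pre-loaded dataset and a pre-built grid-search definition, and the remaining values are placeholders.

flow = _define_model_selection_flow()
state = flow.run(parameters={
    "data": df,                          # pre-loaded pandas DataFrame (assumed)
    "grid_search": grid_search,          # pre-built grid-search object (assumed)
    "target_col_name": "y",
    "country_code_column": None,
    "include_rules": None,
    "exclude_rules": None,
    "parallel_over_columns": ["region"],
    "partition_columns": ["product"],
    "frequency": "D",
    "output_path": "results/",
    "persist_cv_data": False,
    "persist_cv_results": False,
    "persist_model_reprs": False,
    "persist_best_model": True,
    "persist_partition": True,
    "persist_model_selector_results": True,
})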
class DoNotLikeEven(Exception):
    pass


@task(name="multiply input if even",
      max_retries=1,
      retry_delay=timedelta(seconds=5))
def transform(x: int, factor: int) -> int:
    """Multiply the input by `factor`"""
    if (x % 2) == 0:
        raise DoNotLikeEven(f'Do not like even numbers and received {x}')
    return x * factor


@task(trigger=some_successful(at_least=1, at_most=6),
      state_handlers=[slack_notifier])
def load(data: list):
    """Print the data to indicate it was received"""
    print("Here's your output data: {}".format(data))


# Set dependency graph
with Flow('ETL') as flow:
    e = extract()
    t = transform.map(e, unmapped(factor))
    l = load(t)

# with prefect.context(secrets=dict(SLACK_WEBHOOK_URL="https://hooks.slack.com/services/XXX")):
#     flow.run()
quit()
def download_cdp_dataset(args: Args):
    # Try running the download pipeline
    try:
        # Get instance config
        instance_config = getattr(configs, args.instance_name.upper())

        # Create connection to instance
        cdp_instance = CDPInstance(instance_config)

        # Get speaker annotated transcripts
        sats = cdp_instance.database.select_rows_as_list(
            "transcript", [("confidence", 0.97)])

        # Spawn local dask cluster
        cluster = LocalCluster()

        # Log dashboard link
        log.info(f"Dashboard available at: {cluster.dashboard_link}")

        # Setup workflow
        with Flow("get_dataset") as flow:
            # Download videos
            video_paths = _download_video.map(
                [sat["event_id"] for sat in sats],
                unmapped(cdp_instance.database),
                unmapped(args.save_dir),
                unmapped(args.overwrite))

            # Split audio from video
            audio_paths = _split_audio_from_video.map(video_paths,
                                                      unmapped(args.overwrite))

            # Download transcripts
            transcript_paths = _download_transcript.map(
                [sat["event_id"] for sat in sats],
                unmapped(cdp_instance.database),
                unmapped(cdp_instance.file_store),
                unmapped(args.save_dir),
                unmapped(args.overwrite))

            # Create large audio manifest
            events = _generate_initial_download_manifest(
                [sat["event_id"] for sat in sats],
                video_paths,
                audio_paths,
                transcript_paths,
                args.save_dir)

            # Generate sentence splits
            manifests = _generate_splits.map(events, unmapped(args.overwrite))

            # Generate splits manifest
            _generate_splits_manifest(manifests, unmapped(args.save_dir))

        # Run the flow
        state = flow.run(executor=DaskExecutor(cluster.scheduler_address))

        # Log resulting manifest
        manifest_save_path = (state.result[flow.get_tasks(
            name="_generate_splits_manifest")[0]].result)
        log.info(f"Dataset manifest stored to: {manifest_save_path}")

    # Catch any exception
    except Exception as e:
        log.error("=============================================")
        if args.debug:
            log.error("\n\n" + traceback.format_exc())
            log.error("=============================================")
        log.error("\n\n" + str(e) + "\n")
        log.error("=============================================")
        sys.exit(1)