def test_local_result_creates_necessary_dirs(self, tmp_dir):
    """Write to a nested, templated location and check the returned result.

    The location template contains two sub-directories and a ``{thing}``
    placeholder; the written result must report the formatted location and
    carry the written value.
    """
    nested_parts = ("mydir", "mysubdir")
    template = os.path.join(*nested_parts, "{thing}.txt")
    expected_location = os.path.join(*nested_parts, "42.txt")

    written = LocalResult(dir=tmp_dir, location=template).write(
        "so-much-data", thing=42
    )

    assert written.location == expected_location
    assert written.value == "so-much-data"
def test_local_result_writes_to_dir(self, tmp_dir, res):
    """Write ``res`` via a LocalResult and read the file back as raw bytes."""
    written = LocalResult(dir=tmp_dir, location="test.txt").write(res)
    location = written.location

    assert isinstance(location, str)
    assert location.endswith("test.txt")

    # The reported location is joined onto the result directory to find the file.
    target = os.path.join(tmp_dir, location)
    with open(target, "rb") as handle:
        raw = handle.read()
    assert isinstance(raw, bytes)
def test_copy_appropriately_sets_result_target_if_target_provided(self):
    """A ``target`` given at definition or via ``task_args`` becomes the result location."""
    # https://github.com/PrefectHQ/prefect/issues/2588

    @task(target="target", result=LocalResult(dir="."))
    def X():
        pass

    @task
    def Y():
        pass

    with Flow("test"):
        x = X()
        # Same target/result as X, but supplied at call time instead.
        overrides = {"target": "target", "result": LocalResult(dir=".")}
        y = Y(task_args=overrides)

    for bound in (x, y):
        assert bound.result.location == "target"
def __init__(
    self,
    directory: str = None,
    validate: bool = True,
    path: str = None,
    stored_as_script: bool = False,
    **kwargs: Any,
) -> None:
    """Set up the storage directory and its backing ``LocalResult``.

    Args:
        directory: where flows are stored; when falsy, defaults to the
            ``flows`` folder under ``prefect.config.home_dir``.
        validate: when true, the directory is user-expanded, made absolute
            and created if needed; the flag is also forwarded to
            ``LocalResult`` as ``validate_dir``.
        path: stored on the instance as ``self.path``.
        stored_as_script: forwarded to the parent initializer.
        **kwargs: forwarded to the parent initializer.
    """
    if not directory:
        directory = os.path.join(prefect.config.home_dir, "flows")

    self.flows = {}  # type: Dict[str, str]
    self._flows = {}  # type: Dict[str, "prefect.core.flow.Flow"]
    self.path = path

    # Without validation the directory is used exactly as given.
    resolved = directory
    if validate:
        resolved = os.path.abspath(os.path.expanduser(directory))
        os.makedirs(resolved, exist_ok=True)
    self.directory = resolved

    super().__init__(
        result=LocalResult(self.directory, validate_dir=validate),
        stored_as_script=stored_as_script,
        **kwargs,
    )
def test_getitem_preserves_result_info(self):
    """Indexing a task keeps the checkpoint flag and result of the source task."""
    with Flow(name="test"):
        unchecked = Task(checkpoint=False)()[0]
        checked = Task(checkpoint=True, result=LocalResult(dir="home"))[1]

    assert unchecked.checkpoint is False

    preserved = checked.result
    assert isinstance(preserved, LocalResult)
    assert preserved.dir.endswith("home")
def test_getattr_preserves_result_info(self):
    """Calling ``GetAttr`` keeps the checkpoint flag and result it was built with."""
    with Flow(name="test"):
        p = Parameter("p")
        unchecked = GetAttr(checkpoint=False)(p, "foo")
        checked = GetAttr(checkpoint=True, result=LocalResult(dir="home"))(p, "bar")

    assert unchecked.checkpoint is False

    preserved = checked.result
    assert isinstance(preserved, LocalResult)
    assert preserved.dir.endswith("home")
def test_doesnt_raise_for_mapped_tasks_with_correctly_specified_result_location(
    self, location, tmpdir
):
    """The result health check returns ``None`` for the supplied ``location``."""
    configured = LocalResult(dir=tmpdir, location=location)

    @task(result=configured)
    def down(x):
        pass

    with Flow("upstream-test") as flow:
        down.map(x=[1, 2, 3])

    outcome = healthchecks.result_check([flow])
    assert outcome is None
def test_raises_for_mapped_tasks_with_poorly_specified_result_location(
    self, tmpdir
):
    """The result health check raises for a mapped task using ``{task_name}.txt``.

    The expected failure is a ``ValueError`` whose message matches "filename".
    """
    templated = LocalResult(dir=tmpdir, location="{task_name}.txt")

    @task(result=templated)
    def down(x):
        pass

    with Flow("upstream-test") as flow:
        down.map(x=[1, 2, 3])

    with pytest.raises(ValueError, match="filename"):
        healthchecks.result_check([flow])
def load_result(checkpoint_dir: Union[str, Path], date: str, name: str) -> Any:
    """Load a Prefect checkpointed result from file for the given date.

    The checkpoint is looked up at ``<checkpoint_dir>/<date>/<name>.prefect``.

    Args:
        checkpoint_dir (Union[str, Path]): directory the checkpoints live under
        date (str): date to load the checkpoint from
        name (str): name of the file (stem, e.g. 'p' if file name is 'p.prefect')

    Returns:
        Any: the ``value`` of the result read from the checkpoint file

    Raises:
        AssertionError: if the checkpoint file does not exist
    """
    result = LocalResult(dir=Path(checkpoint_dir).as_posix())
    location = Path(date, f'{name}.prefect').as_posix()

    # Raised explicitly (not via `assert`) so the check still runs under
    # `python -O`; the exception type is kept for existing callers.
    if not result.exists(location=location):
        raise AssertionError(
            f'Result must exist, checked {Path(checkpoint_dir, date).as_posix()} for {name}.prefect.'
        )

    return result.read(location=location).value
def test(e: Optional[Executor]):
    """Run a one-task flow with JSON checkpointing and inspect the written file.

    Args:
        e: executor handed to ``Flow.run``.
    """
    with TemporaryDirectory() as tmpdir:
        json_result = LocalResult(
            tmpdir,
            serializer=JSONSerializer(),
            location="{task_name}.json",
        )

        with Flow("write_result", result=json_result) as f:
            _terminal = task(lambda: 42, checkpoint=True, name="magic")()

        with set_temporary_config({"flows.checkpointing": True}):
            with raise_on_exception():
                f.run(executor=e)

        written = os.listdir(tmpdir)
        assert written == ["magic.json"], written

        checkpoint_path = os.path.join(tmpdir, written[0])
        with open(checkpoint_path, "rb") as handle:
            loaded = json.load(handle)
        assert loaded == 42
def __init__(
    self, directory: str = None, validate: bool = True, **kwargs: Any
) -> None:
    """Set up the storage directory and its backing ``LocalResult``.

    Args:
        directory: where flows are stored; when falsy, defaults to the
            ``flows`` folder under ``prefect.config.home_dir``.
        validate: when true, the directory is user-expanded, made absolute
            and created if needed; the flag is also forwarded to
            ``LocalResult`` as ``validate_dir``.
        **kwargs: forwarded to the parent initializer.
    """
    directory = directory or os.path.join(prefect.config.home_dir, "flows")
    self.flows = dict()  # type: Dict[str, str]
    self._flows = dict()  # type: Dict[str, "prefect.core.flow.Flow"]

    if validate:
        abs_directory = os.path.abspath(os.path.expanduser(directory))
        # `exist_ok=True` replaces a separate existence check, which could
        # race with another process creating the same directory.
        os.makedirs(abs_directory, exist_ok=True)
    else:
        # Without validation the directory is used exactly as given.
        abs_directory = directory

    self.directory = abs_directory
    result = LocalResult(self.directory, validate_dir=validate)
    super().__init__(result=result, **kwargs)
def test_task_call_with_self_succeeds():
    """A task built from an unbound method runs when the instance is passed as ``self``."""
    from dataclasses import dataclass

    @dataclass
    class TestClass:
        count: int

        def increment(self):
            self.count += 1

    seconds_task = task(
        TestClass.increment,
        target="{{task_slug}}_{{map_index}}",
        result=LocalResult(),
    )
    instance = TestClass(count=0)

    with Flow("test") as flow:
        seconds_task(instance)

    final_state = flow.run()
    assert final_state.is_successful()
def test_task_runner_treats_unfound_files_as_invalid_caches(client, tmpdir):
    """The runner ends up with the second cached state's value (13).

    The first cached state points at a file path under ``tmpdir`` that this
    test never creates; the second one is a ``PrefectResult`` located at "13".
    """

    @prefect.task(cache_for=datetime.timedelta(minutes=1), result=PrefectResult())
    def cached_task():
        return 42

    unreadable = Cached(
        cached_result_expiration=datetime.datetime.utcnow()
        + datetime.timedelta(minutes=2),
        result=LocalResult(location=str(tmpdir / "made_up_data.prefect")),
    )
    readable = Cached(
        cached_result_expiration=datetime.datetime.utcnow()
        + datetime.timedelta(days=1),
        result=PrefectResult(location="13"),
    )
    client.get_latest_cached_states = MagicMock(
        return_value=[unreadable, readable]
    )

    final_state = CloudTaskRunner(task=cached_task).run()

    assert client.get_latest_cached_states.called
    assert final_state.is_successful()
    assert final_state.is_cached()
    assert final_state.result == 13
# NOTE(review): no decorator is visible on `generate_list` in this view, so it
# is called as a plain function inside the flow block below — confirm whether a
# `@task` line was lost above it.
def generate_list():
    """Return the fixed list ``[1, 2, 3]``."""
    return [1, 2, 3]


@task
def do_something(n):
    """Return ``n`` unchanged."""
    return n


@task
def fail(x):
    """Print ``x`` and then always raise ``ValueError``."""
    print(x)
    raise ValueError()


# Result whose location is templated on the task's full name.
result = LocalResult(location="{task_full_name}.pb")

# NOTE(review): the storage path below is an absolute path on one developer's
# machine — confirm before running elsewhere.
with Flow(
    "Restart Me",
    storage=Local(
        stored_as_script=True,
        path="/Users/josh/Desktop/code/Dummy-Flows/restartme.py",
    ),
    result=result,
) as flow:
    lst = generate_list()
    d = do_something.map(lst)
    # `fail` always raises, so this flow is built to end in a failure.
    fail(d)

environment = LocalEnvironment(executor=DaskExecutor())
flow.environment = environment
player_stats, team_stats).loc[team_rosters[team_name]] def __repr__(self): return f'League {self.league_id}' @classmethod def load_league(cls, file_path: Union[str, Path]): with open(file_path, 'r') as f: league_config = json.load(f) return cls(**league_config) @task( name='Get League Mean Statistics', result=LocalResult( location= "{output_directory}/{date:%m}-{date:%d}-{date:%Y}/league_mean_statistics.prefect" ), checkpoint=True, ) def get_league_mean_statistics(team_stats: pd.DataFrame) -> pd.DataFrame: """Aggregates team aggregated statistics over the whole league. Args: team_stats (pd.DataFrame): multi-indexed dataframe with all teams statistics Returns: pd.DataFrame: 2D dataframes containing traditional mean statistics (e.g. PTS, AST) over * the last 7 days * the last 15 days * the last 30 days
def test_build_and_register(self, capsys, monkeypatch, force):
    """Build and register a few flows:
    - 1 new flow
    - 1 updated flow
    - 1 skipped flow
    - 1 error during registration
    - 2 sharing the same storage (which fails to build properly)
    - 2 from a pre-built JSON file
    """
    build_call_count = 0

    # Storage whose `build` only counts how often it is invoked.
    class MyModule(Module):
        def build(self):
            nonlocal build_call_count
            build_call_count += 1

    # Storage whose `build` always fails.
    class BadStorage(Module):
        def build(self):
            raise ValueError("whoops!")

    client = MagicMock()
    register_serialized_flow = MagicMock()
    # One entry per expected registration call, in call order; the fourth
    # call raises instead of returning.
    register_serialized_flow.side_effect = [
        ("new-id-1", 1, True),
        ("old-id-2", 2, False),
        ("new-id-3", 3, True),
        ValueError("Oh no!"),
        ("new-id-7", 1, True),
        ("old-id-8", 2, False),
    ]
    monkeypatch.setattr(
        "prefect.cli.build_register.register_serialized_flow",
        register_serialized_flow,
    )

    storage1 = MyModule("testing")
    storage1.result = LocalResult()
    flow1 = Flow("flow 1", storage=storage1, run_config=UniversalRun(labels=["a"]))
    flow2 = Flow(
        "flow 2",
        storage=MyModule("testing"),
        environment=LocalEnvironment(labels=["a"]),
    )
    # flow3 and flow4 share one storage object.
    storage2 = MyModule("testing")
    flow3 = Flow("flow 3", storage=storage2)
    flow4 = Flow("flow 4", storage=storage2)
    # flow5 and flow6 share the storage that fails to build.
    storage3 = BadStorage("testing")
    flow5 = Flow("flow 5", storage=storage3)
    flow6 = Flow("flow 6", storage=storage3)
    # flow7 and flow8 are already-serialized flows wrapped in `box.Box`.
    flow7 = box.Box(
        Flow("flow 7", run_config=UniversalRun(labels=["a"])).serialize(build=False))
    flow8 = box.Box(
        Flow("flow 8", environment=LocalEnvironment(
            labels=["a"])).serialize(build=False))
    flows = [flow1, flow2, flow3, flow4, flow5, flow6, flow7, flow8]

    stats = build_and_register(client, flows, "my-project-id", labels=["b", "c"], force=force)

    # 3 calls (one for each unique `MyModule` storage object)
    assert build_call_count == 3
    # 6 register calls (8 - 2 that failed to build storage)
    assert register_serialized_flow.call_count == 6
    for flow, (args, kwargs) in zip(flows, register_serialized_flow.call_args_list):
        assert not args
        assert kwargs["client"] is client
        assert kwargs["serialized_flow"]
        assert kwargs["project_id"] == "my-project-id"
        assert kwargs["force"] == force

    # Stats are recorded properly
    assert dict(stats) == {"registered": 3, "skipped": 2, "errored": 3}

    # Flows are properly configured
    assert flow1.result is storage1.result
    assert flow1.run_config.labels == {"a", "b", "c"}
    assert flow2.environment.labels == {"a", "b", "c"}
    assert isinstance(flow3.run_config, UniversalRun)
    assert flow3.run_config.labels == {"b", "c"}
    assert isinstance(flow4.run_config, UniversalRun)
    assert flow4.run_config.labels == {"b", "c"}
    assert set(flow7["run_config"]["labels"]) == {"a", "b", "c"}
    assert set(flow8["environment"]["labels"]) == {"a", "b", "c"}

    # The output contains a traceback, which will vary between machines
    # We only check that the following fixed sections exist in the output
    parts = [
        (" Building `MyModule` storage...\n"
         " Registering 'flow 1'... Done\n"
         " └── ID: new-id-1\n"
         " └── Version: 1\n"
         " Building `MyModule` storage...\n"
         " Registering 'flow 2'... Skipped (metadata unchanged)\n"
         " Building `MyModule` storage...\n"
         " Registering 'flow 3'... Done\n"
         " └── ID: new-id-3\n"
         " └── Version: 3\n"
         " Registering 'flow 4'... Error\n"
         " Traceback (most recent call last):\n"),
        (" ValueError: Oh no!\n"
         "\n"
         " Building `BadStorage` storage...\n"
         " Error building storage:\n"
         " Traceback (most recent call last):\n"),
        (" ValueError: whoops!\n"
         "\n"
         " Registering 'flow 5'... Error\n"
         " Registering 'flow 6'... Error\n"
         " Registering 'flow 7'... Done\n"
         " └── ID: new-id-7\n"
         " └── Version: 1\n"
         " Registering 'flow 8'... Skipped (metadata unchanged)\n"),
    ]

    out, err = capsys.readouterr()
    assert not err
    for part in parts:
        assert part in out
from prefect import Flow, task
from prefect.engine.flow_runner import FlowRunner
from nlps_extraction.tasks.baselines import ReadPSInput, CustomTokenizeInput
from nlps_extraction.tasks.baselines.sbert import GenerateKBSBertTask, EvaluateSBertTask
from prefect.engine.results import LocalResult

# Caching options: pickle target named after the task, checkpointing on,
# results stored under ./cache/.
# NOTE(review): `cache_args` is not passed to any task in this snippet —
# confirm whether it was meant to be applied to the tasks below.
cache_args = dict(
    target="{task_name}.pkl",
    checkpoint=True,
    result=LocalResult(dir=f"./cache/"),
)

read_input_files = ReadPSInput()
# NOTE(review): `tokenize_data` is instantiated but not used in the flow below.
tokenize_data = CustomTokenizeInput()
encode_kb_task = GenerateKBSBertTask()
evaluation_task = EvaluateSBertTask()

# Model identifier handed to both the encoding and the evaluation task.
MODEL = "sentence-transformers/all-mpnet-base-v2"

with Flow("Running S-BERT baselines") as flow:
    input_files = read_input_files()
    encoded_kb = encode_kb_task(input_files["kb"], MODEL)
    evaluation_task(input_files, encoded_kb, model_name=MODEL)

# The flow is executed immediately when this module runs.
FlowRunner(flow=flow).run()
"""Where all of the tasks for our pipeline go """ from pathlib import Path import prefect from config_vars import db_name, host, user, result_folder from prefect import task from prefect.engine.results import LocalResult from prefect.tasks.sql_server import SqlServerFetch from sql import get_manual_override_rows path = Path(__file__).resolve().parent / result_folder result_formatter = LocalResult(dir=path, location="{flow_name}/" "{scheduled_start_time:%d-%m_%H-%M-%S}/" "{task_full_name}-{task_run_id}.prefect_result") # Get our database items sql_task = SqlServerFetch(db_name=db_name, user=user, host=host, query=get_manual_override_rows, fetch='many', fetch_count=3, result=result_formatter, name="SQL-stuff" # commit: bool = False, )
def test_non_keyed_states_are_hydrated_correctly_with_retries(
        monkeypatch, tmpdir):
    """
    Ensures that retries longer than 10 minutes properly "hydrate" upstream states
    so that mapped tasks retry correctly - for mapped tasks, even non-data
    dependencies can affect the number of children spawned.
    """

    @prefect.task
    def return_list():
        return [1, 2, 3]

    # Raises until the context's `task_run_count` reaches 2, then returns 100.
    @prefect.task(max_retries=1, retry_delay=datetime.timedelta(minutes=20))
    def fail_once():
        if prefect.context.get("task_run_count", 0) < 2:
            raise SyntaxError("bad")
        else:
            return 100

    flow_run_id = str(uuid.uuid4())
    task_run_id_1 = str(uuid.uuid4())
    task_run_id_2 = str(uuid.uuid4())

    with prefect.Flow(name="test-retries", result=LocalResult(dir=tmpdir)) as flow:
        # `return_list` is only an upstream (non-data) dependency of the map.
        t1 = fail_once.map(upstream_tasks=[return_list])

    monkeypatch.setattr("requests.Session", MagicMock())
    monkeypatch.setattr("requests.post", MagicMock())

    client = MockedCloudClient(
        flow_runs=[FlowRun(id=flow_run_id)],
        task_runs=[
            TaskRun(id=task_run_id_1, task_slug=flow.slugs[t1], flow_run_id=flow_run_id),
            TaskRun(
                id=task_run_id_2,
                task_slug=flow.slugs[return_list],
                flow_run_id=flow_run_id,
            ),
        ] + [
            TaskRun(id=str(uuid.uuid4()), task_slug=flow.slugs[t], flow_run_id=flow_run_id)
            for t in flow.tasks if t not in [t1, return_list]
        ],
        monkeypatch=monkeypatch,
    )

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    assert client.flow_runs[flow_run_id].state.is_running()
    assert client.task_runs[task_run_id_1].state.is_mapped()
    assert client.task_runs[task_run_id_2].state.is_successful()

    # there should be a total of 4 task runs corresponding to each mapped task
    assert (len([
        tr for tr in client.task_runs.values() if tr.task_slug == flow.slugs[t1]
    ]) == 4)

    # every mapped child of t1 (map_index != -1) should be retrying
    assert all([
        isinstance(tr.state, Retrying) for tr in client.task_runs.values()
        if (tr.task_slug == flow.slugs[t1] and tr.map_index != -1)
    ])

    # RUN A SECOND TIME with an artificially updated start time
    # and remove all in-memory data
    for idx, tr in client.task_runs.items():
        if tr.task_slug == flow.slugs[t1] and tr.map_index != -1:
            tr.state.start_time = pendulum.now("UTC")

    for idx, tr in client.task_runs.items():
        tr.state._result.value = None

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    # The second run must not spawn additional children for t1.
    assert (len([
        tr for tr in client.task_runs.values() if tr.task_slug == flow.slugs[t1]
    ]) == 4)
    assert all(tr.state.is_successful() for tr in client.task_runs.values())
from typing import Dict, List

import numpy as np
import pandas as pd
import prefect
from prefect import task
from prefect.engine.results import LocalResult

from .teams import Team


@task(
    name='Get Team Rosters',
    result=LocalResult(
        location=
        "{output_directory}/{date:%m}-{date:%d}-{date:%Y}/team_rosters.prefect"
    ),
    checkpoint=True,
)
def get_team_rosters(season: int, teams: List[Team]) -> Dict[str, List[str]]:
    """Fetch the roster of every team through the league held in the Prefect context.

    The work is delegated to ``prefect.context.league.get_team_rosters``.

    Args:
        season (int): season the rosters are requested for
        teams (List[Team]): teams of the league to look up

    Returns:
        Dict[str, List[str]]: roster name mapped to the list of player names
    """
    league = prefect.context.league
    rosters = league.get_team_rosters(season=season, teams=teams)
    return rosters
def test_states_are_hydrated_correctly_with_retries(monkeypatch, tmpdir):
    """
    Ensures that retries longer than 10 minutes properly "hydrate" upstream states
    so that mapped tasks retry correctly.
    """

    flow_run_id = str(uuid.uuid4())
    task_run_id_1 = str(uuid.uuid4())
    task_run_id_2 = str(uuid.uuid4())

    with prefect.Flow(name="test-retries", result=LocalResult(dir=tmpdir)) as flow:
        t1 = plus_one.map([-1, 0, 1])
        t2 = invert_fail_once.map(t1)

    # Retry settings are applied to the mapped task after the flow is built.
    t2.max_retries = 1
    t2.retry_delay = datetime.timedelta(minutes=100)

    monkeypatch.setattr("requests.Session", MagicMock())
    monkeypatch.setattr("requests.post", MagicMock())

    client = MockedCloudClient(
        flow_runs=[FlowRun(id=flow_run_id)],
        task_runs=[
            TaskRun(id=task_run_id_1, task_slug=flow.slugs[t1], flow_run_id=flow_run_id),
            TaskRun(id=task_run_id_2, task_slug=flow.slugs[t2], flow_run_id=flow_run_id),
        ] + [
            TaskRun(id=str(uuid.uuid4()), task_slug=flow.slugs[t], flow_run_id=flow_run_id)
            for t in flow.tasks if t not in [t1, t2]
        ],
        monkeypatch=monkeypatch,
    )

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    assert client.flow_runs[flow_run_id].state.is_running()
    assert client.task_runs[task_run_id_1].state.is_mapped()
    assert client.task_runs[task_run_id_2].state.is_mapped()

    # there should be a total of 4 task runs corresponding to each mapped task
    for t in [t1, t2]:
        assert (len([
            tr for tr in client.task_runs.values() if tr.task_slug == flow.slugs[t]
        ]) == 4)

    # t2's first child task should be retrying
    t2_0 = next(tr for tr in client.task_runs.values()
                if tr.task_slug == flow.slugs[t2] and tr.map_index == 0)
    assert isinstance(t2_0.state, Retrying)

    # RUN A SECOND TIME with an artificially updated start time
    # and remove all in-memory data
    failed_id = [
        t_id for t_id, tr in client.task_runs.items()
        if tr.task_slug == flow.slugs[t2] and tr.map_index == 0
    ].pop()
    client.task_runs[failed_id].state.start_time = pendulum.now("UTC")

    for idx, tr in client.task_runs.items():
        tr.state._result.value = None

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    # t2's first child task should be successful
    t2_0 = next(tr for tr in client.task_runs.values()
                if tr.task_slug == flow.slugs[t2] and tr.map_index == 0)
    assert t2_0.state.is_successful()
from prefect import Flow, task, unmapped, Parameter
from prefect.engine.results import LocalResult
from prefect.engine.executors import LocalDaskExecutor, DaskExecutor
from prefect.engine.cache_validators import all_parameters

# Flow-level result; the location template uses the flow name, task name and
# the `x` / `y` values.
lr = LocalResult(location="{flow_name}-{task_name}-{x}-{y}.pkl",
                 validators=all_parameters)


@task(log_stdout=True, checkpoint=True)
def add(x, y):
    """Return ``sum(x) + y``, or ``x + y`` when that raises ``TypeError``."""
    print(f"add ran with {x} {y}")
    try:
        return sum(x) + y
    except TypeError:
        # Fallback for the case where the first form raised TypeError.
        return x + y


with Flow("iterated map", result=lr) as flow:
    y = unmapped(Parameter("y", default=7))
    x = Parameter("x", default=[1, 2, 3])
    # First `add` is mapped over `x`; the second is called once on the mapped output.
    mapped_result = add.map(x, y=y)
    out = add(mapped_result, y)

if __name__ == "__main__":
    flow.run(executor=DaskExecutor())
# %% # ------------------------------------------------------- # Pipeline scehduler # ------------------------------------------------------- schedule = IntervalSchedule(interval=dt.timedelta(days=30)) # %% # ------------------------------------------------------- # Build pipeline # ------------------------------------------------------- with Flow(name='malaysia_bank_card_scraping_flow', result=LocalResult(dir="result_config")) as flow: # Step 1: Compile a list of bank names for credit cards. ls_banks_for_card = name_scraping.compile_bank_names_for_card(URL_CARD, '''/html/body/main/section/form/label/select''') # Step 2: Compile a list of credit cards for each bank. dict_cards = name_scraping.compile_credit_cards( upstream_tasks=[ls_banks_for_card], ls_banks=ls_banks_for_card, xpath='''/html/body/main/section/ul''', ) # Step 3: Run the scrapers. df_card = card_scraping.card_scraping_procedure( upstream_tasks=[ls_banks_for_card, dict_cards], url=URL_CARD,
def cat(i: int, result=LocalResult(dir=path)):
    """Log ``i`` at debug level via the context logger and return "nine lives".

    Args:
        i: value passed to the logger's ``debug`` call.
        result: not read by the body; its default is a ``LocalResult`` built
            once, when the function is defined.

    Returns:
        The constant string "nine lives".
    """
    prefect.context.get("logger").debug(i)
    return "nine lives"
# Dataset locations for the full rico_sca splits, read from `settings`.
train_path = settings["rico_sca"]["train"]
dev_path = settings["rico_sca"]["dev"]
test_path = settings["rico_sca"]["test"]

# Alternative sample datasets, currently disabled:
# train_path = settings["sample_rico_sca"]
# dev_path = settings["sample_rico_sca"]
# # test_path = settings["sample_rico_sca"]

# train_path = settings["rico_sca_sample"]["train"]
# dev_path = settings["rico_sca_sample"]["dev"]
# test_path = settings["rico_sca_sample"]["test"]

# Caching options: pickle target named after task name and tags,
# checkpointing on, results stored under ./cache/datasets/rico/.
# NOTE(review): `cache_args` is not used in the visible part of this script —
# confirm where it is applied.
cache_args = dict(
    target="{task_name}-{task_tags}.pkl",
    checkpoint=True,
    result=LocalResult(dir=f"./cache/datasets/rico/"),
)

prepare_rico_task = PrepareRicoScaPair()
prepare_rico_layout_lm_task = PrepareLayoutLMPairTask()
layout_lm_trainer_task = LayoutLMPair()

INSTRUCTION_TYPE = [2]
# where: 0 and 3 - Lexical Matching
# 1 - Spatial (Relative to screen)
# 2 - Spatial (Relative to other elements)

with Flow("Running the Transformers for Pair Classification") as flow1:
    with tags("train"):
        train_input = prepare_rico_task(train_path,
                                        type_instructions=INSTRUCTION_TYPE)
import prefect
from prefect import task, Flow
from prefect.engine.results import LocalResult

# Checkpointing toggle, currently disabled:
# prefect.config.flows.checkpointing = True


@task(result=LocalResult(location="test.prefect"))
def test():
    """Print a greeting and return 1."""
    print("Hello!")
    return 1


with Flow("test") as flow:
    test()

# The flow is executed immediately when this module runs.
flow.run()
def test_build_and_register(self, capsys, monkeypatch, force):
    """Build and register a few flows:
    - 1 new flow
    - 1 updated flow
    - 1 skipped flow
    - 1 error during registration
    - 2 sharing the same storage (which fails to build properly)
    """
    build_call_count = 0

    # Storage whose `build` only counts how often it is invoked.
    class MyModule(Module):
        def build(self):
            nonlocal build_call_count
            build_call_count += 1

    # Storage whose `build` always fails.
    class BadStorage(Module):
        def build(self):
            raise ValueError("whoops!")

    client = MagicMock()
    # One GraphQL response per expected query, in call order.
    client.graphql.side_effect = [
        GraphQLResult({"data": {"flow": []}}),
        GraphQLResult({"data": {"flow": [{"id": "old-id-2", "version": 1}]}}),
        GraphQLResult({"data": {"flow": [{"id": "old-id-3", "version": 2}]}}),
        GraphQLResult({"data": {"flow": [{"id": "old-id-4", "version": 3}]}}),
    ]
    # One entry per expected `register` call; the fourth call raises.
    client.register.side_effect = [
        "new-id-1",
        "old-id-2",
        "new-id-3",
        ValueError("Oh no!"),
    ]

    storage1 = MyModule("testing")
    storage1.result = LocalResult()
    flow1 = Flow("flow 1", storage=storage1, run_config=UniversalRun(labels=["a"]))
    flow2 = Flow(
        "flow 2",
        storage=MyModule("testing"),
        environment=LocalEnvironment(labels=["a"]),
    )
    # flow3 and flow4 share one storage object.
    storage2 = MyModule("testing")
    flow3 = Flow("flow 3", storage=storage2)
    flow4 = Flow("flow 4", storage=storage2)
    # flow5 and flow6 share the storage that fails to build.
    storage3 = BadStorage("testing")
    flow5 = Flow("flow 5", storage=storage3)
    flow6 = Flow("flow 6", storage=storage3)
    flows = [flow1, flow2, flow3, flow4, flow5, flow6]

    stats = build_and_register(
        client, flows, "testing", labels=["b", "c"], force=force
    )

    # 3 calls (one for each unique `MyModule` storage object)
    assert build_call_count == 3
    # 4 register calls (6 - 2 that failed to build storage)
    assert client.register.call_count == 4
    for flow, (args, kwargs) in zip(flows, client.register.call_args_list):
        assert not args
        assert kwargs["flow"] is flow
        assert kwargs["project_name"] == "testing"
        assert kwargs["build"] is False
        assert kwargs["no_url"] is True
        # With `force`, no idempotency key is sent; otherwise one must be set.
        if force:
            assert kwargs["idempotency_key"] is None
        else:
            assert kwargs["idempotency_key"]

    # Stats are recorded properly
    assert dict(stats) == {"registered": 2, "skipped": 1, "errored": 3}

    # Flows are properly configured
    assert flow1.result is storage1.result
    assert flow1.run_config.labels == {"a", "b", "c"}
    assert flow2.environment.labels == {"a", "b", "c"}
    assert isinstance(flow3.run_config, UniversalRun)
    assert flow3.run_config.labels == {"b", "c"}
    assert isinstance(flow4.run_config, UniversalRun)
    assert flow4.run_config.labels == {"b", "c"}

    # The output contains a traceback, which will vary between machines
    # We only check that the following fixed sections exist in the output
    parts = [
        (
            " Building `MyModule` storage...\n"
            " Registering 'flow 1'... Done\n"
            " └── ID: new-id-1\n"
            " └── Version: 1\n"
            " Building `MyModule` storage...\n"
            " Registering 'flow 2'... Skipped\n"
            " Building `MyModule` storage...\n"
            " Registering 'flow 3'... Done\n"
            " └── ID: new-id-3\n"
            " └── Version: 3\n"
            " Registering 'flow 4'... Error\n"
            " Traceback (most recent call last):\n"
        ),
        (
            " ValueError: Oh no!\n"
            "\n"
            " Building `BadStorage` storage...\n"
            " Error building storage:\n"
            " Traceback (most recent call last):\n"
        ),
        (
            " ValueError: whoops!\n"
            "\n"
            " Registering 'flow 5'... Error\n"
            " Registering 'flow 6'... Error\n"
        ),
    ]

    out, err = capsys.readouterr()
    assert not err
    for part in parts:
        assert part in out
from typing import Optional
from datetime import datetime
from pathlib import Path
from prefect import task, Flow
from prefect.engine.results import LocalResult
from pyspark.sql.session import SparkSession
from data_source.prefect.tasks import constant
from data_source import catalog
from data_source.core import entry_key_str

# pylint: disable=no-value-for-parameter


@task(
    target="{flow_name}/{task_name}",
    checkpoint=True,
    result=LocalResult(dir="~/.prefect"),
)
def download(ftp_dir, csv_dir, n_mgrel_files):
    """Download ``MGREL_1.csv.gz`` … ``MGREL_<n>.csv.gz`` into ``csv_dir``.

    Args:
        ftp_dir: base URL; each file name is appended after a ``/``.
        csv_dir: local destination directory; created (with parents) when missing.
        n_mgrel_files: number of files to fetch, numbered from 1.

    Returns:
        Path: ``csv_dir`` as a ``pathlib.Path``.
    """
    # NOTE(review): `fsspec` is used below but is not among the imports visible
    # in this file — confirm it is imported, otherwise this raises NameError.
    csv_dir = Path(csv_dir)
    # `exist_ok=True` already covers an existing directory, so no separate
    # existence check is needed.
    csv_dir.mkdir(parents=True, exist_ok=True)

    for i in range(n_mgrel_files):
        filename = f'MGREL_{i + 1}.csv.gz'
        path = str(csv_dir / filename)
        url = ftp_dir + '/' + filename
        of = fsspec.open(url)
        of.fs.download(url, path)

    return csv_dir
from prefect import Task, Flow, task
from prefect.engine.results import LocalResult


# The target is a callable: it returns the `task_run_count` keyword argument
# as a string.
@task(target=lambda **kwargs: str(kwargs['task_run_count']))
def get_data():
    """Return the constant string "data"."""
    return "data"


@task
def print_data(data):
    """Print ``data``."""
    print(data)


with Flow("using-targets", result=LocalResult(), ) as flow:
    data = get_data()
    print_data(data)

# The flow is executed immediately when this module runs.
flow.run()
from prefect.engine.results import LocalResult

# NOTE(review): `task`, `Flow` and `Parameter` are used below but are not
# imported in the visible part of this file — confirm the import exists above.


# The target template is formatted from the `val` entry of `parameters`.
@task(target="{parameters[val]}")
def get_data(val):
    """Return ``val`` wrapped in a one-element list."""
    return [val]


@task
def print_data(data):
    """Print ``data``."""
    print(data)


with Flow(
    "using-targets",
    result=LocalResult(),
) as flow:
    val = Parameter("val", default="asdf")
    data = get_data(val)
    print_data(data)

# The flow is executed immediately when this module runs.
flow.run()
# flow.register(project_name="Demo")

# class GetData(Task):
#     def run(self):
#         print(1)

# GetData()
# get_data()