def test_add_flow_to_github_storage():
    """Adding a flow makes its name a member of the storage and returns the path."""
    flow = Flow("test")
    gh_storage = GitHub(repo="test/repo", path="flow.py")

    # Nothing has been added yet.
    assert flow.name not in gh_storage

    location = gh_storage.add_flow(flow)
    assert location == "flow.py"
    assert flow.name in gh_storage
def test_serialize_github_storage():
    """The serialized storage carries its type, repo, path and secrets."""
    payload = GitHub(repo="test/repo", path="flow.py", secrets=["auth"]).serialize()

    expected_fields = {
        "type": "GitHub",
        "repo": "test/repo",
        "path": "flow.py",
        "secrets": ["auth"],
    }
    for key, value in expected_fields.items():
        assert payload[key] == value
def test_github_access_token_errors_if_provided_and_not_found(monkeypatch):
    """Building the client raises when the named token secret is not in context."""
    github_spy = MagicMock(wraps=github.Github)
    monkeypatch.setattr("github.Github", github_spy)

    gh_storage = GitHub(
        repo="test/repo",
        path="flow.py",
        access_token_secret="MISSING",
    )

    # No secrets are available, so looking up "MISSING" has to fail.
    with context(secrets={}), pytest.raises(Exception, match="MISSING"):
        gh_storage._get_github_client()
def test_get_flow_missing_repo(github_client, caplog):
    """An unknown repo propagates the GitHub error and logs a 'not found' message."""
    not_found = github.UnknownObjectException(404, {})
    github_client.get_repo.side_effect = not_found

    gh_storage = GitHub(repo="test/repo", path="flow.py")
    gh_storage.add_flow(Flow("test"))

    with pytest.raises(github.UnknownObjectException):
        gh_storage.get_flow("test")

    expected_log = "Repo 'test/repo' not found."
    assert expected_log in caplog.text
def test_add_flow_to_github_already_added():
    """Adding the same flow a second time raises ``ValueError``."""
    flow = Flow("test")
    gh_storage = GitHub(repo="test/repo", path="flow.py")
    assert flow.name not in gh_storage

    # The first add succeeds and reports the storage path.
    first_location = gh_storage.add_flow(flow)
    assert first_location == "flow.py"
    assert flow.name in gh_storage

    # The second add of the very same flow is rejected.
    with pytest.raises(ValueError):
        gh_storage.add_flow(flow)
def test_create_github_storage_init_args():
    """Constructor arguments are exposed unchanged as attributes."""
    gh_storage = GitHub(
        repo="test/repo",
        path="flow.py",
        ref="my_branch",
        secrets=["auth"],
    )

    assert gh_storage
    # A freshly created storage holds no flows.
    assert gh_storage.flows == {}
    assert gh_storage.repo == "test/repo"
    assert gh_storage.path == "flow.py"
    assert gh_storage.ref == "my_branch"
    assert gh_storage.secrets == ["auth"]
def test_github_client_property(monkeypatch):
    """``_github_client`` builds a client from the GITHUB_ACCESS_TOKEN secret."""
    # Named so it does not shadow a module called ``github``.
    github_cls_mock = MagicMock()
    monkeypatch.setattr("prefect.utilities.git.Github", github_cls_mock)

    gh_storage = GitHub(repo="test/repo", path="flow.py")

    token = "ACCESS_TOKEN"
    with context(secrets={"GITHUB_ACCESS_TOKEN": token}):
        client = gh_storage._github_client

    assert client
    github_cls_mock.assert_called_with("ACCESS_TOKEN")
def test_github_access_token_secret(monkeypatch, secret_name, secret_arg):
    """The token stored under ``secret_name`` is passed to the GitHub client.

    ``secret_name`` and ``secret_arg`` are supplied from outside this
    function (their source is not visible in this block).
    """
    real_github_cls = github.Github
    github_spy = MagicMock(wraps=real_github_cls)
    monkeypatch.setattr("github.Github", github_spy)

    gh_storage = GitHub(
        repo="test/repo",
        path="flow.py",
        access_token_secret=secret_arg,
    )

    with context(secrets={secret_name: "TEST-VAL"}):
        client = gh_storage._get_github_client()

    # The spy wraps the real class, so a genuine client instance comes back.
    assert isinstance(client, real_github_cls)
    positional_args = github_spy.call_args[0]
    assert positional_args[0] == "TEST-VAL"
def test_get_flow_github(monkeypatch):
    """``get_flow`` fails without a path and returns a runnable flow with one."""
    flow = Flow("test")

    monkeypatch.setattr("prefect.utilities.git.Github", MagicMock())
    monkeypatch.setattr(
        "prefect.storage.github.extract_flow_from_file",
        MagicMock(return_value=flow),
    )

    # Storage created without a path cannot fetch a flow.
    with pytest.raises(ValueError):
        pathless_storage = GitHub(repo="test/repo")
        pathless_storage.get_flow()

    gh_storage = GitHub(repo="test/repo", path="flow")
    assert flow.name not in gh_storage

    location = gh_storage.add_flow(flow)
    fetched_flow = gh_storage.get_flow(location, ref="my_branch")
    assert fetched_flow.run()
def test_get_flow_missing_file(github_client, ref, caplog):
    """A missing file propagates the GitHub error and logs file, repo and ref."""
    mocked_repo = github_client.get_repo.return_value
    mocked_repo.get_contents.side_effect = github.UnknownObjectException(404, {})

    gh_storage = GitHub(repo="test/repo", path="flow.py", ref=ref)
    gh_storage.add_flow(Flow("test"))

    # With no explicit ref, the log message is expected to mention "main".
    ref = ref or "main"

    with pytest.raises(github.UnknownObjectException):
        gh_storage.get_flow("test")

    expected_log = f"File 'flow.py' not found in repo 'test/repo', ref {ref!r}"
    assert expected_log in caplog.text
def test_github_base_url(monkeypatch):
    """A custom ``base_url`` is forwarded to the GitHub client constructor."""
    real_github_cls = github.Github
    github_spy = MagicMock(wraps=real_github_cls)
    monkeypatch.setattr("github.Github", github_spy)

    gh_storage = GitHub(
        repo="test/repo",
        path="flow.py",
        access_token_secret="TEST",
        base_url="https://some-url",
    )

    with context(secrets={"TEST": "TEST-VAL"}):
        client = gh_storage._get_github_client()

    assert isinstance(client, real_github_cls)
    # call_args unpacks into (positional args, keyword args).
    _, call_kwargs = github_spy.call_args
    assert call_kwargs["base_url"] == "https://some-url"
def test_get_flow_github(monkeypatch):
    """``get_flow`` passes the flow name to the file extractor and returns its flow."""
    flow = Flow("test")

    monkeypatch.setattr("prefect.utilities.git.Github", MagicMock())
    extractor_mock = MagicMock(return_value=flow)
    monkeypatch.setattr(
        "prefect.storage.github.extract_flow_from_file", extractor_mock
    )

    gh_storage = GitHub(repo="test/repo", path="flow", ref="my_branch")
    assert flow.name not in gh_storage
    gh_storage.add_flow(flow)

    fetched_flow = gh_storage.get_flow(flow.name)

    # call_args unpacks into (positional args, keyword args).
    _, extract_kwargs = extractor_mock.call_args
    assert extract_kwargs["flow_name"] == flow.name
    assert fetched_flow.run()
def test_get_flow(github_client, ref, caplog):
    """``get_flow`` resolves the ref to a commit, downloads the file and logs it.

    ``github_client``, ``ref`` and ``caplog`` are supplied from outside this
    function (their definitions are not visible in this block).
    """
    storage = GitHub(repo="test/repo", path="flow.py", ref=ref)
    storage.add_flow(Flow("test"))

    f = storage.get_flow("test")
    assert github_client.get_repo.call_args[0][0] == "test/repo"
    repo = github_client.get_repo.return_value

    # Parenthesised on purpose: ``x == ref or "main"`` parses as
    # ``(x == ref) or "main"``, which is always truthy and checks nothing.
    # NOTE(review): assumes the mocked repo's default branch is "main" when
    # no ref is given - confirm against the github_client fixture.
    assert repo.get_commit.call_args[0][0] == (ref or "main")
    assert repo.get_contents.call_args[0][0] == "flow.py"
    assert repo.get_contents.call_args[1]["ref"] == "mycommitsha"

    assert f.name == "test"
    state = f.run()
    assert state.is_successful()

    # The ref is only expected in the log line when one was given explicitly.
    msg = "Downloading flow from GitHub storage - repo: 'test/repo', path: 'flow.py'"
    if ref is not None:
        msg += f", ref: {ref!r}"
    assert msg in caplog.text
    assert "Flow successfully downloaded. Using commit: mycommitsha" in caplog.text
# flows/my_flow.py
from prefect import task, Flow
from prefect.executors.dask import DaskExecutor
from prefect.storage import GitHub
from prefect.run_configs import KubernetesRun


@task
def get_data():
    """Return the integers 1 through 5 as a list."""
    return list(range(1, 6))


@task
def print_data(data):
    """Print ``data`` to stdout."""
    print(data)


with Flow(
    "file-based-flow",
    executor=DaskExecutor("tcp://dask-scheduler:8786"),
    run_config=KubernetesRun(),
    storage=GitHub(
        repo="pheadra/prefect",  # name of repo
        path="flows/my_flow.py",  # location of flow file in repo
        ref="main",
    ),
) as flow:
    data = get_data()
    print_data(data)
"""Multiply the input by 10""" return datum * 10 @task def load(data): """Print the data to indicate it was received""" print("Here's your data: {}".format(data)) # Some configuration is required, see https://docs.prefect.io/orchestration/flow_config/overview.html with Flow( "ETL", storage=GitHub( repo="dylanbhughes/pgr_examples_2", path="my_flow.py", secrets=["GITHUB_ACCESS_TOKEN"], ref=os.environ["PREFECT_FLOW_BRANCH_NAME"], ), run_config=DockerRun( image="prefecthq/prefect:latest", labels=[os.environ["PREFECT_FLOW_LABEL"]], env={ "PREFECT_FLOW_BRANCH_NAME": os.environ["PREFECT_FLOW_BRANCH_NAME"], "PREFECT_FLOW_LABEL": os.environ["PREFECT_FLOW_LABEL"], "PREFECT_PROJECT_NAME": os.environ["PREFECT_PROJECT_NAME"], }, ), executor=LocalDaskExecutor(scheduler="threads", num_workers=3), ) as flow: e = extract()
project_name=prefect_project_name) } # Create a prefect's flow object with some configuration flow_nwp_00 = create_flow_download(run=00, **settings) flow_nwp_12 = create_flow_download(run=12, **settings) flow_list = [flow_nwp_00, flow_nwp_12] for flow in flow_list: # Configure how this code will be passed to the prefect agents # In this case, prefect will get this file from github repo_ref = os.getenv("DATAFETCH__STORAGE__REPO__REF", default="master") print(f"Registering Using GitHub repo ref {repo_ref}") flow.storage = GitHub( repo="steph-ben/datafetch-config", # name of repo ref=repo_ref, path="projects/gfs/fetch.py", # location of flow file in repo secrets=["GITHUB_ACCESS_TOKEN"] # name of personal access token secret ) # Configure how this code will be executed # In this case, prefect will run this inside a docker container flow.run_config = DockerRun(image="stephben/datafetch") # Configure where tasks status will be stored # See : # - https://docs.prefect.io/core/concepts/results.html # - https://docs.prefect.io/core/advanced_tutorials/using-results.html flow.result = PrefectResult() if __name__ == "__main__": show_prefect_cli_helper(flow_list=flow_list)
schedule = IntervalSchedule(interval=timedelta(minutes=30))


def _child_run_name(**kwargs):
    """Build the display name for one mapped child run.

    The name is the ordinal of the mapped index ``i`` followed by an emoji.
    The emoji at position ``i`` is used when ``i`` is a valid index; otherwise
    a random one is picked.
    """
    index = kwargs["i"]
    # Strictly less-than: an index equal to len(emojis) is already out of
    # range. NOTE(review): assumes ``emojis`` is a 0-based sequence - confirm.
    if not index < len(emojis):
        index = random.randint(0, len(emojis) - 1)
    return emojize(
        f"{convert(kwargs['i'], to='ordinal')} child {emojis[index]}",
        use_aliases=True,
        variant="emoji_type",
    )


with Flow("Configurable Mapper", schedule=schedule) as flow:
    count = Parameter("count", default=10)
    sleep_length = Parameter("sleep_length", default=10)

    i = CreateIterable()(count=count)

    # One mapped child per element of ``i``; ``sleep_length`` is shared.
    node1_1 = Node(
        name="Mapped Node",
        task_run_name=_child_run_name,
    ).map(i=i, sleep_length=unmapped(sleep_length))

flow.environment = LocalEnvironment(
    labels=[],
    executor=LocalDaskExecutor(scheduler="threads", num_workers=6),
)

flow.storage = GitHub(
    repo="znicholasbrown/project-schematics",
    path="flows/Configurable_Mapper.py",
    access_token_secret="NICHOLAS_GITHUB_ACCESS",
)

# flow.run(run_on_schedule=False)
flow.register(project_name="PROJECT: Schematics")
def test_create_github_storage():
    """A storage built from repo and path is truthy and has a logger."""
    gh_storage = GitHub(repo="test/repo", path="flow.py")

    assert gh_storage
    assert gh_storage.logger
from prefect.tasks.prefect.flow_run import StartFlowRun

# Task that starts a run of the "Skip Flow" flow in project "Jenny".
new_flow = StartFlowRun(project_name="Jenny", flow_name="Skip Flow")


@task(name="")
def sleep_for_x(x):
    """Sleep for ``x`` seconds, then create a link artifact."""
    time.sleep(x)
    prefect.artifacts.create_link("ftp://ftp-server/my-file.csv")


with Flow(name="Start Flow") as flow:
    x = Parameter("x", default=22, required=True)
    sleep_for_x(x)
    new_flow()

flow.run_config = KubernetesRun(cpu_request=2, memory_request="2Gi")
# flow.run_config = LocalRun(
#     labels=['runConfig']
# )

flow.storage = GitHub(
    repo="bestdan/pifect",  # name of repo
    path="src/childFlow.py",  # location of flow file in repo
    access_token_secret="Jen_Github_token",  # name of personal access token secret
)

flow.register("Jenny")
@prefect.task
def processing1(fp: str):
    """Log that the first processing step is handling ``fp``."""
    prefect.context.get("logger").info(f"Doing some processing1 on {fp} ...")


@prefect.task
def processing2(fp: str):
    """Log that the second processing step is handling ``fp``."""
    prefect.context.get("logger").info(f"Doing some processing2 on {fp} ...")


with prefect.Flow("gfs-post-processing", result=PrefectResult()) as flow:
    fp = prefect.Parameter("fp")

    p1 = processing1(fp)
    p2 = processing2(fp)
    # No data passes between the steps; this only orders p2 after p1.
    p2.set_upstream(p1)


# The git ref can be overridden through the environment; default is "master".
repo_ref = os.environ.get("DATAFETCH__STORAGE__REPO__REF", "master")
print(f"Registering Using GitHub repo ref {repo_ref}")

flow.storage = GitHub(
    repo="steph-ben/datafetch-config",
    ref=repo_ref,
    path="projects/gfs/post_process.py",
    secrets=["GITHUB_ACCESS_TOKEN"],
)
flow.run_config = DockerRun()


if __name__ == "__main__":
    from datafetch.utils import show_prefect_cli_helper

    show_prefect_cli_helper(flow_list=[flow])
# schedule = IntervalSchedule(interval=datetime.timedelta(minutes=1))
# Schedule disabled to test faster; with this interval schedule the flow
# would run every 1 minute.
########
# Flows can have state handlers too, so there are two kinds of state
# handlers: task state handlers and flow state handlers.
with Flow("my etl flow", state_handlers=[failed_alert]) as f:
    db_table = create_table()
    raw = get_complaint_data()
    # Functional API: the dependency is implied by passing one task's output
    # as another task's input.
    parsed = parse_complaint_data(raw)
    populate_table = store_complaints(parsed)
    # Imperative API: the dependency is declared explicitly.
    populate_table.set_upstream(db_table)

# Storage must be attached BEFORE registering; assigning it afterwards leaves
# the already-registered flow version without the GitHub storage.
# NOTE(review): the path starts with "/" - confirm GitHub resolves it as
# intended relative to the repository root.
f.storage = GitHub(
    repo="ashwani021994/Learning-ML",  # name of repo
    path="/test_code_2.py",
)
f.register(project_name="tutorial1")

# Notes:
# 1.) TriggerFailed is a subclass of the Failed class; it occurs when an
#     upstream task fails.
# 2.) We can explicitly make a task end with a final status of FAIL or SUCCESS
#     by using Prefect signals, instead of raising an exception as done above.
# 3.) LocalResultHandler places the task's result in the /prefect/results
#     directory (I was not able to find the prefect folder); the data written
#     is a pickle file.
# 4.) Every time we call f.register a new version of the flow is generated and
#     the older version appears in the archive.
# 5.) Another advantage of using the cache on the server is that the result
#     handler writes data to the database. If the process dies midway locally,
#     the cache is gone from memory; if the same happens on the server, the
#     cache can be retrieved from the results in the database.
cloud = StartFlowRun( flow_name="ETL - Docker", project_name="PGR Examples", ) local = StartFlowRun( flow_name="ETL - Local", project_name="PGR Examples", ) with Flow( "Orchestrator Flow", storage=GitHub( repo="dylanbhughes/pgr_examples_3", path="orchestrator.py", secrets=["GITHUB_ACCESS_TOKEN"], ), run_config=DockerRun(image="prefecthq/prefect:latest", labels=["pgr docker"]), executor=LocalDaskExecutor(scheduler="threads", num_workers=3), ) as flow: input_string = Parameter(name="input_string", required=True) manual_switch = Parameter(name="cloud_or_local", required=False, default=None) cloud_or_local_result = run_locally_or_in_cloud( input_string=input_string, manual_switch=manual_switch) switch( cloud_or_local_result,
@task
def first_task():
    """Log one normal and one sensitive message, then return 1."""
    for message in ("This is normal data", "This is sensitive data"):
        my_logger(message)
    return 1


@task
def second_task():
    """Log one normal and one sensitive message, then return 1."""
    for message in ("This is normal data", "This is sensitive data"):
        my_logger(message)
    return 1


@task
def third_task():
    """Raise a FAIL signal whose message is the result of logging sensitive data."""
    from prefect.engine.signals import FAIL

    failure_message = my_logger("This is sensitive data")
    raise FAIL(message=failure_message)


with Flow(
    "Filtered Logging Demo",
    storage=GitHub(
        repo="kmoonwright/utility_flows",
        path="logging_demo/log_filter.py",
        access_token_secret="GITHUB_ACCESS_TOKEN",
    ),
) as flow:
    # Chain the three tasks strictly one after another.
    first = first_task()
    second = second_task(upstream_tasks=[first])
    third_task(upstream_tasks=[second])

flow.register(project_name="logging-demo")
@task(log_stdout=True)
def extract(input_string):
    """Print ``input_string`` and return the integers 1 through 6."""
    print(input_string)
    return list(range(1, 7))


@task
def transform(number):
    """Return ``number`` multiplied by 2."""
    return number * 2


@task
def load(numbers):
    """Print a message reporting ``numbers`` as uploaded."""
    print(f"Uploaded {numbers} to Snowflake")


with Flow(
    "ETL - Local",
    storage=GitHub(
        repo="dylanbhughes/pgr_examples_3",
        path="local_flow.py",
        secrets=["GITHUB_ACCESS_TOKEN"],
    ),
    run_config=LocalRun(labels=["pgr local"]),
    executor=LocalDaskExecutor(scheduler="threads", num_workers=3),
) as flow:
    input_string = Parameter(name="input_string", required=True)

    numbers = extract(input_string=input_string)
    # ``transform`` is mapped over the extracted list, one run per element.
    tranformed_numbers = transform.map(numbers)
    result = load(numbers=tranformed_numbers)
import os
from prefect import Flow, task, Parameter
from prefect.storage import GitHub
from prefect.run_configs import LocalRun


@task(log_stdout=True)
def greet(name):
    """Print a greeting for ``name``; the greeting word comes from $GREETING."""
    # Falls back to "Hello" when GREETING is not set in the environment.
    greeting = os.getenv("GREETING", "Hello")
    print(f"{greeting}, {name}!")


with Flow("test-github") as flow:
    name = Parameter("name")
    greet(name)

flow.storage = GitHub("jcrist/prefect-hacking", path="test_github.py")
flow.run_config = LocalRun(env={"GREETING": "Hello"})
name="SQL-stuff" # commit: bool = False, ) #-------------------------------------------------------------- # Flow context #-------------------------------------------------------------- with Flow("github_flow") as f: password = EnvVarSecret(prefect.config.sql_server.password_var) logger = prefect.context.get("logger") thing = Parameter("thing", default=["Thing 1"]) d = dog(thing) s = sql_task(password=password) v = view_sql(s) #-------------------------------------------------------------- # Closing Details #-------------------------------------------------------------- f.run_config = LocalRun(env={ "PREFECT__USER_CONFIG_PATH": '/Users/peytonrunyan/TRP/prefect/config.toml' }) f.storage = GitHub(repo="peyton-trp/prefect-test", path="simple_flow.py", secrets=["GITHUB_ACCESS_TOKEN"]) f.register("cat_flow")
def build_example(path):
    """Build an example located at a specific path.

    The example is executed twice: once in-process via ``exec`` to collect the
    ``Flow`` objects it defines, and once as a subprocess to capture its
    stdout for the rendered page.

    Args:
        - path (str): the path to the example source file.

    Returns:
        - markdown (str): the rendered example in markdown
        - flows (Dict[str, dict]): the serialized form of each flow found in
            the example, keyed by flow name

    Raises:
        - ValueError: if the header cannot be extracted (no module docstring,
            or no statement following it)
        - subprocess.CalledProcessError: if running the example as a script
            exits with a non-zero status
    """
    from prefect import Flow
    from prefect.storage import GitHub
    from prefect.run_configs import UniversalRun

    # Use the current commit (if specified in the environment)
    ref = os.getenv("GIT_SHA", "master")

    with open(path, "r", encoding="utf-8") as source_file:
        contents = source_file.read()

    # NOTE: ``exec`` runs the example's code in this process. Only call this
    # on trusted example files.
    namespace = {}
    exec(contents, namespace)

    try:
        header = namespace["__doc__"]
        tree = ast.parse(contents)
        # Line of the first statement after the module docstring; everything
        # from there on is shown as the example's source.
        offset = tree.body[1].lineno - 1
    except Exception as exc:
        raise ValueError(f"No docstring header found for example at {path}") from exc

    flows = {}
    relpath = os.path.relpath(path, start=ROOT)
    for obj in namespace.values():
        if isinstance(obj, Flow):
            obj.storage = GitHub("PrefectHQ/prefect", path=relpath, ref=ref)
            if not obj.run_config:
                obj.run_config = UniversalRun()
            obj.run_config.labels.add("prefect-examples")
            flows[obj.name] = obj.serialize(build=True)

    source = "\n".join(contents.splitlines()[offset:]).strip()

    # ``env=`` replaces the child's whole environment, so start from the
    # current one and only override the logging format.
    child_env = {
        **os.environ,
        "PREFECT__LOGGING__FORMAT": "%(levelname)s | %(message)s",
    }
    res = subprocess.run(
        [sys.executable, path],
        capture_output=True,
        check=True,
        env=child_env,
    )
    output = res.stdout.decode("utf-8").strip()

    register_lines = ["prefect register --json https://docs.prefect.io/examples.json"]
    register_lines.extend(f"    --name {name!r}" for name in sorted(flows))
    register_lines.append("    --project 'Prefect Examples'")

    rendered = EXAMPLE_TEMPLATE.format(
        header=header,
        source=source,
        output=output,
        ref=ref,
        relpath=relpath,
        register_cmd=" \\\n".join(register_lines),
    ).lstrip()

    return rendered, flows
# flow_name="Random State Generator" # )(task_run_name=num_of_flow_runs) # with Flow("Flow Run Generator") as flow3: # num_of_flows = Parameter("num_of_flows", default=5) # my_flow_runs = generate_list(num_of_flows) # create_flow_runs.map(my_flow_runs) # flow3.storage = Local(add_default_labels=False) # flow3.register(project_name="Demos") # ATTEMPT 3 @task def generate_list(length): return ["Random State Generator" for name in range(length)] create_flow_runs = StartFlowRun(project_name="State Generators", flow_name="Random State Generator") with Flow("Flow Run Generator") as flow3: num_of_flows = Parameter("num_of_flows", default=5) my_flow_runs = generate_list(num_of_flows) create_flow_runs.map(my_flow_runs) flow3.storage = GitHub(repo="kmoonwright/utility_flows", path="state_generators/3_flow_run_generator.py", access_token_secret="GITHUB_ACCESS_TOKEN") flow3.register(project_name="State Generators")
"Meta Data": meta_data, "Time Series (15min)": data }) @task def persist_data_in_influx(injector: Injector, av_response: InterdayResponseModel, secrets: Dict[str, str]): influx_v2_client = injector.get(InfluxDBClient) influx_v2_client.write_api(SYNCHRONOUS).write( secrets['INFLUX_V2_BUCKET'], record=interday_response_model_to_points(av_response)) schedule = IntervalSchedule(interval=timedelta(hours=24)) with Flow("scrap-stock", schedule) as flow: injector = create_secret_injector_task() token_renewal_result = renew_token_task(injector) secrets = fetch_secret_task('common', 'kv', injector) stocks = Parameter("stocks", default=["GOOGL", "MSFT"]) av_response = scrap_stock.map(stocks, secrets=unmapped(secrets)) persist_data_in_influx.map(injector=unmapped(injector), av_response=av_response, secrets=unmapped(secrets)) flow.storage = GitHub(repo="piokra/prefect-tutorial", path="scrap_stock.py") flow.run()
states: [{ flow_run_id: $flowRunId, state: $state }] } ) { states { id status message } } } """, variables={ "flowRunId": prev_flow_run_id, "state": { "type": "Skipped" } }) with Flow("Previous Flow Run State Changer") as flow2: t1 = log_prev_flow_run_id() t2 = log_prev_num() secret = PrefectSecret("PERSONAL_ACCESS_TOKEN") t3 = change_prev_flow_state(t2, secret) create_link(prefect.context.get("prev_flow_run_id")) flow2.add_edge(t1, t2) flow2.storage = GitHub(repo="kmoonwright/utility_flows", path="state_generators/2_prev_run_state_changer.py", access_token_secret="GITHUB_ACCESS_TOKEN") flow2.register(project_name="State Generators")