def create_scraper_monitor_flow():
    # Run every day at 15 UTC (or 10 ET)
    schedule = CronSchedule("0 15 * * *")
    with Flow("MonitorFailingScrapers", schedule) as flow:
        slack_webhook_url = EnvVarSecret("SLACK_WEBHOOK_URL")
        run_monitor_scrapers(slack_webhook_url)
    return flow
def create_flow_for_table(table_name):
    sched = CronSchedule("50 */2 * * *")
    tn = f"data.{table_name}"
    sn = f"{tn}_id_seq"
    with Flow(f"clean-sql-{table_name}", sched) as flow:
        connstr = EnvVarSecret("COVID_DB_CONN_URI")
        ready = truncate_table(connstr, tn)
        reset_sequence(connstr, sn, ready)
    return flow
def main(project_name):
    schedule = CronSchedule('0 * * * *')
    config = Config()
    with Flow('Purple Air hourly download flow', schedule) as flow:
        run_time = get_run_time
        fetch_result = fetch_results(config)
        transform_result = transform_results(config, fetch_result)
        write_results(config, transform_result, run_time)
    print(flow.register(project_name=project_name))
    flow.run_agent()
def create_cdc_all_states_flow():
    """Creates a flow that runs the CDC data update on all states."""
    sched = CronSchedule("17 */4 * * *")
    flow = Flow("CDCAllStatesDataUpdate", sched)
    for state in ALL_STATES_PLUS_DC:
        task = StartFlowRun(
            flow_name=CDCCovidDataTracker.__name__,
            project_name="can-scrape",
            wait=True,
            parameters={"state": state.abbr},
        )
        flow.add_task(task)
    return flow
def create_main_flow(flows: List[Flow], project_name):
    schedule = CronSchedule("0 */3 * * *")
    with Flow("MainFlow", schedule) as main_flow:
        tasks = []
        for flow in flows:
            task = StartFlowRun(flow_name=flow.name, project_name=project_name, wait=True)
            tasks.append(task)
        parquet_flow = StartFlowRun(
            flow_name="UpdateParquetFiles", project_name=project_name, wait=True
        )
        for task in tasks:
            task.set_downstream(parquet_flow)
    return main_flow
def test_run_workflow_ignores_schedule(test_logger):
    """
    Test that run_workflow ignores the workflow's schedule.
    """
    function_mock = create_autospec(lambda dummy_param: None)

    # Flow with no more scheduled runs
    with prefect.Flow(
        "Dummy_workflow",
        schedule=CronSchedule("0 0 * * *", end_date=pendulum.now().subtract(days=2)),
    ) as dummy_workflow:
        dummy_param = prefect.Parameter("dummy_param")
        FunctionTask(function_mock)(dummy_param=dummy_param)

    with prefect.context(logger=test_logger):
        run_workflow.run(
            parametrised_workflow=(dummy_workflow, dict(dummy_param="DUMMY_VALUE"))
        )
    function_mock.assert_called_once_with(dummy_param="DUMMY_VALUE")
def create_flow_for_scraper(ix: int, cls: Type[DatasetBase]):
    sched = CronSchedule(f"{ix % 60} */4 * * *")
    with Flow(cls.__name__, sched) as flow:
        connstr = EnvVarSecret("COVID_DB_CONN_URI")
        sentry_dsn = EnvVarSecret("SENTRY_DSN")
        sentry_sdk_task = initialize_sentry(sentry_dsn)

        d = create_scraper(cls)
        fetched = fetch(d)
        normalized = normalize(d)
        validated = validate(d)
        done = put(d, connstr)

        d.set_upstream(sentry_sdk_task)
        normalized.set_upstream(fetched)
        validated.set_upstream(normalized)
        done.set_upstream(validated)
    return flow
def _deserialize(self, value, attr, data, **kwargs) -> "prefect.schedules.schedules.Schedule":
    """
    Deserialise a cron string as a cron schedule.

    Returns
    -------
    Schedule
        Prefect CronSchedule to run a flow according to the schedule defined
        by the input string.

    Raises
    ------
    ValidationError
        if the input value is not a valid cron string or None
    """
    cron_string = super()._deserialize(value, attr, data, **kwargs)
    try:
        schedule = CronSchedule(cron_string)
    except ValueError:
        raise ValidationError(f"Invalid cron string: '{cron_string}'.")
    return schedule
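# The _deserialize above is a marshmallow field hook. A minimal sketch of how such a
# cron-string field might be wired into a schema -- ScheduleField, WorkflowSchema and
# the load example are assumptions for illustration, not taken from the source.
from marshmallow import Schema, ValidationError, fields
from prefect.schedules import CronSchedule


class ScheduleField(fields.String):
    """Hypothetical field: parse a cron string into a prefect CronSchedule."""

    def _deserialize(self, value, attr, data, **kwargs):
        cron_string = super()._deserialize(value, attr, data, **kwargs)
        try:
            return CronSchedule(cron_string)
        except ValueError:
            raise ValidationError(f"Invalid cron string: '{cron_string}'.")


class WorkflowSchema(Schema):
    """Hypothetical schema using the field above."""
    schedule = ScheduleField(allow_none=True)


# "0 0 * * *" loads into a CronSchedule; an invalid string raises ValidationError.
parsed = WorkflowSchema().load({"schedule": "0 0 * * *"})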
@task
def frontfill():
    now = datetime.datetime.now()
    pm.execute_notebook(
        "etl/hn_etl_front_fill.ipynb",
        "s3://python-portfolio-notebooks/hn_updates/frontfill"
        + str(now.year) + "-" + str(now.month) + "-" + str(now.day) + ".ipynb",
    )


@task
def test_changes():
    now = datetime.datetime.now()
    pm.execute_notebook(
        "etl/hn_data_test.ipynb",
        "s3://python-portfolio-notebooks/hn_updates/test"
        + str(now.year) + "-" + str(now.month) + "-" + str(now.day) + ".ipynb",
    )


with Flow("ETL", schedule=CronSchedule("0 9 * * *")) as flow:
    frontfill = frontfill()
    backfill = backfill()
    test_changes = test_changes()

if __name__ == "__main__":
    flow.run()
    retry_delay=timedelta(minutes=1),
    nout=2,
    trigger=triggers.all_finished,
)
def create_parquet(_success):
    ts = prefect.context.scheduled_start_time
    dt_str = pd.to_datetime(ts).strftime("%Y-%m-%dT%H")
    vintage_fn = FN_STR.format(dt_str) + ".parquet"
    fn = FN_STR.format("") + ".parquet"

    df = pd.read_csv(CSV_FN, parse_dates=["dt"])
    df.to_parquet(DATA_PATH / vintage_fn, index=False)
    df.to_parquet(DATA_PATH / fn, index=False)
    return vintage_fn, fn


@task
def get_gcs_cmd(fn):
    return f"gsutil acl ch -u AllUsers:R gs://can-scrape-outputs/final/{fn}"


shell = ShellTask()

with Flow("UpdateParquetFiles", CronSchedule("10 */2 * * *")) as f:
    connstr = EnvVarSecret("COVID_DB_CONN_URI")
    success = export_to_csv(connstr)
    vintage_fn, fn = create_parquet(success)
    shell(get_gcs_cmd(vintage_fn))
    shell(get_gcs_cmd(fn))

f.register(project_name="can-scrape")
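# The nout=2 in the (truncated) @task(...) decorator above is what lets the flow unpack
# "vintage_fn, fn = create_parquet(success)" into two separate task results. A minimal,
# self-contained sketch of that pattern -- the names here are illustrative only.
from prefect import task, Flow


@task(nout=2)
def split(value):
    # A task with nout=2 returns a tuple that can be unpacked inside the flow context.
    return value, value * 2


with Flow("nout-demo") as demo_flow:
    small, big = split(21)  # each name becomes its own upstream result for later tasks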
from datetime import datetime, timedelta
import pendulum
import prefect
from prefect import task, Flow
from prefect.schedules import IntervalSchedule, CronSchedule
import pandas as pd
from io import BytesIO
import zipfile
import requests

schedule = CronSchedule(
    cron="*/10 * * * *",
    start_date=pendulum.datetime(2020, 11, 25, 18, 40, tz="America/Sao_Paulo"),
)


@task
def get_raw_data():
    url = "http://download.inep.gov.br/microdados/Enade_Microdados/microdados_enade_2019.zip"
    filebytes = BytesIO(requests.get(url).content)
    myzip = zipfile.ZipFile(filebytes)
    myzip.extractall()
    path = './microdados_enade_2019/2019/3.DADOS/'
    return path


@task
    info_keys=["stargazers_count", "subscribers_count"],
    max_retries=1,
    retry_delay=datetime.timedelta(minutes=1),
)


@task
def process_stats(stats):
    data = {
        "Stars": stats["stargazers_count"],
        "Watchers": stats["subscribers_count"],
        "Date": pendulum.now("utc").isoformat(),
    }
    return data


airtable = WriteAirtableRow(
    base_key="XXXXXXX",
    table_name="Stars",
    max_retries=1,
    retry_delay=datetime.timedelta(minutes=1),
)

# Note: despite the name, "*/1 * * * *" fires every minute, not once a day.
daily_schedule = CronSchedule("*/1 * * * *")

with Flow("Collect Repo Stats", schedule=daily_schedule) as flow:
    data = process_stats(repo_stats)
    final = airtable(data)

flow.run()
    fleet_segments,
    infractions,
    init_species_groups,
    last_positions,
    ports,
    species,
    vessels,
)

################################ Define flow schedules ################################

control_anteriority.flow.schedule = IntervalSchedule(interval=timedelta(hours=1))
current_segments.flow.schedule = IntervalSchedule(interval=timedelta(minutes=10))
ers.flow.schedule = IntervalSchedule(interval=timedelta(minutes=1))
fishing_gear_codes.flow.schedule = CronSchedule("0 3 * * *")
last_positions.flow.schedule = IntervalSchedule(interval=timedelta(minutes=1))
species.flow.schedule = CronSchedule("0 4 * * *")
vessels.flow.schedule = CronSchedule("0 2 * * *")

###################### List flows to register with prefect server #####################

flows_to_register = [
    controllers.flow,
    controls.flow,
    control_anteriority.flow,
    current_segments.flow,
    ers.flow,
    fishing_gear_codes.flow,
    fleet_segments.flow,
    infractions.flow,
    init_species_groups.flow,
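# The excerpt stops before flows_to_register is used. A hedged sketch of the usual
# next step -- looping over the list and registering each flow with the server; the
# project name "fisheries" is an assumption, not taken from the source.
for flow in flows_to_register:
    flow.register(project_name="fisheries")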
from datetime import datetime, timedelta
import pendulum
import prefect
from prefect import task, Flow
from prefect.schedules import CronSchedule
import pandas as pd
from io import BytesIO
import zipfile
import requests
import sqlalchemy
import pyodbc

schedule = CronSchedule(
    cron="*/10 * * * *",
    start_date=pendulum.datetime(2020, 12, 1, 13, 45, tz='America/Sao_Paulo'),
)


@task
def get_raw_data():
    url = "http://download.inep.gov.br/microdados/Enade_Microdados/microdados_enade_2019.zip"
    filebytes = BytesIO(requests.get(url).content)
    # Extract the zip contents
    myzip = zipfile.ZipFile(filebytes)
    myzip.extractall()
    path = './microdados_enade_2019/2019/3.DADOS/'
    return path
from datetime import datetime, timedelta
import pendulum
import prefect
from prefect import task, Flow
from prefect.schedules import CronSchedule
import pandas as pd
from io import BytesIO
import zipfile
import requests

schedule = CronSchedule(
    cron="*/30 * * * *",
    start_date=pendulum.datetime(2021, 3, 12, 17, 00, tz='America/Sao_Paulo'),
)


@task
def get_raw_data():
    url = 'http://download.inep.gov.br/microdados/microdados_enem_2019.zip'
    filebytes = BytesIO(requests.get(url).content)
    logger = prefect.context.get('logger')
    logger.info('Data obtained')
    # Extract the contents of the zipfile
    myzip = zipfile.ZipFile(filebytes)
    myzip.extractall()
    path = './DADOS/'
from prefect import task, Flow
from prefect.schedules import CronSchedule


@task
def extract():
    return [1, 2, 3, 50]


@task
def transform(x):
    return [i * 10 for i in x]


@task
def load(y):
    print("Received y: {}".format(y))


with Flow("ETL") as flow:
    e = extract()
    t = transform(e)
    l = load(t)

schedule = CronSchedule("0 0 * * *")  # setup a cron scheduler

flow_state = flow.run()  # set the flow run to an object to track state
flow.visualize(flow_state=flow_state)  # visualize how the data moves throughout the DAG

#%%
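# In the snippet above the CronSchedule is built but never attached, so flow.run()
# executes once, immediately. A minimal sketch of attaching it so the flow actually
# waits for each midnight tick (assuming the same extract/transform/load tasks):
with Flow("ETL", schedule=CronSchedule("0 0 * * *")) as scheduled_flow:
    e = extract()
    t = transform(e)
    l = load(t)

# With a schedule set, run() blocks and launches a new run at every cron tick
# instead of returning after a single execution.
scheduled_flow.run()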
from datetime import datetime, timedelta
import pendulum
import prefect
from prefect import task, Flow
from prefect.schedules import CronSchedule
import pandas as pd
from io import BytesIO
import zipfile
import requests
import mysql
import pymysql
import sqlalchemy
from sqlalchemy import create_engine

schedule = CronSchedule(
    cron="*/10 * * * *",  # minute, hour, day of month, month, day of week
    start_date=pendulum.datetime(2020, 11, 26, 14, 25, tz='America/Sao_Paulo'))


@task
def get_raw_data():
    # Assign the link to a 'url' object
    url = 'http://download.inep.gov.br/microdados/Enade_Microdados/microdados_enade_2019.zip'
    # Download the content
    filebytes = BytesIO(requests.get(url).content)
    # Extract the contents of the 'zipfile'
    myzip = zipfile.ZipFile(filebytes)
    myzip.extractall()
    path = './microdados_enade_2019/2019/3.DADOS/'
repo="PrefectHQ/prefect", info_keys=["stargazers_count", "subscribers_count"], max_retries=1, retry_delay=datetime.timedelta(minutes=1), ) @task def process_stats(stats): data = { "Stars": stats["stargazers_count"], "Watchers": stats["subscribers_count"], "Date": pendulum.now("utc").isoformat(), } return data airtable = WriteAirtableRow( base_key="XXXXXXX", table_name="Stars", max_retries=1, retry_delay=datetime.timedelta(minutes=1), ) daily_schedule = CronSchedule("0 8 * * *") with Flow("Collect Repo Stats", schedule=daily_schedule) as flow: data = process_stats(repo_stats) final = airtable(data) flow.run()
    'labels': ['ecs-agent', 'ooi', 'prod'],
    'run_task_kwargs': {
        'cluster': 'prefectECSCluster',
        'launchType': 'FARGATE',
    },
}

project_name = "ooi-harvest"
data_org = "ooi-data"

config_json = yaml.safe_load(CONFIG_PATH.open())
flow_run_name = "-".join([
    config_json['instrument'],
    config_json['stream']['method'],
    config_json['stream']['name'],
])
schedule = CronSchedule(config_json['workflow_config']['schedule'])
run_config = ECSRun(**RUN_OPTIONS)

parent_run_opts = dict(**copy.deepcopy(RUN_OPTIONS))
parent_run_opts.update({'cpu': '0.5 vcpu', 'memory': '2 GB'})
parent_run_config = ECSRun(**parent_run_opts)

with Flow(flow_run_name, schedule=schedule, run_config=parent_run_config) as parent_flow:
    flow_run = create_flow_run(
        flow_name="stream_harvest",
        run_name=flow_run_name,
        project_name=project_name,
        parameters={
            'config': config_json,
            'error_test': False,
from prefect import Client
from prefect.schedules import CronSchedule

from reddit_daily import flow

c = Client()

s = CronSchedule("0 * * * *")
flow.schedule = s

flow.deploy(project="Dylan's Project")
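# Flow.deploy appears to come from an older Prefect Core release; on later 0.x/1.x
# versions the equivalent call is flow.register. A hedged sketch of the same script
# against that newer API (project name carried over from the snippet above):
from prefect.schedules import CronSchedule

from reddit_daily import flow

flow.schedule = CronSchedule("0 * * * *")  # hourly, at minute 0
flow.register(project_name="Dylan's Project")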
from datetime import datetime, timedelta
import pendulum
import prefect
from prefect import task, Flow
from prefect.schedules import CronSchedule
import pandas as pd
from io import BytesIO
import zipfile
import requests

schedule = CronSchedule(
    cron='*/10 * * * *',
    start_date=pendulum.datetime(2020, 12, 5, 14, tz="America/Sao_Paulo"),
)


@task
def get_raw_date():
    url = 'http://download.inep.gov.br/microdados/Enade_Microdados/microdados_enade_2019.zip'
    filebytes = BytesIO(requests.get(url).content)
    zipped = zipfile.ZipFile(filebytes)
    zipped.extractall()
    return './microdados_enade_2019/2019/3.DADOS/'


@task
def apply_filters(path):
    interested_cols = [
import sys

import prefect
from prefect import task, Flow, Parameter
from prefect.schedules import CronSchedule

sys.path.append('../pyoilfundy')
from pyoilfundy import fundyproducts as p

daily_7_sched = CronSchedule('0 7 * * 1-5')


def register_products_dash_flow():
    with Flow('product_by_region', schedule=daily_7_sched) as f:
        commods = ['lpg', 'naphtha', 'gasoline', 'diesel', 'jet', 'fueloil']
        p.make_specified_product_dash.map(commods)
    f.register(project_name='pyoilfundy')