def main():
    schedule = IntervalSchedule(interval=timedelta(minutes=1))

    with prefect.Flow("whats-up", schedule) as flow:
        area = fetch_current_area()
        aircraft = fetch_above_aircraft(area=area)
        update_display(ac_vectors=aircraft)

    # run locally
    # flow.run()

    # deploy to cloud
    flow_id = flow.deploy("test")
def main():
    # ~----------------- SETTINGS -----------------~
    with open(r'../settings.yml') as file:
        settings = yaml.full_load(file)

    PROJECT_DIR = '../'
    RAW_DATA_DIR = settings['project']['raw_data_dir']
    FINAL_DATA_DIR = settings['project']['final_data_dir']
    FILENAME = f"{settings['project']['export_filename']}_{datetime.now().strftime('%Y%m%d_%H%M')}.csv"
    FILEPATH_RAW = os.path.join(PROJECT_DIR, RAW_DATA_DIR, FILENAME)
    FILEPATH_ANALYSIS = os.path.join(PROJECT_DIR, FINAL_DATA_DIR, FILENAME)
    KEYWORDS = settings['query']['keywords']
    USER_AGENT = settings['query']['user_agent']
    GOOGLE_URL = settings['query']['google_url']

    # ~----------------- FLOW -----------------~
    # ~-- daily schedule
    schedule = IntervalSchedule(
        start_date=datetime.strptime("20210424-030500UTC", "%Y%m%d-%H%M%S%Z"),
        interval=timedelta(days=1),
    )

    with Flow("etl", schedule=schedule) as flow:
        # parameter
        filepath_raw = Parameter(name="filepath_raw")
        filepath_analysis = Parameter(name="filepath_analysis")
        keywords = Parameter(name="keywords")
        user_agent = Parameter(name="user_agent")
        google_url = Parameter(name="google_url")

        # task flow
        # -- raw data
        search_urls = create_search_url(keywords, google_url)
        results_count = extract_result_count(search_urls, user_agent, google_url)
        df = df_build_results_count(keywords, results_count, search_urls)
        assert_df(df, keywords, google_url)
        export_raw_data(df, filepath_raw)

        # -- analysis data and plot
        df = transform_raw_data()
        export_analysis_data(df, filepath_analysis)
        deploy_plots(df)

    # ~----------------- RUN -----------------~
    flow.run(
        filepath_raw=FILEPATH_RAW,
        filepath_analysis=FILEPATH_ANALYSIS,
        keywords=KEYWORDS,
        user_agent=USER_AGENT,
        google_url=GOOGLE_URL,
    )
def build_update_playlist_flow(name):
    """
    A flow to update the weekly top 25 playlist.

    This flow should run periodically, at minimum once a day.
    """
    fb = FireManager()
    sp = Spotify()

    now = datetime.now(pytz.UTC)
    schedule = IntervalSchedule(
        start_date=datetime(now.year, now.month, now.day, 7, 0, 0, 0, pytz.UTC),
        interval=timedelta(hours=24),
    )

    # Attach the daily schedule; otherwise the schedule defined above is dead code.
    with Flow(name, schedule=schedule) as flow:
        df = get_prev_week_tracks_task(fb)
        df = trim_to_week_task(df)
        update_prev_week_task(fb, df, stage=False)
        top = get_top_tracks_task(df, k=25)
        set_week_playlist_task(sp, top)

    return flow
def main():
    schedule = IntervalSchedule(
        start_date=datetime.utcnow() + timedelta(seconds=1),
        interval=timedelta(minutes=1),
    )

    with Flow("etl", schedule=schedule) as flow:
        airport = Parameter("airport", default="IAD")
        radius = Parameter("radius", default=200)

        reference_data = extract_reference_data()
        live_data = extract_live_data(airport, radius, reference_data)
        transformed_live_data = transform(live_data, reference_data)
        load_reference_data(reference_data)
        load_live_data(transformed_live_data)

    flow.run()
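# Sketch (an assumption, not part of the original snippet): Prefect 1.x lets a
# run override Parameter defaults via a "parameters" dict, and
# run_on_schedule=False skips the interval schedule for that run. This assumes
# the "etl" flow object above is in scope; the values are illustrative only.
flow.run(run_on_schedule=False, parameters={"airport": "SFO", "radius": 100})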
def main():
    schedule = IntervalSchedule(
        start_date=datetime(2020, 11, 23),
        # start_date=datetime.utcnow() + timedelta(seconds=1),
        interval=timedelta(hours=1))

    with Flow("live-purpleair", schedule=schedule) as flow:
        environment = Parameter("environment", default="prod")
        offline = Parameter("offline", default=False)
        purpleair_client = create_purpleair_client(offline)
        all_sensors_raw = extract_live_purpleair(purpleair_client)
        all_sensors_processed = transform_all_sensors_raw(all_sensors_raw)
        blob_client = create_hour_blob_client(environment)
        load_all_sensors_raw_json(all_sensors_raw, blob_client)
        load_all_sensors_processed(all_sensors_processed, blob_client)

    # Register the flow with the server; agents can then pick it up and run it in the background.
    flow.register(project_name="caqi-flows")
def configure(self):
    """
    Load the configuration file and populate the required arrays.
    """
    with open(self.args.config, "r") as ymlfile:
        self.config = yaml.safe_load(ymlfile)

    # Configure the Prefect task scheduler
    self.schedule = IntervalSchedule(
        start_date=datetime.utcnow() + timedelta(seconds=1),
        interval=timedelta(seconds=self.config["program"]["time_interval"]),
    )

    self.centers, self.stds, self.jumps = utils.load_generation_params(self.config)
    self.generator = tasks.Generator(self.centers, self.stds, self.jumps, name="generator")
    self.writer = tasks.MonitorCSVWriter(self.config["program"]["monitor_file"], name="writer")
# The loop below uses `data` and `table`, so it belongs inside a task; the
# @task header is reconstructed from the load_data(...) calls in the flow.
@task
def load_data(data, table):
    connection_string = 'postgresql://*****:*****@localhost:5432/kvb'
    engine = create_engine(connection_string)

    print(f'load data into {table} table...')
    for row in data:
        query = f'''
            INSERT INTO {table} (point_in_time, name, lng, lat, numbers)
            VALUES ('{row['point_in_time']}', '{row['name']}', {row['lng']}, {row['lat']}, '{row['numbers']}')
        '''
        engine.execute(query)


schedule = IntervalSchedule(
    start_date=datetime.now(pytz.timezone('Europe/Berlin')),
    interval=timedelta(minutes=1),
)


def main():
    with Flow('kvb-bikes', schedule=schedule) as flow:
        live_kvb_data = get_live_data()
        live_data_from_stations = extract_stations(live_kvb_data)
        live_data_from_bikes = extract_bikes(live_kvb_data)
        load_data(live_data_from_stations, 'prefect_stations')
        load_data(live_data_from_bikes, 'prefect_bikes')

    flow.run()
"Meta Data": meta_data, "Time Series (15min)": data }) @task def persist_data_in_influx(injector: Injector, av_response: InterdayResponseModel, secrets: Dict[str, str]): influx_v2_client = injector.get(InfluxDBClient) influx_v2_client.write_api(SYNCHRONOUS).write( secrets['INFLUX_V2_BUCKET'], record=interday_response_model_to_points(av_response)) schedule = IntervalSchedule(interval=timedelta(hours=24)) with Flow("scrap-stock", schedule) as flow: injector = create_secret_injector_task() token_renewal_result = renew_token_task(injector) secrets = fetch_secret_task('common', 'kv', injector) stocks = Parameter("stocks", default=["GOOGL", "MSFT"]) av_response = scrap_stock.map(stocks, secrets=unmapped(secrets)) persist_data_in_influx.map(injector=unmapped(injector), av_response=av_response, secrets=unmapped(secrets)) flow.storage = GitHub(repo="piokra/prefect-tutorial", path="scrap_stock.py") flow.run()
    ## note that the relevant file will actually be called "known_location.pdf"
    flow.visualize(flow_state=new_state, filename="known_location")
    return new_state


@task
def generate_random_list():
    n = random.randint(15, 25)
    return list(range(n))


@task(max_retries=3, retry_delay=timedelta(seconds=0))
def randomly_fail():
    x = random.random()
    if x > 0.7:
        raise ValueError("x is too large")


schedule = IntervalSchedule(
    start_date=datetime.utcnow(),
    interval=timedelta(minutes=1),
    end_date=datetime.utcnow() + timedelta(minutes=10),
)

with Flow("random-mapping-with-viz", schedule=schedule, state_handlers=[visualize]) as f:
    randomly_fail.map(upstream_tasks=[generate_random_list])

f.run()
import os

import pandas as pd
import sqlalchemy
import sqlite3
import prefect
from prefect import task, Flow
from prefect.schedules import IntervalSchedule
from datetime import timedelta
from dotenv import load_dotenv

load_dotenv(os.path.join('.', '.env'))

DELAY = timedelta(minutes=1)
RUNNER_KEY = os.getenv('AGENT_TOKEN')

scheduler = IntervalSchedule(interval=DELAY)


@task
def get_data() -> pd.DataFrame:
    df = pd.read_csv(
        'https://raw.githubusercontent.com/A3Data/hermione/master/hermione/file_text/train.csv'
    )
    return df


@task
def calculate_mean_age(df: pd.DataFrame) -> float:
    return df.Age.mean()
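# The snippet above stops after the task definitions. A minimal sketch of one
# way to wire them into the scheduler it already defines; the flow name and
# the __main__ guard are assumptions, not part of the original.
with Flow("mean-age", schedule=scheduler) as flow:
    df = get_data()
    mean_age = calculate_mean_age(df)

if __name__ == "__main__":
    flow.run()  # repeats every minute, per DELAY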
return result["hits"]["hits"] @task def send_email(intervention_body): to_email, from_email = "*****@*****.**", "*****@*****.**" template = env.get_template('email.html') intervention_url = f"http://localhost:4200/detail-intervention/154515" body_content = template.render(intervention_url=intervention_url, what_happens_label="WhatsHappen") message = MIMEMultipart() message.attach(MIMEText(body_content, "html")) message['Subject'] = intervention_body["_source"]["primaryAlert"][ "alertId"] message['From'] = from_email message['To'] = to_email msg_body = message.as_string() s = smtplib.SMTP('localhost:1025') s.sendmail(from_email, to_email, msg_body) s.quit() with Flow("Send emails on new Interventions", IntervalSchedule(interval=INTERVAL)) as flow: interventions = fetch_new_interventions() send_email.map(interventions) flow.run()
    data = pd.concat(data, ignore_index=True, sort=False)
    if len(data) == 0:
        raise ENDRUN(state=Skipped())
    return data


@task(log_stdout=True, skip_on_upstream_skip=True)
def sample(tweets):
    print('responses shape', tweets.shape)
    print(tweets.columns)
    print(tweets.sample(5))


schedule = IntervalSchedule(
    # start_date=datetime(2020, 1, 20),
    # interval=timedelta(hours=1),
    start_date=datetime.now() + timedelta(seconds=1),
    interval=timedelta(hours=1),
)

# with Flow("Rehydration Pipeline", schedule=schedule) as flow:
with Flow("Rehydration Pipeline") as flow:
    creds = load_creds()
    path_list = load_path()
    tweets = load_tweets(creds, path_list)
    tweets = clean_timeline_tweets(tweets)
    tweets = clean_datetimes(tweets)
    tweets = clean_retweeted(tweets)
    tweets = tag_status_type(tweets)
    tweets = flatten_retweets(tweets)
    tweets = flatten_quotes(tweets)
    tweets = flatten_users(tweets)
    LIMIT 100000
    """
    df = ph.read_clickhouse(query, connection=connection)
    return df


@prefect.task(max_retries=5, retry_delay=timedelta(seconds=2))
def aggregate(df):
    agg = df.groupby(['SalesDate', 'UserID'], as_index=False).sum()
    return agg


# schedule: every 12 hours in production; shortened here to 5-minute runs
# that stop after 10 minutes, for testing
schedule = IntervalSchedule(
    start_date=datetime.utcnow() + timedelta(seconds=1),
    # interval=timedelta(hours=12))
    interval=timedelta(minutes=5),
    end_date=datetime.utcnow() + timedelta(minutes=10))

with prefect.Flow(
    name="SQL",
    schedule=schedule,
    # state_handlers=[handler],
) as flow:
    dataframes = download(connection)
    fin = aggregate(dataframes)

client = Client()
client.create_project(project_name='SQL')
flow.register(project_name='SQL')
def notify_on_retry(task: Task, new_state: State):
    logger: Logger = prefect.context.get('logger')
    logger.warning(
        f'Task {task.name}/{task.slug} is retrying at {new_state.start_time}',
    )


log_on_retry = callback_factory(notify_on_retry, lambda s: s.is_retrying())

name = Parameter('name', default='potato')
threshold = Parameter('threshold', default=8)

schedule = IntervalSchedule(
    start_date=datetime.fromisoformat('2021-01-01'),
    interval=timedelta(minutes=2),
)

with Flow(
    name='My first flow!',
    schedule=schedule,
    validate=True,
    executor=LocalDaskExecutor(),
) as flow:
    hello_task = say_hello(person=name)

    random = RandomTask(
        name='Random',
        max_retries=3,
        retry_delay=timedelta(seconds=1),
    )
    random_1 = random(threshold=threshold)
    controllers,
    controls,
    current_segments,
    ers,
    fishing_gear_codes,
    fleet_segments,
    infractions,
    init_species_groups,
    last_positions,
    ports,
    species,
    vessels,
)

################################ Define flow schedules ################################

control_anteriority.flow.schedule = IntervalSchedule(interval=timedelta(hours=1))
current_segments.flow.schedule = IntervalSchedule(interval=timedelta(minutes=10))
ers.flow.schedule = IntervalSchedule(interval=timedelta(minutes=1))
fishing_gear_codes.flow.schedule = CronSchedule("0 3 * * *")
last_positions.flow.schedule = IntervalSchedule(interval=timedelta(minutes=1))
species.flow.schedule = CronSchedule("0 4 * * *")
vessels.flow.schedule = CronSchedule("0 2 * * *")

###################### List flows to register with prefect server #####################

flows_to_register = [
    controllers.flow,
    controls.flow,
    control_anteriority.flow,
    current_segments.flow,
    ers.flow,
from prefect import task, Flow
from datetime import timedelta
from prefect.schedules import IntervalSchedule
import pendulum


@task
def say_hello():
    print("Hello, world!")


schedule = IntervalSchedule(interval=timedelta(days=1),
                            start_date=pendulum.datetime(2010, 1, 1))

with Flow("interval-schedule", schedule) as flow:
    say_hello()

flow.run(run_on_schedule=True)
# flow.register(project_name="Demo", version_group_id="custom_int")

pd = pendulum.datetime(2010, 1, 1)
pd.add(days=1)
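# Aside (assumed, not in the original snippet): Prefect 1.x schedules expose
# next(n), which is an easy way to sanity-check what an IntervalSchedule will
# emit -- useful here, since a 2010 start date means past occurrences are
# skipped and only upcoming run times are returned.
for run_time in schedule.next(3):
    print(run_time)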
    URL_CARD,
)
from autoscrape_data import (
    card_scraping,
    name_scraping,
)
from autoprocess_data import process_card_data

# %%
# -------------------------------------------------------
# Pipeline scheduler
# -------------------------------------------------------
schedule = IntervalSchedule(interval=dt.timedelta(days=30))

# %%
# -------------------------------------------------------
# Build pipeline
# -------------------------------------------------------
with Flow(name='malaysia_bank_card_scraping_flow',
          result=LocalResult(dir="result_config")) as flow:

    # Step 1: Compile a list of bank names for credit cards.
    ls_banks_for_card = name_scraping.compile_bank_names_for_card(
        URL_CARD, '''/html/body/main/section/form/label/select''')

    # Step 2: Compile a list of credit cards for each bank.
    dict_cards = name_scraping.compile_credit_cards(
base="master", head="dev", title="Bi-weekly Release", max_retries=1, retry_delay=datetime.timedelta(minutes=1), ) @task(trigger=any_failed) def prepare_exception(exc): return repr(exc) issue_task = OpenGitHubIssue( name="Open Release Issue", repo="PrefectHQ/cloud", title="Release Cycle is Broken", labels=["release", "bug"], ) biweekly_schedule = IntervalSchedule(start_date=pendulum.parse("2019-03-18"), interval=datetime.timedelta(days=14)) with Flow("Biweekly Cloud Release", schedule=biweekly_schedule) as flow: exc = prepare_exception( pr_task) # will only run if pr_task fails in some way issue = issue_task(body=exc) flow.set_reference_tasks([pr_task]) flow.run()
"board-name": board_name }) meaningful_table.append(table_row) return meaningful_table @task def saveThreadsToSqlite(board_tabel: List[ThreadData]) -> None: sqlite_cursor = sqlite_connection.cursor() for board_thread in board_tabel: sqlite_cursor.execute(board_thread.toSqliteInsert()) sqlite_connection.commit() sqlite_cursor.close() schedule = IntervalSchedule(interval=datetime.timedelta(minutes=30)) with Flow("get-chan-threads") as the_flow: for chan_name in CHAN_NAME: c = getChan(chan_name) r = parseTable(c) r = extract(r, chan_name) # download service down here # downloadRelevantThread(bt, chan_name) saveThreadsToSqlite(r) the_flow.run() sqlite_connection.close()
        save_to_neo=True,
        writers={})

    try:
        search = search_term
        job_name = "qanon"
        limit = 10000000
        for df in fh.search_time_range(
                tp=tp,
                Search=search,
                Since=datetime.strftime(start, "%Y-%m-%d %H:%M:%S"),
                Until=datetime.strftime(current, "%Y-%m-%d %H:%M:%S"),
                job_name=job_name,
                Limit=10000000,
                stride_sec=30):
            logger.info('got: %s', len(df) if df is not None else 'None')
            logger.info('proceed to next df')
    except Exception as e:
        logger.error("job exception", exc_info=True)
        raise e
    logger.info("job finished")


schedule = IntervalSchedule(interval=timedelta(seconds=30))
storage = S3(bucket=S3_BUCKET)

# with Flow("covid-19 stream-single") as flow:
# with Flow("covid-19 stream", storage=storage, schedule=schedule) as flow:
with Flow("qanon stream", schedule=schedule) as flow:
    run_stream()

flow.run()
from datetime import timedelta, datetime

from flask import Flask

import prefect
from prefect.schedules import IntervalSchedule
from prefect import task, Flow

from db.database import init_db
from env_vars import prefect_project_name
from ingest.nvd_cve_data_ingest import ensure_cve_modified_feed_is_updated

schedule = IntervalSchedule(
    start_date=datetime.utcnow() + timedelta(hours=12),
    interval=timedelta(hours=12),
)


@task
def update_task():
    app = Flask(__name__)
    init_db(app)
    with app.app_context():
        logger = prefect.context.get("logger")
        logger.info("Checking for updates and writing any new records.")
        ensure_cve_modified_feed_is_updated()


def register_update_flow():
    # Attach the 12-hour schedule so registered runs actually recur.
    flow = Flow("update_task", schedule=schedule, tasks=[update_task])
    flow.register(project_name=prefect_project_name)
@task(cache_for=datetime.timedelta(minutes=1, seconds=30))
def return_random_number():
    return random.random()


@task
def print_number(num):
    print("=" * 50)
    print("Value: {}".format(num))
    print("=" * 50)


schedule = IntervalSchedule(
    start_date=datetime.datetime.utcnow(),
    interval=datetime.timedelta(minutes=1)
)

with Flow("cached-task", schedule=schedule) as flow:
    result = print_number(return_random_number)

flow.run()

# ==================================================
# Value: 0.8246312081499598
# ==================================================
# ==================================================
# Value: 0.8246312081499598
# ==================================================
# ==================================================
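# The repeated value in the output above is the cache at work: with
# cache_for=timedelta(minutes=1, seconds=30), the scheduled run one minute
# after the first reuses the cached random number; only a run past the
# 90-second window would draw a fresh one.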
from prefect import task, Flow
from prefect.schedules import IntervalSchedule
import papermill as pm
import datetime
import os

schedule = IntervalSchedule(
    start_date=datetime.datetime.utcnow() + datetime.timedelta(seconds=5),
    interval=datetime.timedelta(days=1))


@task(max_retries=3, retry_delay=datetime.timedelta(seconds=10))
def fetch_data():
    pm.execute_notebook(
        "./01_fetch_data.ipynb",
        "./01_fetch_data_%s.ipynb" % datetime.datetime.now().strftime("%Y-%m-%d"),
        parameters={
            "BTC_TICKER": "BTC-USD",
            "NB_DAYS": 0,
            "DATE_FORMAT": "%Y-%m-%d",
            "DATA_DIR": "data",
            "OUTPUT_DIRNAME": "01_raw",
            "OUTPUT_FILENAME": "raw_data.csv",
            "EXECUTION_DATE": datetime.datetime.now().strftime("%Y-%m-%d")
    return date + timedelta(days=7)


## templated command; template vars will be read from both prefect.context as well as
## any passed kwargs to the task
command = """
{% for i in range(5) %}
    echo "{{ scheduled_start_time }}"
    echo "{{ scheduled_start_time_7 }}"
    echo "{{ my_param }}"
{% endfor %}
"""
templated_command = JinjaTemplate(
    template=command, max_retries=1, retry_delay=retry_delay
)

## create schedule for the Flow
schedule = IntervalSchedule(start_date=datetime(2015, 6, 1),
                            interval=timedelta(days=1))

## create Flow and specify dependencies using functional API
## we don't actually attach the schedule to this Flow so it only runs once
with Flow("tutorial") as flow:
    my_param = Parameter("my_param")
    t2(upstream_tasks=[t1])
    t3 = templated_command(
        scheduled_start_time_7=add_7,
        my_param=my_param,
        upstream_tasks=[t1],
    )

flow.run(parameters={"my_param": "Parameter I passed in"})
from smol import db_models
from smol.database import engine, SessionLocal
from smol.extract import find_page_count, get_tables, get_li_tags, get_all_postings
from smol.load import load_all_records
from smol.transform import get_companies
from smol.gsheets_utils import clean_sheet, upload_lines
from smol.sns_utils import send_success_message, send_failure_message

db_models.Base.metadata.create_all(bind=engine)
db: Session = SessionLocal()

schedule = IntervalSchedule(
    start_date=datetime.utcnow() + timedelta(seconds=1),
    interval=timedelta(hours=24)
)


@task(
    name="extract",
    max_retries=3,
    retry_delay=timedelta(seconds=10),
    on_failure=send_failure_message
)
def extract():
    pg_count = find_page_count()
    tables = get_tables(pg_count)
    li_tags = get_li_tags(tables)
    return get_all_postings(li_tags)
import prefect
from prefect import Flow, task
from prefect.schedules import IntervalSchedule
from datetime import timedelta, datetime
import time

schedule = IntervalSchedule(
    start_date=datetime.utcnow() + timedelta(seconds=1),
    interval=timedelta(minutes=5),
)


@task
def run():
    logger = prefect.context.get("logger")
    results = []
    for x in range(3):
        results.append(str(x + 1))
        logger.info("Hello Anaconda Enterprise! run {}".format(x + 1))
        time.sleep(3)
    return results


with Flow("Hello Anaconda Enterprise", schedule=schedule) as flow:
    results = run()

flow.register(project_name="Hello Anaconda Enterprise")
print("Root running...") time.sleep(5) print("Root complete.") return class Node1_1(Task): def run(self): print("Node 1_1 running...") time.sleep(5) print("Node 1_1 complete.") return class Node1_2(Task): def run(self): print("Node 1_2 running...") time.sleep(5) print("Node 1_2 complete.") return schedule = IntervalSchedule(interval=timedelta(minutes=5)) with Flow("Simple Dependencies", schedule=schedule) as flow: root = Root() node1_1 = Node1_1() node1_2 = Node1_2() node1_1(upstream_tasks=[root]) node1_2(upstream_tasks=[root, node1_1])
    return 1


@task(name="getTemp", slug="getTemp")
def getTemp(lat, long, apiK):
    apiK2 = ''.join(apiK)
    forecast = "https://api.darksky.net/forecast/{}/{},{}".format(apiK2, lat, long)
    data = requests.get(url=forecast)
    json_response = data.json()
    u = json_response["hourly"]["data"]
    temp = u[0]["temperature"]
    return temp


simpleSchedule = IntervalSchedule(interval=timedelta(minutes=120))

with Flow("TempAC", schedule=simpleSchedule) as flow:
    maxtemp = Parameter("maxtemp", default=90)
    local_temp = getTemp(40.7135, -73.9859, DarkSkiesKey)
    target_state = targetACState(local_temp, maxtemp)
    local_token = getKasaToken(kUser, kSecret)
    local_device_list = getKasaDeviceList(local_token)
    # Basically, we only have one device..
    this_device_id = local_device_list[0]["deviceId"]
    # Finally, modify target state.
    # Perhaps should check if different first.
    modifyKasaDeviceState(local_token, this_device_id, target_state)

# flow.run()
flow.register(project_name="Jenny")