def run_sigla_pipeline(
    master_spreadsheet_id: str,
    google_api_credentials_path: str,
    db_connection_url: str,
):
    """
    Run the SIGLA ETL pipeline.

    Parameters
    ----------
    master_spreadsheet_id: str
        The master spreadsheet id.
    google_api_credentials_path: str
        The path to the Google API credentials file needed to read Google Sheets.
    db_connection_url: str
        The DB's connection url str.
    """
    # Delete all documents from the db
    _clean_up(db_connection_url)
    # Create a connection to the google sheets reader
    google_sheets_institution_extracter = GoogleSheetsInstitutionExtracter(
        google_api_credentials_path)
    # Get the list of spreadsheet ids from the master spreadsheet
    spreadsheets_id = google_sheets_institution_extracter.get_spreadsheets_id(
        master_spreadsheet_id)
    log.info("Finished pipeline set up, start running pipeline")
    log.info("=" * 80)
    # Spawn a local dask cluster
    cluster = LocalCluster()
    # Log the dashboard link
    log.info(f"Dashboard available at: {cluster.dashboard_link}")
    # Set up the workflow
    with Flow("Extract and transform") as flow:
        # Extract sheets data.
        # Get back a list of lists of SheetData
        spreadsheets_data = _extract.map(
            spreadsheets_id,
            unmapped(google_api_credentials_path),
        )
        # Flatten the list of lists of SheetData
        flattened_spreadsheets_data = _flatten_list(spreadsheets_data)
        # Transform the list of SheetData into FormattedSheetData
        _transform.map(flattened_spreadsheets_data)

    # Run the extract and transform flow
    state = flow.run(executor=DaskExecutor(cluster.scheduler_address))
    formatted_sheets_data = state.result[flow.get_tasks(
        name="_transform")[0]].result
    log.info("=" * 80)
    # Partition into institution and non-institution sheets
    institutions = _filter_formatted_sheet_data(
        formatted_sheets_data,
        [
            gs_format.standard_institution,
            # gs_format.institution_by_rows,
            gs_format.multiple_sigla_answer_variable,
        ],
    )
    non_institutions = _filter_formatted_sheet_data(
        formatted_sheets_data,
        [
            gs_format.institution_and_composite_variable,
            gs_format.composite_variable,
        ],
    )
    # Run the load institutions flow
    with Flow("Load institutions") as load_institutions_flow:
        _load.map(institutions, unmapped(db_connection_url))
    load_institutions_flow.run(
        executor=DaskExecutor(cluster.scheduler_address))
    log.info("=" * 80)
    # Run the load non-institutions flow
    with Flow("Load non institutions") as load_non_institutions_flow:
        _load.map(non_institutions, unmapped(db_connection_url))
    load_non_institutions_flow.run(
        executor=DaskExecutor(cluster.scheduler_address))
def test_no_raise_on_normal_flow(self):
    flow = Flow("THIS IS A TEST")
    assert healthchecks.environment_dependency_check([flow]) is None
from prefect import Flow, task


@task
def extract():
    """Get a list of data"""
    return [1, 2, 3]


@task
def transform(data):
    """Multiply the input by 10"""
    return [i * 10 for i in data]


@task
def load(data):
    """Print the data to indicate it was received"""
    print("Here's your data: {}".format(data))


with Flow("sample_workflow") as flow:
    e = extract()
    t = transform(e)
    l = load(t)
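# A minimal sketch of running the sample workflow above and pulling a task's
# output out of the final state (standard Prefect 1.x API; `flow` and `t`
# refer to the names defined in the preceding block):
state = flow.run()
assert state.is_successful()
print(state.result[t].result)  # [10, 20, 30]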
@task(name="process_data")
def process_data(country, cache_dir):
    """Run the scraper for a single country, writing output to the cache dir."""
    runner = Runner(alert_manager=None)
    # This would be equivalent to:
    #   covid-world-scraper --cache-dir=$PWD/covid-cache bra
    print(f"Processing {country}")
    runner.run(cache_dir=cache_dir, headless_status=True, filter=[country])


@task(name="list_countries")
def list_countries():
    """Return list of all countries available, which will be mapped to the
    process data function (to run in parallel if possible).
    """
    runner = Runner(alert_manager=None)
    return [x.split(' ')[0] for x in runner.list_countries()]


with Flow("dask-example") as flow:
    cache_dir = create_cachedir()
    countries = list_countries()
    process_data.map(country=countries, cache_dir=unmapped(cache_dir))

# Run and visualize the flow!
flow.register(project_name="Covid-Data-Scraper")
flow_state = flow.run(executor=executor)

# GraphViz is required for this visualization
try:
    flow.visualize(flow_state=flow_state)
except Exception as err:
    print(err)
def run_qa_test(
    db_connection_url: str,
    google_api_credentials_path: str,
    master_spreadsheet_id: Optional[str] = None,
    spreadsheet_ids_str: Optional[str] = None,
):
    """
    Run the QA test.

    Parameters
    ----------
    db_connection_url: str
        The DB's connection url str.
    google_api_credentials_path: str
        The path to the Google API credentials file needed to read Google Sheets.
    master_spreadsheet_id: Optional[str] = None
        The master spreadsheet id.
    spreadsheet_ids_str: Optional[str] = None
        The list of spreadsheet ids.
    """
    cluster = LocalCluster()
    # Log the dashboard link
    log.info(f"Dashboard available at: {cluster.dashboard_link}")
    # Set up the workflow
    with Flow("QA Test") as flow:
        # Get a list of spreadsheet ids
        spreadsheet_ids = _get_spreadsheet_ids(master_spreadsheet_id,
                                               google_api_credentials_path,
                                               spreadsheet_ids_str)
        # List of lists of db institutions
        db_institutions_data = _gather_db_institutions.map(
            spreadsheet_ids, unmapped(db_connection_url))
        # Db institutions with their db variables and composite variable data
        db_institutions = _gather_db_variables.map(
            flatten(db_institutions_data), unmapped(db_connection_url))
        # Group db institutions
        db_institutions_group = _group_db_institutions(db_institutions)
        # Extract list of lists of sheet data
        spreadsheets_data = _extract.map(spreadsheet_ids,
                                         unmapped(google_api_credentials_path))
        # Transform to a list of formatted sheet data
        formatted_spreadsheets_data = _transform.map(
            flatten(spreadsheets_data))
        # Create the institution filter
        gs_institution_filter = _create_filter_task([
            GoogleSheetsFormat.standard_institution,
            GoogleSheetsFormat.multiple_sigla_answer_variable,
            GoogleSheetsFormat.institution_and_composite_variable,
        ])
        # Filter to the list of institutional formatted sheet data
        gs_institutions_data = gs_institution_filter(
            formatted_spreadsheets_data)
        # Get a list of lists of gs institutions
        gs_institutions = _gather_gs_institutions.map(gs_institutions_data)
        # Create the composite filter
        gs_composite_filter = _create_filter_task([
            GoogleSheetsFormat.composite_variable,
            GoogleSheetsFormat.institution_and_composite_variable,
        ])
        # Filter to the list of composite formatted sheet data
        gs_composites = gs_composite_filter(formatted_spreadsheets_data)
        # Group gs institutions
        gs_institutions_group = _group_gs_institutions(
            flatten(gs_institutions))
        # Compare gs institutions against the db;
        # get back a list of comparisons
        gs_institution_comparisons = _compare_gs_institution.map(
            flatten(gs_institutions), unmapped(db_institutions_group))
        # Compare gs composite variables against db institutions;
        # get back a list of lists of comparisons
        gs_composite_comparisons = _compare_gs_composite_variable.map(
            gs_composites, unmapped(db_connection_url))
        # Write gs institution comparisons
        _write_comparison.map(gs_institution_comparisons)
        # Write gs composite comparisons
        _write_comparison.map(flatten(gs_composite_comparisons))
        # Write extra db institutions
        _write_extra_db_institutions(db_institutions, gs_institutions_group)

    # Run the flow
    state = flow.run(executor=DaskExecutor(cluster.scheduler_address))
    if state.is_failed():
        raise PrefectFlowFailure(ErrorInfo({"flow_name": flow.name}))
    # Get the write comparison tasks
    _write_comparison_tasks = flow.get_tasks(name="_write_comparison")
    # Get the comparisons
    comparisons = [
        *state.result[_write_comparison_tasks[0]].result,
        *state.result[_write_comparison_tasks[1]].result,
    ]
    # Filter to error comparisons
    gs_error_comparisons = [
        comparison for comparison in comparisons if comparison.has_error()
    ]
    # Get the extra db institutions filename
    extra_db_institutions_filename = state.result[flow.get_tasks(
        name="_write_extra_db_institutions")[0]].result
    # Write the zip file
    with ZipFile("qa-test.zip", "w") as zip_file:
        for comp in gs_error_comparisons:
            zip_file.write(
                comp.get_filename(),
                f"{comp.spreadsheet_title}/{comp.sheet_title},{comp.name}",
            )
        if extra_db_institutions_filename:
            zip_file.write(extra_db_institutions_filename,
                           "extra-institutions.csv")
@task(
    retry_delay=timedelta(minutes=1),
    nout=2,
    trigger=triggers.all_finished,
)
def create_parquet(_success):
    ts = prefect.context.scheduled_start_time
    dt_str = pd.to_datetime(ts).strftime("%Y-%m-%dT%H")
    vintage_fn = FN_STR.format(dt_str) + ".parquet"
    fn = FN_STR.format("") + ".parquet"

    df = pd.read_csv(CSV_FN, parse_dates=["dt"])
    df.to_parquet(DATA_PATH / vintage_fn, index=False)
    df.to_parquet(DATA_PATH / fn, index=False)
    return vintage_fn, fn


@task
def get_gcs_cmd(fn):
    return f"gsutil acl ch -u AllUsers:R gs://can-scrape-outputs/final/{fn}"


shell = ShellTask()

with Flow("UpdateParquetFiles", CronSchedule("10 */2 * * *")) as f:
    connstr = EnvVarSecret("COVID_DB_CONN_URI")
    success = export_to_csv(connstr)
    vintage_fn, fn = create_parquet(success)
    shell(get_gcs_cmd(vintage_fn))
    shell(get_gcs_cmd(fn))

f.register(project_name="can-scrape")
]

top_name = 'SramMinionRTL'
clk_period = 2.0  # ns

#-------------------------------------------------------------------------
# Instantiate step instances
#-------------------------------------------------------------------------

block    = Block( files, top_name, clk_period=clk_period, name='block' )
getadk   = ADK( adk_config, name='adk' )
vcd2saif = Vcd2Saif( name='vcd2saif' )
synth    = Synthesis( name='synth' )
pnr      = PlaceAndRoute( name='pnr', aspect_ratio=0.6, core_density=0.65 )
pwr      = PowerAnalysis( name='pwr' )
summary  = Summary( name='summary' )
memgen   = MemGen( name='memgen', inst_name='SRAM_32x128_1rw',
                   word_size=32, num_words=128 )

#-------------------------------------------------------------------------
# Instantiate the flow
#-------------------------------------------------------------------------

with Flow( 'ece5745-tut8-sram' ) as flow:
  sram    = memgen()
  adk     = getadk()
  rtl     = block()
  saif    = vcd2saif( vcd=rtl )
  netlist = synth( adk=adk, verilog=rtl, saif=saif, macro=sram )
  pnr_res = pnr( adk=adk, netlist=netlist, macro=sram )
  pwr_rpt = pwr( adk=adk, saif=saif, namemap=netlist, verilog=pnr_res )
  summary( post_pnr_reports=pnr_res, power_reports=pwr_rpt, saif=saif )
# Create task templates
upload_template = GCSUpload(bucket=BUCKET, blob=BLOB,
                            credentials_secret="GCS_CREDS")
download_template = GCSDownload(bucket=BUCKET, blob=BLOB,
                                credentials_secret="GCS_CREDS")
copy_template = GCSCopy(
    source_bucket=BUCKET,
    source_blob=BLOB,
    dest_bucket=BUCKET,
    credentials_secret="GCS_CREDS",
)

with Flow("GCS Example") as flow:
    # Upload with default settings
    upl = upload_template(data=DATA)
    dwl = download_template(upstream_tasks=[upl])

    # Upload to a new blob and download it
    upl_new = upload_template(data=DATA, blob="another/blob")
    dwl_new = download_template(blob=upl_new)

    # Copy the default blob twice
    cp_1 = copy_template(dest_blob="yet/another/blob", upstream_tasks=[upl])
    cp_2 = copy_template(source_blob=cp_1, dest_blob="one/last/blob")

    # Download the new blob
    dwl_new = download_template(blob=cp_2)
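# For the GCS example above to run locally, the "GCS_CREDS" secret must be
# resolvable. A minimal sketch of one way to provide it, injecting the secret
# through prefect.context; "gcs-credentials.json" is a hypothetical path to a
# real service-account credentials file:
import json

import prefect

with open("gcs-credentials.json", "r", encoding="utf-8") as fp:
    service_account_info = json.load(fp)  # hypothetical credentials file

# Local secret resolution falls back to prefect.context.secrets.
with prefect.context(secrets={"GCS_CREDS": service_account_info}):
    state = flow.run()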
def test_shell_task_raises_fail_if_cmd_fails():
    with Flow(name="test") as f:
        task = ShellTask()(command="ls surely_a_dir_that_doesnt_exist")
    out = f.run()
    assert out.is_failed()
    assert "Command failed with exit code" in str(out.result[task].message)
def test_nested_collections(self, val):
    with Flow("test") as f:
        task = tasks.as_task(val)
        f.add_task(task)
    assert f.run().result[task].result == val
from prefect import Task, Flow, task

# @task
# def do_something():
#     print("something")


class DoSomething(Task):
    def run(self):
        return "asdf"


class PrintVal(Task):
    def run(self, val):
        print(val)


do_something = DoSomething()
print_val = PrintVal()

flow = Flow("demo")
print_val.set_upstream(do_something, key="val", flow=flow)

flow.run()
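# The same dependency can be expressed with Prefect's functional API; a
# minimal sketch equivalent to the imperative set_upstream() version above:
from prefect import Flow, task


@task
def do_something():
    return "asdf"


@task
def print_val(val):
    print(val)


# Calling tasks inside the Flow context builds the same edge, binding
# do_something's return value to print_val's `val` argument.
with Flow("demo-functional") as flow:
    print_val(do_something())

flow.run()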
def test_nested_collections_of_mixed_constants_are_not_constants(self, val):
    with Flow("test"):
        task = tasks.as_task(val)
    assert not isinstance(task, Constant)
def test_as_task_toggles_constants(self):
    with Flow("test"):
        t = tasks.as_task(4)
    assert isinstance(t, Task)
    assert t.name == "4"
def test_tasks_have_all_non_unmapped_constant_args_as_transitive_upstream_deps(
    self,
):
    def func(a, b, c, d):
        m = inc.copy(name="m").bind(1)
        n = inc.copy(name="n").bind(a)
        o = inc.copy(name="o").bind(b)
        p = add.copy(name="p").bind(n, o)
        q = add.copy(name="q").bind(c, d)
        r = add.copy(name="r").bind(q, m)
        return m, n, o, p, q, r

    with Flow("test") as flow:
        a = ranged.copy(name="a").bind(3)
        b = inc.copy(name="b").bind(1)
        c = Constant(1, name="c")
        d = Constant(range(3), name="d")
        m, n, o, p, q, r = apply_map(func,
                                     a,
                                     edges.unmapped(b),
                                     c=edges.unmapped(c),
                                     d=d)

    def edge_info(task):
        """Returns a map of {upstream: (is_data_dep, is_mapped)}"""
        return {
            e.upstream_task: (e.key is not None, e.mapped)
            for e in flow.edges_to(task)
        }

    assert edge_info(m) == {
        a: (False, True),
        b: (False, False),
        d: (False, True),
    }
    assert edge_info(n) == {
        a: (True, True),
        b: (False, False),
        d: (False, True),
    }
    assert edge_info(o) == {
        a: (False, True),
        b: (True, False),
        d: (False, True),
    }
    assert edge_info(p) == {n: (True, True), o: (True, True)}
    assert edge_info(q) == {
        a: (False, True),
        b: (False, False),
        d: (True, True),
    }
    assert edge_info(r) == {q: (True, True), m: (True, True)}

    state = flow.run()
    res = {t: state.result[t].result for t in [m, n, o, p, q, r]}
    sol = {
        m: [2, 2, 2],
        n: [1, 2, 3],
        o: [3, 3, 3],
        p: [4, 5, 6],
        q: [1, 2, 3],
        r: [3, 4, 5],
    }
    assert res == sol
import prefect
from prefect import task, Flow
# from prefect.executors import LocalDaskExecutor


@task
def print_contexts():
    print("foo = " + prefect.context.foo)


with Flow("test") as flow:
    t1 = print_contexts()
    t2 = print_contexts()
    t1.set_downstream(t2)

# flow.executor = LocalDaskExecutor()

with prefect.context(foo="foo") as ctx:
    flow.run(task_contexts=ctx)
def test_shell_initializes_with_basic_cmd():
    with Flow(name="test") as f:
        task = ShellTask(command="echo -n 'hello world'")()
    out = f.run()
    assert out.is_successful()
    assert out.result[task].result == "hello world"
from prefect import task, Flow


@task
def say_hello():
    print("Hello, world!")


with Flow("My First Flow") as flow:
    say_hello()

flow.run()  # "Hello, world!"
def test_shell_initializes_and_multiline_output_returns_last_line():
    with Flow(name="test") as f:
        task = ShellTask()(command="echo -n 'hello world\n42'")
    out = f.run()
    assert out.is_successful()
    assert out.result[task].result == "42"
# Configurable value taken at runtime
length = Parameter(name="length", default=5, required=False)


# ETL Pipeline Tasks
@task(result=LocalResult(), target="{date:%A}/{task_name}.prefect")
def extract(length):
    # Extract the data
    return sample(range(100), length)


@task(max_retries=3, retry_delay=timedelta(seconds=5))
def transform(data):
    # Transform the data
    return data * 10


@task(trigger=some_successful(at_least=1, at_most=6))
def load(data):
    # Load the data
    print(f"\nHere's your data: {data}")


# Define Tasks in a Flow Context
with Flow('Evolving ETL', result=S3Result(bucket="flow-result-storage")) as flow:
    with case(length, 5):
        e = extract(length)
    with case(length, 50):
        e = extract(length)
    t = transform.map(e)
    l = load(t)

flow.run(parameters={'length': 50})  # Prints data
def test_shell_returns_none_if_empty_output():
    with Flow(name="test") as f:
        task = ShellTask()(command="ls > /dev/null")
    out = f.run()
    assert out.is_successful()
    assert out.result[task].result is None
from prefect import Task, Flow
from prefect.environments.storage import GitHub

with Flow("hUGe fLow") as flow:
    for i in range(2000):
        flow.add_task(Task(name=f"{i}"))

flow.storage = GitHub(
    repo="znicholasbrown/project-schematics",
    path="flows/hUGe_fLow.py",
    secrets=["GITHUB_AUTH_TOKEN"],
)

flow.register(project_name="Dev Straining")
def test_shell_initializes_and_multiline_output_optionally_returns_all_lines():
    with Flow(name="test") as f:
        task = ShellTask(return_all=True)(command="echo -n 'hello world\n42'")
    out = f.run()
    assert out.is_successful()
    assert out.result[task].result == ["hello world", "42"]
def timestamper(task, old_state, new_state):
    """
    Task state handler which timestamps new states and logs the duration
    between state changes using the task's logger.
    """
    new_state.timestamp = pendulum.now("utc")
    if hasattr(old_state, "timestamp"):
        duration = (new_state.timestamp - old_state.timestamp).in_seconds()
        task.logger.info(
            "{} seconds passed in between state transitions".format(duration)
        )
    return new_state


@task(state_handlers=[timestamper])
def sleeper():
    time.sleep(2)


f = Flow("log-task-duration", tasks=[sleeper])
f.run()
# INFO - prefect.FlowRunner | Beginning Flow run for 'log-task-duration'
# INFO - prefect.FlowRunner | Starting flow run.
# INFO - prefect.TaskRunner | Task 'sleeper': Starting task run...
# INFO - prefect.Task | 2 seconds passed in between state transitions
# INFO - prefect.TaskRunner | Task 'sleeper': finished task run for task with final state: 'Success'
# INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded
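# State handlers also work at the flow level; a minimal sketch, assuming the
# Prefect 1.x flow-handler signature (flow, old_state, new_state) -> new_state:
from prefect import Flow, task


def flow_logger(flow, old_state, new_state):
    # Flow-level handlers receive the Flow object instead of a Task and
    # must return the (possibly modified) new state.
    print(f"{flow.name}: {old_state} -> {new_state}")
    return new_state


@task
def noop():
    pass


f = Flow("flow-level-handler", tasks=[noop], state_handlers=[flow_logger])
f.run()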
def test_shell_raises_if_no_command_provided():
    with Flow(name="test") as f:
        ShellTask()()
    with pytest.raises(TypeError):
        with raise_on_exception():
            assert f.run()
    return response.text


@task
def load_file(filename: str) -> str:
    with open(filename, "r", encoding="utf-8") as file:
        return file.read()


@task
def printa(stuff):
    print(stuff)


task = ShellTask(return_all=True)

with Flow("shell") as f:
    translation_server_url = "http://localhost:1969"
    bibtex = load_file("./workspace/aksw-short.bib")
    zotero = import_translation(bibtex, translation_server_url)
    rdf = export_translation(zotero, translation_server_url,
                             "rdf_bibliontology")
    turtle = task(command="rapper - -o turtle -I www.test.com > tests.ttl")
    printa(turtle)

# Configure a custom run config for this flow, setting a custom image
f.run_config = DockerRun(image="prefecthq/prefect")
f.register(project_name="tutoriala")
# f.run()
def test_shell_runs_other_shells():
    with Flow(name="test") as f:
        task = ShellTask(shell="zsh")(command="echo -n $ZSH_NAME")
    out = f.run()
    assert out.is_successful()
    assert out.result[task].result == "zsh"
from prefect import Flow
from prefect.backend import FlowView
from prefect.run_configs import UniversalRun
from prefect.storage import Local

FLOW_DATA_1 = {
    "id": "id-1",
    "name": "name-1",
    "settings": {"key-1": "value-1"},
    "run_config": UniversalRun(env={"ENV-1": "VAL-1"}).serialize(),
    "serialized_flow": Flow("flow-1").serialize(),
    "archived": False,
    "project": {"name": "project-1"},
    "flow_group": {"labels": ["label-1"]},
    "core_version": "0.0.0",
    "storage": Local(stored_as_script=True, path="fake-path-1.py").serialize(),
}

FLOW_DATA_2 = {
    "id": "id-2",
    "name": "name-2",
    "settings": {
from prefect import task, Flow, Parameter


def run(flow, **parameters):
    state = flow.run(**parameters)
    terminal_task = list(flow.terminal_tasks())[0]
    return state.result[terminal_task].result


with Flow('Add one') as flow:
    result = Parameter('x') + 1

assert run(flow, x=1) == 2
assert run(flow, x=2) == 3
assert run(flow, x=-100) == -99

with Flow('Add x and y') as flow:
    result = Parameter('x') + Parameter('y')

assert run(flow, x=1, y=1) == 2
assert run(flow, x=40, y=2) == 42
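# The operator overloading above is sugar for an ordinary task; a minimal
# sketch of the explicit equivalent, reusing the run() helper defined above:
from prefect import Flow, Parameter, task


@task
def add(x, y):
    return x + y


with Flow('Add x and y (explicit)') as flow:
    result = add(Parameter('x'), Parameter('y'))

assert run(flow, x=40, y=2) == 42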
def flow(source: str,
         relpath: str,
         convert: bool = True,
         output_dir: str = '/tmp/otpev',
         version: str = '20.06',
         created: Optional[str] = None,
         n_partitions: Optional[int] = None) -> Flow:
    """Get OTP evidence import flow

    Parameters
    ----------
    source : str
        OTP evidence source (e.g. eva, l2g, uniprot)
    relpath : str
        Path relative from
        `gs://open-targets-data-releases/$VERSION/input/evidence-files`
        to data file or directory (e.g. "progeny-2018-07-23.json.gz" or
        "evidences_protein_fix/chembl_dataset")
    output_dir : str
        Directory in which temporary json/parquet files are stored
    version : str
        OTP release version
    created : str, optional
        Date at which OTP version was created. This should NOT be a time at
        which data was collected -- it is intended to reflect when OT created
        the release and should never change for the same `version`. For this
        reason, `created` will default to known release dates (see
        `OT_VERSION_RELEASE_DATES`).
    n_partitions : int, optional
        Number of partitions used to write parquet result. Set as None to use
        default partitioning.

    Raises
    ------
    KeyError
        If `created` is not provided and no known release date was previously
        recorded for the specified `version`

    Returns
    -------
    Flow
        Prefect Flow
    """
    version = str(version)
    if created is None:
        if version not in OT_VERSION_RELEASE_DATES:
            raise KeyError(
                f'No release date known for version "{version}" '
                '(pass `created` explicitly or add date to `OT_VERSION_RELEASE_DATES`)'
            )
        created = OT_VERSION_RELEASE_DATES[version]

    output_dir = Path(output_dir) / source
    if not output_dir.exists():
        output_dir.mkdir(parents=True, exist_ok=True)

    is_file = relpath.endswith("json.gz")
    src_url = OT_URL_FMT.format(version=version) + f'/{relpath.lstrip("/")}'
    entry = get_entry(source,
                      version,
                      created,
                      format='parquet' if is_file else 'json.gz',
                      type='file' if is_file else 'directory',
                      properties=None if is_file else dict(compression='gzip'))
    catalog_path = catalog.default_urlpath()

    with Flow(f'otpev-{source}-v{version}') as flow:
        # Add constants important to the DAG (all others are not visualized)
        catalog_path = constant(catalog_path, name='catalog_path')
        dst_url = next(iter(entry.resources.values()))
        entry = constant(entry,
                         name=f'entry.key={entry_key_str(entry.key)}',
                         value=False)
        n_partitions = constant(n_partitions, name='n_partitions')
        if is_file:
            filename = src_url.split('/')[-1]
            src_url = constant(src_url, name='src_url')
            dst_url = constant(dst_url, name='dst_url')

            # Download and convert to parquet
            json_path = constant(str(output_dir / filename), name='json_path')
            parquet_path = constant(str(output_dir / filename.split('.')[0]) +
                                    '.parquet',
                                    name='parquet_path')
            json_path = download(src_url, json_path)
            info = convert_to_parquet(json_path,
                                      parquet_path,
                                      n_partitions=n_partitions)

            # Upload results
            # pylint:disable=unexpected-keyword-arg
            status = upload(entry, parquet_path, dst_url, upstream_tasks=[info])
            add_entry(entry, info, catalog_path, upstream_tasks=[status])
        else:
            raise NotImplementedError(
                'Integration of data directories (rather than single files) '
                'not yet implemented')
    return flow
import prefect
from prefect import Flow, task
from prefect.engine.results import LocalResult


@task
def a():
    return None


@task
def b(foo):
    print("noooo")
    print(foo)
    # return 1/0


result = LocalResult(
    location="{flow_name}/"
    "{scheduled_start_time:%d-%m_%H-%M-%S}/"
    "{task_full_name}-{task_run_id}.prefect_result",
)

with Flow(
        name="results_issue",
        result=result,
        storage=prefect.environments.storage.Local(
            stored_as_script=True,
            path="/Users/josh/Desktop/code/Dummy-Flows/res_issue.py"),
) as flow:
    a = a()
    b = b(a)

# flow.register("Demo")