def run_sigla_pipeline(
    master_spreadsheet_id: str,
    google_api_credentials_path: str,
    db_connection_url: str,
):
    """
    Run the SIGLA ETL pipeline.

    Parameters
    ----------
    master_spreadsheet_id: str
        The master spreadsheet id.
    google_api_credentials_path: str
        The path to the Google API credentials file needed to read Google Sheets.
    db_connection_url: str
        The DB's connection url str.
    """
    # Delete all documents from the db
    _clean_up(db_connection_url)
    # Create a connection to the google sheets reader
    google_sheets_institution_extracter = GoogleSheetsInstitutionExtracter(
        google_api_credentials_path)
    # Get the list of spreadsheet ids from the master spreadsheet
    spreadsheets_id = google_sheets_institution_extracter.get_spreadsheets_id(
        master_spreadsheet_id)
    log.info("Finished pipeline set up, start running pipeline")
    log.info("=" * 80)
    # Spawn a local dask cluster
    cluster = LocalCluster()
    # Log the dashboard link
    log.info(f"Dashboard available at: {cluster.dashboard_link}")
    # Set up the workflow
    with Flow("Extract and transform") as flow:
        # Extract sheets data.
        # Get back a list of lists of SheetData
        spreadsheets_data = _extract.map(
            spreadsheets_id,
            unmapped(google_api_credentials_path),
        )
        # Flatten the list of lists of SheetData
        flattened_spreadsheets_data = _flatten_list(spreadsheets_data)
        # Transform the list of SheetData into FormattedSheetData
        _transform.map(flattened_spreadsheets_data)

    # Run the extract and transform flow
    state = flow.run(executor=DaskExecutor(cluster.scheduler_address))
    formatted_sheets_data = state.result[flow.get_tasks(
        name="_transform")[0]].result
    log.info("=" * 80)
    # Partition into institution and non-institution sheets
    institutions = _filter_formatted_sheet_data(
        formatted_sheets_data,
        [
            gs_format.standard_institution,
            # gs_format.institution_by_rows,
            gs_format.multiple_sigla_answer_variable,
        ],
    )
    non_institutions = _filter_formatted_sheet_data(
        formatted_sheets_data,
        [
            gs_format.institution_and_composite_variable,
            gs_format.composite_variable,
        ],
    )
    # Run the load institutions flow
    with Flow("Load institutions") as load_institutions_flow:
        _load.map(institutions, unmapped(db_connection_url))
    load_institutions_flow.run(
        executor=DaskExecutor(cluster.scheduler_address))
    log.info("=" * 80)
    # Run the load non-institutions flow
    with Flow("Load non institutions") as load_non_institutions_flow:
        _load.map(non_institutions, unmapped(db_connection_url))
    load_non_institutions_flow.run(
        executor=DaskExecutor(cluster.scheduler_address))
def test_no_raise_on_normal_flow(self):
    flow = Flow("THIS IS A TEST")
    assert healthchecks.environment_dependency_check([flow]) is None
from prefect import Flow, task


@task
def extract():
    """Get a list of data"""
    return [1, 2, 3]


@task
def transform(data):
    """Multiply the input by 10"""
    return [i * 10 for i in data]


@task
def load(data):
    """Print the data to indicate it was received"""
    print("Here's your data: {}".format(data))


with Flow("sample_workflow") as flow:
    e = extract()
    t = transform(e)
    l = load(t)
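# A minimal sketch of running the sample workflow above and pulling a task's
# output out of the final state (standard Prefect 1.x API; `flow` and `t`
# refer to the names defined in the preceding block):
state = flow.run()
assert state.is_successful()
print(state.result[t].result)  # [10, 20, 30]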
@task(name="process_data")
def process_data(country, cache_dir):
    """Run the scraper for a single country, writing output to the cache dir."""
    runner = Runner(alert_manager=None)
    # This would be equivalent to:
    #   covid-world-scraper --cache-dir=$PWD/covid-cache bra
    print(f"Processing {country}")
    runner.run(cache_dir=cache_dir, headless_status=True, filter=[country])


@task(name="list_countries")
def list_countries():
    """Return list of all countries available, which will be mapped to the
    process data function (to run in parallel if possible).
    """
    runner = Runner(alert_manager=None)
    return [x.split(' ')[0] for x in runner.list_countries()]


with Flow("dask-example") as flow:
    cache_dir = create_cachedir()
    countries = list_countries()
    process_data.map(country=countries, cache_dir=unmapped(cache_dir))

# Run and visualize the flow!
flow.register(project_name="Covid-Data-Scraper")
flow_state = flow.run(executor=executor)

# GraphViz is required for this visualization
try:
    flow.visualize(flow_state=flow_state)
except Exception as err:
    print(err)
def run_qa_test(
    db_connection_url: str,
    google_api_credentials_path: str,
    master_spreadsheet_id: Optional[str] = None,
    spreadsheet_ids_str: Optional[str] = None,
):
    """
    Run the QA test.

    Parameters
    ----------
    db_connection_url: str
        The DB's connection url str.
    google_api_credentials_path: str
        The path to the Google API credentials file needed to read Google Sheets.
    master_spreadsheet_id: Optional[str] = None
        The master spreadsheet id.
    spreadsheet_ids_str: Optional[str] = None
        The list of spreadsheet ids.
    """
    cluster = LocalCluster()
    # Log the dashboard link
    log.info(f"Dashboard available at: {cluster.dashboard_link}")
    # Set up the workflow
    with Flow("QA Test") as flow:
        # Get a list of spreadsheet ids
        spreadsheet_ids = _get_spreadsheet_ids(master_spreadsheet_id,
                                               google_api_credentials_path,
                                               spreadsheet_ids_str)
        # List of lists of db institutions
        db_institutions_data = _gather_db_institutions.map(
            spreadsheet_ids, unmapped(db_connection_url))
        # Db institutions with their db variables and composite variable data
        db_institutions = _gather_db_variables.map(
            flatten(db_institutions_data), unmapped(db_connection_url))
        # Group db institutions
        db_institutions_group = _group_db_institutions(db_institutions)
        # Extract list of lists of sheet data
        spreadsheets_data = _extract.map(spreadsheet_ids,
                                         unmapped(google_api_credentials_path))
        # Transform to a list of formatted sheet data
        formatted_spreadsheets_data = _transform.map(
            flatten(spreadsheets_data))
        # Create the institution filter
        gs_institution_filter = _create_filter_task([
            GoogleSheetsFormat.standard_institution,
            GoogleSheetsFormat.multiple_sigla_answer_variable,
            GoogleSheetsFormat.institution_and_composite_variable,
        ])
        # Filter to the list of institutional formatted sheet data
        gs_institutions_data = gs_institution_filter(
            formatted_spreadsheets_data)
        # Get a list of lists of gs institutions
        gs_institutions = _gather_gs_institutions.map(gs_institutions_data)
        # Create the composite filter
        gs_composite_filter = _create_filter_task([
            GoogleSheetsFormat.composite_variable,
            GoogleSheetsFormat.institution_and_composite_variable,
        ])
        # Filter to the list of composite formatted sheet data
        gs_composites = gs_composite_filter(formatted_spreadsheets_data)
        # Group gs institutions
        gs_institutions_group = _group_gs_institutions(
            flatten(gs_institutions))
        # Compare gs institutions against the db;
        # get back a list of comparisons
        gs_institution_comparisons = _compare_gs_institution.map(
            flatten(gs_institutions), unmapped(db_institutions_group))
        # Compare gs composite variables against db institutions;
        # get back a list of lists of comparisons
        gs_composite_comparisons = _compare_gs_composite_variable.map(
            gs_composites, unmapped(db_connection_url))
        # Write gs institution comparisons
        _write_comparison.map(gs_institution_comparisons)
        # Write gs composite comparisons
        _write_comparison.map(flatten(gs_composite_comparisons))
        # Write extra db institutions
        _write_extra_db_institutions(db_institutions, gs_institutions_group)

    # Run the flow
    state = flow.run(executor=DaskExecutor(cluster.scheduler_address))
    if state.is_failed():
        raise PrefectFlowFailure(ErrorInfo({"flow_name": flow.name}))
    # Get the write comparison tasks
    _write_comparison_tasks = flow.get_tasks(name="_write_comparison")
    # Get the comparisons
    comparisons = [
        *state.result[_write_comparison_tasks[0]].result,
        *state.result[_write_comparison_tasks[1]].result,
    ]
    # Filter to error comparisons
    gs_error_comparisons = [
        comparison for comparison in comparisons if comparison.has_error()
    ]
    # Get the extra db institutions filename
    extra_db_institutions_filename = state.result[flow.get_tasks(
        name="_write_extra_db_institutions")[0]].result
    # Write the zip file
    with ZipFile("qa-test.zip", "w") as zip_file:
        for comp in gs_error_comparisons:
            zip_file.write(
                comp.get_filename(),
                f"{comp.spreadsheet_title}/{comp.sheet_title},{comp.name}",
            )
        if extra_db_institutions_filename:
            zip_file.write(extra_db_institutions_filename,
                           "extra-institutions.csv")
@task(
    retry_delay=timedelta(minutes=1),
    nout=2,
    trigger=triggers.all_finished,
)
def create_parquet(_success):
    ts = prefect.context.scheduled_start_time
    dt_str = pd.to_datetime(ts).strftime("%Y-%m-%dT%H")
    vintage_fn = FN_STR.format(dt_str) + ".parquet"
    fn = FN_STR.format("") + ".parquet"

    df = pd.read_csv(CSV_FN, parse_dates=["dt"])
    df.to_parquet(DATA_PATH / vintage_fn, index=False)
    df.to_parquet(DATA_PATH / fn, index=False)
    return vintage_fn, fn


@task
def get_gcs_cmd(fn):
    return f"gsutil acl ch -u AllUsers:R gs://can-scrape-outputs/final/{fn}"


shell = ShellTask()

with Flow("UpdateParquetFiles", CronSchedule("10 */2 * * *")) as f:
    connstr = EnvVarSecret("COVID_DB_CONN_URI")
    success = export_to_csv(connstr)
    vintage_fn, fn = create_parquet(success)
    shell(get_gcs_cmd(vintage_fn))
    shell(get_gcs_cmd(fn))

f.register(project_name="can-scrape")
]

top_name = 'SramMinionRTL'
clk_period = 2.0  # ns

#-------------------------------------------------------------------------
# Instantiate step instances
#-------------------------------------------------------------------------

block    = Block( files, top_name, clk_period=clk_period, name='block' )
getadk   = ADK( adk_config, name='adk' )
vcd2saif = Vcd2Saif( name='vcd2saif' )
synth    = Synthesis( name='synth' )
pnr      = PlaceAndRoute( name='pnr', aspect_ratio=0.6, core_density=0.65 )
pwr      = PowerAnalysis( name='pwr' )
summary  = Summary( name='summary' )
memgen   = MemGen( name='memgen', inst_name='SRAM_32x128_1rw',
                   word_size=32, num_words=128 )

#-------------------------------------------------------------------------
# Instantiate the flow
#-------------------------------------------------------------------------

with Flow( 'ece5745-tut8-sram' ) as flow:
  sram    = memgen()
  adk     = getadk()
  rtl     = block()
  saif    = vcd2saif( vcd=rtl )
  netlist = synth( adk=adk, verilog=rtl, saif=saif, macro=sram )
  pnr_res = pnr( adk=adk, netlist=netlist, macro=sram )
  pwr_rpt = pwr( adk=adk, saif=saif, namemap=netlist, verilog=pnr_res )
  summary( post_pnr_reports=pnr_res, power_reports=pwr_rpt, saif=saif )
# Create task templates
upload_template = GCSUpload(bucket=BUCKET, blob=BLOB,
                            credentials_secret="GCS_CREDS")
download_template = GCSDownload(bucket=BUCKET, blob=BLOB,
                                credentials_secret="GCS_CREDS")
copy_template = GCSCopy(
    source_bucket=BUCKET,
    source_blob=BLOB,
    dest_bucket=BUCKET,
    credentials_secret="GCS_CREDS",
)

with Flow("GCS Example") as flow:
    # Upload with default settings
    upl = upload_template(data=DATA)
    dwl = download_template(upstream_tasks=[upl])

    # Upload to a new blob and download it
    upl_new = upload_template(data=DATA, blob="another/blob")
    dwl_new = download_template(blob=upl_new)

    # Copy the default blob twice
    cp_1 = copy_template(dest_blob="yet/another/blob", upstream_tasks=[upl])
    cp_2 = copy_template(source_blob=cp_1, dest_blob="one/last/blob")

    # Download the new blob
    dwl_new = download_template(blob=cp_2)
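# For the GCS example above to run locally, the "GCS_CREDS" secret must be
# resolvable. A minimal sketch of one way to provide it, injecting the secret
# through prefect.context; "gcs-credentials.json" is a hypothetical path to a
# real service-account credentials file:
import json

import prefect

with open("gcs-credentials.json", "r", encoding="utf-8") as fp:
    service_account_info = json.load(fp)  # hypothetical credentials file

# Local secret resolution falls back to prefect.context.secrets.
with prefect.context(secrets={"GCS_CREDS": service_account_info}):
    state = flow.run()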
def test_shell_task_raises_fail_if_cmd_fails():
    with Flow(name="test") as f:
        task = ShellTask()(command="ls surely_a_dir_that_doesnt_exist")
    out = f.run()
    assert out.is_failed()
    assert "Command failed with exit code" in str(out.result[task].message)
def test_nested_collections(self, val):
    with Flow("test") as f:
        task = tasks.as_task(val)
        f.add_task(task)
    assert f.run().result[task].result == val
from prefect import Task, Flow, task

# @task
# def do_something():
#     print("something")


class DoSomething(Task):
    def run(self):
        return "asdf"


class PrintVal(Task):
    def run(self, val):
        print(val)


do_something = DoSomething()
print_val = PrintVal()

flow = Flow("demo")
print_val.set_upstream(do_something, key="val", flow=flow)

flow.run()
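# The same dependency can be expressed with Prefect's functional API; a
# minimal sketch equivalent to the imperative set_upstream() version above:
from prefect import Flow, task


@task
def do_something():
    return "asdf"


@task
def print_val(val):
    print(val)


# Calling tasks inside the Flow context builds the same edge, binding
# do_something's return value to print_val's `val` argument.
with Flow("demo-functional") as flow:
    print_val(do_something())

flow.run()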
def test_nested_collections_of_mixed_constants_are_not_constants(self, val):
    with Flow("test"):
        task = tasks.as_task(val)
    assert not isinstance(task, Constant)
def test_as_task_toggles_constants(self):
    with Flow("test"):
        t = tasks.as_task(4)
    assert isinstance(t, Task)
    assert t.name == "4"
def test_tasks_have_all_non_unmapped_constant_args_as_transitive_upstream_deps(
    self,
):
    def func(a, b, c, d):
        m = inc.copy(name="m").bind(1)
        n = inc.copy(name="n").bind(a)
        o = inc.copy(name="o").bind(b)
        p = add.copy(name="p").bind(n, o)
        q = add.copy(name="q").bind(c, d)
        r = add.copy(name="r").bind(q, m)
        return m, n, o, p, q, r

    with Flow("test") as flow:
        a = ranged.copy(name="a").bind(3)
        b = inc.copy(name="b").bind(1)
        c = Constant(1, name="c")
        d = Constant(range(3), name="d")
        m, n, o, p, q, r = apply_map(func,
                                     a,
                                     edges.unmapped(b),
                                     c=edges.unmapped(c),
                                     d=d)

    def edge_info(task):
        """Returns a map of {upstream: (is_data_dep, is_mapped)}"""
        return {
            e.upstream_task: (e.key is not None, e.mapped)
            for e in flow.edges_to(task)
        }

    assert edge_info(m) == {
        a: (False, True),
        b: (False, False),
        d: (False, True),
    }
    assert edge_info(n) == {
        a: (True, True),
        b: (False, False),
        d: (False, True),
    }
    assert edge_info(o) == {
        a: (False, True),
        b: (True, False),
        d: (False, True),
    }
    assert edge_info(p) == {n: (True, True), o: (True, True)}
    assert edge_info(q) == {
        a: (False, True),
        b: (False, False),
        d: (True, True),
    }
    assert edge_info(r) == {q: (True, True), m: (True, True)}

    state = flow.run()
    res = {t: state.result[t].result for t in [m, n, o, p, q, r]}
    sol = {
        m: [2, 2, 2],
        n: [1, 2, 3],
        o: [3, 3, 3],
        p: [4, 5, 6],
        q: [1, 2, 3],
        r: [3, 4, 5],
    }
    assert res == sol
import prefect
from prefect import task, Flow
# from prefect.executors import LocalDaskExecutor


@task
def print_contexts():
    print("foo = " + prefect.context.foo)


with Flow("test") as flow:
    t1 = print_contexts()
    t2 = print_contexts()
    t1.set_downstream(t2)

# flow.executor = LocalDaskExecutor()

with prefect.context(foo="foo") as ctx:
    flow.run(task_contexts=ctx)
def test_shell_initializes_with_basic_cmd():
    with Flow(name="test") as f:
        task = ShellTask(command="echo -n 'hello world'")()
    out = f.run()
    assert out.is_successful()
    assert out.result[task].result == "hello world"
from prefect import task, Flow


@task
def say_hello():
    print("Hello, world!")


with Flow("My First Flow") as flow:
    say_hello()

flow.run()  # "Hello, world!"
def test_shell_initializes_and_multiline_output_returns_last_line():
    with Flow(name="test") as f:
        task = ShellTask()(command="echo -n 'hello world\n42'")
    out = f.run()
    assert out.is_successful()
    assert out.result[task].result == "42"
# Configurable value taken at runtime
length = Parameter(name="length", default=5, required=False)


# ETL Pipeline Tasks
@task(result=LocalResult(), target="{date:%A}/{task_name}.prefect")
def extract(length):
    # Extract the data
    return sample(range(100), length)


@task(max_retries=3, retry_delay=timedelta(seconds=5))
def transform(data):
    # Transform the data
    return data * 10


@task(trigger=some_successful(at_least=1, at_most=6))
def load(data):
    # Load the data
    print(f"\nHere's your data: {data}")


# Define Tasks in a Flow Context
with Flow('Evolving ETL', result=S3Result(bucket="flow-result-storage")) as flow:
    with case(length, 5):
        e = extract(length)
    with case(length, 50):
        e = extract(length)
    t = transform.map(e)
    l = load(t)

flow.run(parameters={'length': 50})  # Prints data
def test_shell_returns_none_if_empty_output():
    with Flow(name="test") as f:
        task = ShellTask()(command="ls > /dev/null")
    out = f.run()
    assert out.is_successful()
    assert out.result[task].result is None
from prefect import Task, Flow
from prefect.environments.storage import GitHub

with Flow("hUGe fLow") as flow:
    for i in range(2000):
        flow.add_task(Task(name=f"{i}"))

flow.storage = GitHub(
    repo="znicholasbrown/project-schematics",
    path="flows/hUGe_fLow.py",
    secrets=["GITHUB_AUTH_TOKEN"],
)

flow.register(project_name="Dev Straining")
def test_shell_initializes_and_multiline_output_optionally_returns_all_lines():
    with Flow(name="test") as f:
        task = ShellTask(return_all=True)(command="echo -n 'hello world\n42'")
    out = f.run()
    assert out.is_successful()
    assert out.result[task].result == ["hello world", "42"]
def timestamper(task, old_state, new_state):
    """
    Task state handler which timestamps new states and logs the duration
    between state changes using the task's logger.
    """
    new_state.timestamp = pendulum.now("utc")
    if hasattr(old_state, "timestamp"):
        duration = (new_state.timestamp - old_state.timestamp).in_seconds()
        task.logger.info(
            "{} seconds passed in between state transitions".format(duration)
        )
    return new_state


@task(state_handlers=[timestamper])
def sleeper():
    time.sleep(2)


f = Flow("log-task-duration", tasks=[sleeper])
f.run()
# INFO - prefect.FlowRunner | Beginning Flow run for 'log-task-duration'
# INFO - prefect.FlowRunner | Starting flow run.
# INFO - prefect.TaskRunner | Task 'sleeper': Starting task run...
# INFO - prefect.Task | 2 seconds passed in between state transitions
# INFO - prefect.TaskRunner | Task 'sleeper': finished task run for task with final state: 'Success'
# INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded
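# State handlers also work at the flow level; a minimal sketch, assuming the
# Prefect 1.x flow-handler signature (flow, old_state, new_state) -> new_state:
from prefect import Flow, task


def flow_logger(flow, old_state, new_state):
    # Flow-level handlers receive the Flow object instead of a Task and
    # must return the (possibly modified) new state.
    print(f"{flow.name}: {old_state} -> {new_state}")
    return new_state


@task
def noop():
    pass


f = Flow("flow-level-handler", tasks=[noop], state_handlers=[flow_logger])
f.run()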
def test_shell_raises_if_no_command_provided():
    with Flow(name="test") as f:
        ShellTask()()
    with pytest.raises(TypeError):
        with raise_on_exception():
            assert f.run()
    return response.text


@task
def load_file(filename: str) -> str:
    with open(filename, "r", encoding="utf-8") as file:
        return file.read()


@task
def printa(stuff):
    print(stuff)


task = ShellTask(return_all=True)

with Flow("shell") as f:
    translation_server_url = "http://localhost:1969"
    bibtex = load_file("./workspace/aksw-short.bib")
    zotero = import_translation(bibtex, translation_server_url)
    rdf = export_translation(zotero, translation_server_url,
                             "rdf_bibliontology")
    turtle = task(command="rapper - -o turtle -I www.test.com > tests.ttl")
    printa(turtle)

# Configure a custom run config for this flow, setting a custom image
f.run_config = DockerRun(image="prefecthq/prefect")
f.register(project_name="tutoriala")
# f.run()
def test_shell_runs_other_shells():
    with Flow(name="test") as f:
        task = ShellTask(shell="zsh")(command="echo -n $ZSH_NAME")
    out = f.run()
    assert out.is_successful()
    assert out.result[task].result == "zsh"
from prefect import Flow
from prefect.backend import FlowView
from prefect.run_configs import UniversalRun
from prefect.storage import Local

FLOW_DATA_1 = {
    "id": "id-1",
    "name": "name-1",
    "settings": {"key-1": "value-1"},
    "run_config": UniversalRun(env={"ENV-1": "VAL-1"}).serialize(),
    "serialized_flow": Flow("flow-1").serialize(),
    "archived": False,
    "project": {"name": "project-1"},
    "flow_group": {"labels": ["label-1"]},
    "core_version": "0.0.0",
    "storage": Local(stored_as_script=True, path="fake-path-1.py").serialize(),
}

FLOW_DATA_2 = {
    "id": "id-2",
    "name": "name-2",
    "settings": {
from prefect import task, Flow, Parameter


def run(flow, **parameters):
    state = flow.run(**parameters)
    terminal_task = list(flow.terminal_tasks())[0]
    return state.result[terminal_task].result


with Flow('Add one') as flow:
    result = Parameter('x') + 1

assert run(flow, x=1) == 2
assert run(flow, x=2) == 3
assert run(flow, x=-100) == -99

with Flow('Add x and y') as flow:
    result = Parameter('x') + Parameter('y')

assert run(flow, x=1, y=1) == 2
assert run(flow, x=40, y=2) == 42
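# The operator overloading above is sugar for an ordinary task; a minimal
# sketch of the explicit equivalent, reusing the run() helper defined above:
from prefect import Flow, Parameter, task


@task
def add(x, y):
    return x + y


with Flow('Add x and y (explicit)') as flow:
    result = add(Parameter('x'), Parameter('y'))

assert run(flow, x=40, y=2) == 42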
def flow(source: str,
         relpath: str,
         convert: bool = True,
         output_dir: str = '/tmp/otpev',
         version: str = '20.06',
         created: Optional[str] = None,
         n_partitions: Optional[int] = None) -> Flow:
    """Get OTP evidence import flow

    Parameters
    ----------
    source : str
        OTP evidence source (e.g. eva, l2g, uniprot)
    relpath : str
        Path relative from
        `gs://open-targets-data-releases/$VERSION/input/evidence-files`
        to data file or directory (e.g. "progeny-2018-07-23.json.gz" or
        "evidences_protein_fix/chembl_dataset")
    output_dir : str
        Directory in which temporary json/parquet files are stored
    version : str
        OTP release version
    created : str, optional
        Date at which OTP version was created. This should NOT be a time at
        which data was collected -- it is intended to reflect when OT created
        the release and should never change for the same `version`. For this
        reason, `created` will default to known release dates (see
        `OT_VERSION_RELEASE_DATES`).
    n_partitions : int, optional
        Number of partitions used to write parquet result. Set as None to use
        default partitioning.

    Raises
    ------
    KeyError
        If `created` is not provided and no known release date was previously
        recorded for the specified `version`

    Returns
    -------
    Flow
        Prefect Flow
    """
    version = str(version)
    if created is None:
        if version not in OT_VERSION_RELEASE_DATES:
            raise KeyError(
                f'No release date known for version "{version}" '
                '(pass `created` explicitly or add date to `OT_VERSION_RELEASE_DATES`)'
            )
        created = OT_VERSION_RELEASE_DATES[version]

    output_dir = Path(output_dir) / source
    if not output_dir.exists():
        output_dir.mkdir(parents=True, exist_ok=True)

    is_file = relpath.endswith("json.gz")
    src_url = OT_URL_FMT.format(version=version) + f'/{relpath.lstrip("/")}'
    entry = get_entry(source,
                      version,
                      created,
                      format='parquet' if is_file else 'json.gz',
                      type='file' if is_file else 'directory',
                      properties=None if is_file else dict(compression='gzip'))
    catalog_path = catalog.default_urlpath()

    with Flow(f'otpev-{source}-v{version}') as flow:
        # Add constants important to the DAG (all others are not visualized)
        catalog_path = constant(catalog_path, name='catalog_path')
        dst_url = next(iter(entry.resources.values()))
        entry = constant(entry,
                         name=f'entry.key={entry_key_str(entry.key)}',
                         value=False)
        n_partitions = constant(n_partitions, name='n_partitions')
        if is_file:
            filename = src_url.split('/')[-1]
            src_url = constant(src_url, name='src_url')
            dst_url = constant(dst_url, name='dst_url')

            # Download and convert to parquet
            json_path = constant(str(output_dir / filename), name='json_path')
            parquet_path = constant(str(output_dir / filename.split('.')[0]) +
                                    '.parquet',
                                    name='parquet_path')
            json_path = download(src_url, json_path)
            info = convert_to_parquet(json_path,
                                      parquet_path,
                                      n_partitions=n_partitions)

            # Upload results
            # pylint:disable=unexpected-keyword-arg
            status = upload(entry, parquet_path, dst_url, upstream_tasks=[info])
            add_entry(entry, info, catalog_path, upstream_tasks=[status])
        else:
            raise NotImplementedError(
                'Integration of data directories (rather than single files) '
                'not yet implemented')
    return flow
import prefect
from prefect import Flow, task
from prefect.engine.results import LocalResult


@task
def a():
    return None


@task
def b(foo):
    print("noooo")
    print(foo)
    # return 1/0


result = LocalResult(
    location="{flow_name}/"
    "{scheduled_start_time:%d-%m_%H-%M-%S}/"
    "{task_full_name}-{task_run_id}.prefect_result",
)

with Flow(
        name="results_issue",
        result=result,
        storage=prefect.environments.storage.Local(
            stored_as_script=True,
            path="/Users/josh/Desktop/code/Dummy-Flows/res_issue.py"),
) as flow:
    a = a()
    b = b(a)

# flow.register("Demo")