Example No. 1
def main():
    SOURCE = os.path.abspath('/Users/kenny.ells/data/dubai_his_arg.20191226')
    TARGET = os.path.abspath('tmp')
    FILES = sorted([os.path.join(SOURCE, f) for f in os.listdir(SOURCE)])
    FILES = FILES[:5]

    if not os.path.exists(TARGET):
        os.mkdir(TARGET)

    with Flow('plotting') as flow:
        plot.plot_roms.map(ncfile=FILES,
                           target=unmapped(TARGET),
                           varname=unmapped('temp'))

    # When calling dask-scheduler and dask-worker at the command line
    # address_tcp = 'tcp://10.90.69.73:8786'
    # print(address_tcp)
    # executor = DaskExecutor(address=address_tcp, local_processes=True, debug=True)

    # Defining the client in the script
    # client = Client()
    # print(client)
    # address_tcp = client.scheduler.address
    # executor = DaskExecutor(address=address_tcp)
    # flow.run(executor=executor)

    # Ignoring Dask
    flow.run()
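Every example on this page revolves around the same idea: `Task.map` fans a task out over an iterable, while `unmapped` marks the arguments that should be passed whole to every mapped child instead of being iterated. A minimal, self-contained sketch of that pattern (Prefect 1.x API; the task names are made up for illustration):

from prefect import Flow, Parameter, task, unmapped

@task
def add(x, y):
    # one child task per element of x; y is the same value for every child
    return x + y

@task
def total(values):
    return sum(values)

with Flow("unmapped-demo") as demo_flow:
    numbers = Parameter("numbers", default=[1, 2, 3])
    offset = Parameter("offset", default=10)
    shifted = add.map(x=numbers, y=unmapped(offset))  # offset is broadcast, not iterated
    grand_total = total(shifted)

if __name__ == "__main__":
    state = demo_flow.run()
    print(state.result[grand_total].result)  # 36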
Example No. 2
    def flow(self):

        if len(self.targets) == 1:
            target = self.targets[0]
        else:
            raise ValueError(
                "Zarr target requires self.targets be a length one list")

        with Flow(self.name,
                  storage=self.storage,
                  environment=self.environment) as _flow:
            # download to cache
            nc_sources = download.map(
                self.sources,
                cache_location=unmapped(self.cache_location),
            )

            # convert cached netcdf data to zarr
            cached_sources = nc2zarr.map(
                nc_sources,
                cache_location=unmapped(self.cache_location),
            )

            # combine all datasets into a single zarr archive
            combine_and_write(cached_sources, target)

        return _flow
Example No. 3
def build(year: int, league_id: int, cookies: dict) -> Flow:
    """
    Flow builder with the relevant tasks
    (increases modularity and abstraction)
    Args:
        year: (int) - year in which to make requests
        league_id: (int) - league id in which to make requests
        cookies: (dict) - auth cookies
    Returns:
        flow: (Flow) flow to be executed
    """
    with Flow("league_flow") as flow:
        year = Parameter("year")
        league_id = Parameter("league_id")
        cookies = Parameter("cookies")

        req = url_generator(year=year, league_id=league_id)

        meta = fetch_league_meta(base_url=req, cookies=cookies)

        fetch_team_meta.map(
            base_url=unmapped(req),
            team_id=meta["team_ids"],
            cookies=unmapped(cookies),
        )

    return flow
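A hedged usage sketch for a builder like the one above: since `year`, `league_id`, and `cookies` are re-declared as Parameters inside the flow, concrete values are supplied at run time (all values below are placeholders):

flow = build(year=2021, league_id=123456, cookies={"swid": "..."})
state = flow.run(parameters={
    "year": 2021,
    "league_id": 123456,
    "cookies": {"swid": "..."},  # placeholder auth cookie
})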
Example No. 4
def run(src_dir, dst_dir, config_path: str, debug=False):
    src_dir = Parameter("src_dir", src_dir)

    # create destination
    create_dir(dst_dir)
    dst_dir = Parameter("dst_dir", dst_dir)

    # number of workers
    config_path = Parameter("config_path", config_path)

    with Flow("inference_pipeline") as flow:
        # list tiles
        tiff_paths = find_src_files(src_dir, "h5")
        parted_tiff_paths = partition_path_list(tiff_paths, 5)

        prob_paths = infer.map(parted_tiff_paths, unmapped(config_path),
                               unmapped(dst_dir))
        prob_paths = combine_path_list(prob_paths)

    if debug:
        flow.visualize(filename="flow_debug")
    else:
        client = get_client()
        executor = DaskExecutor(address=client.scheduler.address)

        flow.run(executor=executor)
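A trimmed sketch of the Dask wiring used in this and the following pipelines, written directly against dask.distributed rather than the example's `get_client` helper: start (or connect to) a cluster, then point Prefect's DaskExecutor at its scheduler.

from dask.distributed import Client
from prefect.engine.executors import DaskExecutor

client = Client()  # spins up a LocalCluster by default
executor = DaskExecutor(address=client.scheduler.address)
# flow.run(executor=executor)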
Example No. 5
def run(src_dir, dst_dir, debug=False):
    src_dir = Parameter("src_dir", src_dir)

    # create destination
    create_dir(dst_dir)
    dst_dir = Parameter("dst_dir", dst_dir)

    with Flow("classify_pipeline") as flow:
        # load data
        h5_paths = find_src_files(src_dir, "h5")
        info = preload_array_info(h5_paths)
        prob_map = read_prob_map.map(h5_paths, unmapped(info))

        # classify
        label = classify.map(prob_map)

        # save
        tiff_paths = build_path.map(unmapped(dst_dir), h5_paths,
                                    unmapped("tif"))
        write_tiff.map(tiff_paths, label)

    if debug:
        flow.visualize()
    else:
        client = get_client()
        executor = DaskExecutor(address=client.scheduler.address)

        flow.run(executor=executor)
Example No. 6
    def flow(self):
        with Flow(self.name) as _flow:
            sources = source_url.map(self.days)
            nc_sources = download.map(sources,
                                      cache_location=unmapped(
                                          self.cache_location))
            chunked = chunk(nc_sources, size=self.files_per_chunk)
            writes = combine_and_write.map(chunked,
                                           unmapped(self.target_location),
                                           unmapped(self.concat_dim))
            consolidate_metadata(writes, self.target_location)

        return _flow
Example No. 7
def _make_flow(specs: Iterable[CopySpec]) -> prefect.Flow:
    with prefect.Flow("Rechunker") as flow:
        # iterate over different arrays in the group
        for spec in specs:
            copy_tasks = []
            # iterate over the different stages of the array copying
            for (source, target, chunks) in split_into_direct_copies(spec):
                keys = list(chunk_keys(source.shape, chunks))
                copy_task = _copy_chunk.map(prefect.unmapped(source),
                                            prefect.unmapped(target), keys)
                copy_tasks.append(copy_task)
            # create dependence between stages
            for n in range(len(copy_tasks) - 1):
                copy_tasks[n + 1].set_upstream(copy_tasks[n])
    return flow
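A stripped-down sketch of the dependency trick above: the object returned by `.map()` is itself a task, so `set_upstream` can chain whole mapped stages, making stage B wait for every child of stage A (the task body and labels are placeholders):

import prefect
from prefect import task

@task
def copy_chunk(stage_label, key):
    print(f"{stage_label}: copying chunk {key}")

with prefect.Flow("staged-copies") as staged_flow:
    keys = [(0, 0), (0, 1), (1, 0), (1, 1)]
    stage_a = copy_chunk.map(prefect.unmapped("source->intermediate"), keys)
    stage_b = copy_chunk.map(prefect.unmapped("intermediate->target"), keys)
    stage_b.set_upstream(stage_a)  # stage B only starts once all of stage A is done

staged_flow.run()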
Example No. 8
def main():

    with Flow("combine-purpleair-sensors") as flow:
        environment = Parameter("environment", default="staging")
        start = DateTimeParameter("start")
        interval_hour = Parameter("interval_hour", default=1)
        end = DateTimeParameter("end_inclusive")

        dts = datetime_range(start, interval_hour, end)
        client = create_purpleair_archive_client(environment)

        maybe_all_sensors_processed = extract_warehouse_purpleair_processed.map(
            dt=dts, purpleair_client=unmapped(client))
        all_sensors_processed = filter_failed(maybe_all_sensors_processed)

        combined_sensors = combine_sensors(all_sensors_processed)

        blob_client = create_combined_sensors_blob_client(environment)
        load_combined_sensors(combined_sensors, blob_client)

    # Registers flow to server, which we can then deploy and run in background agents.
    # flow.register(project_name="caqi-flows")

    # Immediately executes without agents
    from datetime import datetime
    flow.run(start=datetime(2020, 11, 23, 7),
             end_inclusive=datetime(2020, 11, 25, 8))
Example No. 9
    def to_prefect(self):
        """Compile the recipe to a Prefect.Flow object."""
        from prefect import Flow, task, unmapped

        has_cache_inputs = getattr(self, "cache_inputs", False)
        if has_cache_inputs:
            cache_input_task = task(self.cache_input, name="cache_input")
        prepare_target_task = task(self.prepare_target, name="prepare_target")
        store_chunk_task = task(self.store_chunk, name="store_chunk")
        finalize_target_task = task(self.finalize_target,
                                    name="finalize_target")

        with Flow("pangeo-forge-recipe") as flow:
            if has_cache_inputs:
                cache_task = cache_input_task.map(
                    input_key=list(self.iter_inputs()))
                upstream_tasks = [cache_task]
            else:
                upstream_tasks = []
            prepare_task = prepare_target_task(upstream_tasks=upstream_tasks)
            store_task = store_chunk_task.map(
                chunk_key=list(self.iter_chunks()),
                upstream_tasks=[unmapped(prepare_task)],
            )
            _ = finalize_target_task(upstream_tasks=[store_task])

        return flow
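Hedged usage note for the compiler above, assuming `recipe` is an instance of the (not shown) recipe class that defines `to_prefect`:

flow = recipe.to_prefect()
flow.run()  # serial local run
# or distribute the mapped cache/store stages over a Dask cluster, e.g.:
# from prefect.engine.executors import DaskExecutor
# flow.run(executor=DaskExecutor(address="tcp://127.0.0.1:8786"))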
Example No. 10
    def test_map_in_local_flow_run(self, MockFlowView, MockClient):
        MockFlowView.from_id.return_value.flow_id = "flow-id"

        with prefect.Flow("test") as flow:
            create_flow_run.map(
                flow_id=prefect.unmapped("flow-id"), labels=["a", "b", "c"]
            )

        flow.run()

        assert MockClient().create_flow_run.call_count == 3
        seen_idempotency_keys = set()
        for i, (expected_label, call) in enumerate(
            zip(["a", "b", "c"], MockClient().create_flow_run.calls)
        ):
            # Label is mapped over
            _, kwargs = call.args
            assert kwargs["label"] == expected_label

            # Idempotency keys are unique
            assert kwargs["idempotency_key"] not in seen_idempotency_keys
            seen_idempotency_keys.add(kwargs["idempotency_key"])

            # Idempotency keys include map index
            assert kwargs["idempotency_key"].endswith(f"-{i}")
Example No. 11
    def _build(self, *, base_url=DEFAULT_BASE_URL, form_id=None, **kwargs):

        fetch = FetchResponses(
            base_url=base_url,
            form_id=form_id,
            force=True,  # this task should always run!
        )
        save = SaveResponse(form_id=form_id)
        get_token = GetItem(name='GetResponseID', )
        get_user_hash = GetUserHash()
        add_protocol_metadata = AddStaticMetadata(
            new_meta={
                'protocol': {
                    'name': 'vr-questionnaire',
                    'extra': {
                        'form_id': form_id,
                    },
                }
            })
        add_user_metadata = AddDynamicMetadata(key=('omind', 'user_hash'), )
        report = Report()
        notify = SlackTask(
            preamble='Download of typeform responses finished.\nTask report:')

        with self:
            responses = fetch()
            response_id = get_token.map(task_result=responses,
                                        key=unmapped('response_id'))
            user_hash = get_user_hash.map(response=responses)
            files = save.map(response=responses, response_id=response_id)
            files_with_protocol = add_protocol_metadata.map(file=files)
            files_with_hash = add_user_metadata.map(file=files_with_protocol,
                                                    value=user_hash)
            message = report(files=files_with_hash)
            notify(message=message)
Example No. 12
def test_read_vault_secret(mocker):  # noqa: F811
    mocker.patch.object(vault_secrets, 'open')
    mocker.patch.object(vault_secrets.hvac, 'Client')
    with Flow("test") as f:
        secret_val = vault_secrets.VaultKVSecret(
            path="warehouses/test_platform/test_secret", version=2)
        get_val(unmapped(secret_val))
    state = f.run()
    assert state.is_successful()
Example No. 13
def get_task_kwargs(op, ref, maps):
    new_kwargs = {}
    for k, v in op.get('kwargs', {}).items():
        if isinstance(v, str) and v.startswith(':'):
            v = ref[v[1:]]
        else:
            v = Constant(v)
        if k not in maps:
            v = unmapped(v)
        new_kwargs[k] = v
    return new_kwargs
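A hypothetical walk-through of the helper above: string values starting with `:` are resolved against `ref`, everything else becomes a `Constant`, and any keyword not listed in `maps` is additionally wrapped in `unmapped` so it is broadcast to every mapped child.

op = {"kwargs": {"paths": ":file_list", "suffix": ".nc"}}
ref = {"file_list": ["a.nc", "b.nc", "c.nc"]}   # e.g. upstream task results by name
maps = {"paths"}                                # only "paths" should be mapped over

kwargs = get_task_kwargs(op, ref, maps)
# kwargs["paths"]  -> ["a.nc", "b.nc", "c.nc"]   (left as-is, will be mapped over)
# kwargs["suffix"] -> unmapped(Constant(".nc"))  (broadcast unchanged to every child)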
Example No. 14
def main():

    with Flow("reprocess-purpleair") as flow:
        environment = Parameter("environment", default="staging")
        start = DateTimeParameter("start")
        interval_hour = Parameter("interval_hour", default=1)
        end = DateTimeParameter("end_inclusive")

        dts = datetime_range(start, interval_hour, end)
        client = create_purpleair_archive_client(environment)

        all_sensors_raw = extract_warehouse_purpleair.map(
            dt=dts, purpleair_client=unmapped(client))
        all_sensors_processed = transform_all_sensors_raw.map(all_sensors_raw)
        blob_client = create_hour_blob_client.map(
            environment=unmapped(environment), dt=dts)
        load_all_sensors_processed.map(all_sensors_processed, blob_client)

    # Registers flow to server, which we can then deploy and run in background agents.
    flow.register(project_name="caqi-flows")
Example No. 15
    def pipelines_to_plan(self, pipelines: ParallelPipelines) -> Flow:
        with Flow("rechunker") as flow:
            for pipeline in pipelines:
                upstream_tasks = []  # type: List[task]
                for stage in pipeline.stages:
                    stage_task = task(stage.function, name=stage.name)
                    if stage.mappable is not None:
                        stage_task_called = stage_task.map(
                            # prefect doesn't accept a generator
                            list(stage.mappable),
                            config=unmapped(pipeline.config),
                            upstream_tasks=[unmapped(t) for t in upstream_tasks],
                        )
                    else:
                        stage_task_called = stage_task(
                            config=pipeline.config,
                            upstream_tasks=upstream_tasks,
                        )
                    upstream_tasks = [stage_task_called]
        return flow
Example No. 16
def run(src_dir, dst_dir, debug=False):
    src_dir = Parameter("src_dir", src_dir)

    # create destination
    create_dir(dst_dir)
    dst_dir = Parameter("dst_dir", dst_dir)

    with Flow("convert_pipeline") as flow:
        # load data
        tiff_paths = find_src_files(src_dir, "tif")
        info = preload_array_info(tiff_paths)
        raw_data = read_tiff.map(tiff_paths, unmapped(info))

        # save as zarr for faster access
        zarr_paths = build_path.map(unmapped(dst_dir), tiff_paths,
                                    unmapped("zarr"))
        zarr_paths = write_zarr.map(zarr_paths, raw_data, unmapped("raw"))

        # convert
        h5_paths = build_path.map(unmapped(dst_dir), zarr_paths,
                                  unmapped("h5"))
        zarr_to_h5.map(zarr_paths, h5_paths)

    if debug:
        flow.visualize()
    else:
        client = get_client()
        executor = DaskExecutor(address=client.scheduler.address)

        flow.run(executor=executor)
Example No. 17
def get_flow():
    with Flow(name='backfill-flow') as flow:
        start_date = Parameter('start_date', default='2020-01-01')
        end_date = Parameter('end_date', default='2020-02-01')
        tick_type = Parameter('tick_type', default='trades')
        symbols = Parameter('symbols', default=['GLD'])

        symbol_date_list = get_remaining_symbol_dates(start_date, end_date,
                                                      symbols, tick_type)

        backfill_date_task_result = backfill_date_task.map(
            symbol_date=symbol_date_list, tick_type=unmapped(tick_type))
    return flow
Example No. 18
    def flow(self):
        with Flow(self.name) as flow:
            # Map the `source_url` task over each day. This returns a mapped output,
            # a list of string URLS. See
            # https://docs.prefect.io/core/concepts/mapping.html#prefect-approach
            # for more. We'll have one output URL per day.
            sources = source_url.map(self.days)

            # Map the `download` task (provided by prefect) to download the raw data
            # into a cache.
            # Mapped outputs (sources) can be fed straight into another Task.map call.
            # If an input is just a regular argument that's not a mapping, it must
            # be wrapped in `prefect.unmapped`.
            # https://docs.prefect.io/core/concepts/mapping.html#unmapped-inputs
            # nc_sources will be a list of cached URLs, one per input day.
            nc_sources = download.map(sources,
                                      cache_location=unmapped(
                                          self.cache_location))

            # The individual files would be a bit too small for analysis. We'll use
            # pangeo_forge.utils.chunk to batch them up. We can pass mapped outputs
            # like nc_sources directly to `chunk`.
            chunked = pangeo_forge.utils.chunk(nc_sources, size=5)

            # Combine all the chunked inputs and write them to their final destination.
            writes = combine_and_write.map(
                chunked,
                unmapped(self.target_location),
                append_dim=unmapped("time"),
                concat_dim=unmapped("time"),
            )

            # Consolidate the metadata for the final dataset.
            consolidate_metadata(self.target_location, writes=writes)

        return flow
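The comments above describe a batch-then-map pattern worth isolating: the mapped download output is gathered into fixed-size batches by an intermediate task, and a second `.map` call then processes one batch per child. A self-contained sketch with stand-in task names (not the pangeo_forge helpers):

from prefect import Flow, task, unmapped

@task
def list_urls():
    return [f"file-{i}.nc" for i in range(10)]

@task
def batch_items(items, size):
    # gather the full mapped output, then group it into fixed-size batches
    return [items[i:i + size] for i in range(0, len(items), size)]

@task
def write_batch(batch, target):
    print(f"writing {len(batch)} files to {target}")

with Flow("chunked-writes") as batch_flow:
    urls = list_urls()
    batches = batch_items(urls, size=5)
    write_batch.map(batches, unmapped("memory://target.zarr"))  # placeholder target

batch_flow.run()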
Example No. 19
    def _build(self, *, base_url=DEFAULT_BASE_URL, form_id=None, **kwargs):
        required_families = dict(
            iguazu=None,
            omind=None,
            protocol=None,
            standard=None,
        )
        families = kwargs.get('families', {}) or {}  # could be None from default args
        for name in required_families:
            families.setdefault(name, required_families[name])
        kwargs['families'] = families

        # When the query is set by kwargs, leave the query and dialect as they
        # come. Otherwise, set to the default defined just above
        if not kwargs.get('query', None):
            kwargs['query'] = self.DEFAULT_QUERY
            kwargs['dialect'] = 'postgresql_json'

        # First part of this flow: obtain a dataset of files
        dataset_flow = GenericDatasetFlow(**kwargs)

        json_files = dataset_flow.terminal_tasks().pop()
        self.update(dataset_flow)

        create_flow_metadata = CreateFlowMetadata(flow_name=self.REGISTRY_NAME)
        read_json = LoadJSON()
        read_form = GetForm(form_id=form_id, base_url=base_url)
        extract_scores = ExtractScores(
            output_hdf5_key='/iguazu/features/typeform/subject', )
        # TODO: propagate metadata when the branch that has that task is merged
        propagate_metadata = PropagateMetadata(
            propagate_families=['omind', 'protocol'])
        update_flow_metadata = UpdateFlowMetadata(flow_name=self.REGISTRY_NAME)

        with self:
            create_noresult = create_flow_metadata.map(parent=json_files)
            form = read_form()
            responses = read_json.map(file=json_files,
                                      upstream_tasks=[create_noresult])
            scores = extract_scores.map(parent=json_files,
                                        response=responses,
                                        form=unmapped(form))
            scores_with_metadata = propagate_metadata.map(parent=json_files,
                                                          child=scores)
            _ = update_flow_metadata.map(parent=json_files,
                                         child=scores_with_metadata)
Example No. 20
    def to_prefect(self):
        """Compile the recipe to a Prefect.Flow object."""
        from prefect import Flow, task, unmapped

        # TODO: allow recipes to customize which stages to run
        cache_input_task = task(self.cache_input, name="cache_input")
        prepare_target_task = task(self.prepare_target, name="prepare_target")
        store_chunk_task = task(self.store_chunk, name="store_chunk")
        finalize_target_task = task(self.finalize_target, name="finalize_target")

        with Flow("pangeo-forge-recipe") as flow:
            cache_task = cache_input_task.map(input_key=list(self.iter_inputs()))
            upstream_tasks = [cache_task]
            prepare_task = prepare_target_task(upstream_tasks=upstream_tasks)
            store_task = store_chunk_task.map(
                chunk_key=list(self.iter_chunks()), upstream_tasks=[unmapped(prepare_task)],
            )
            _ = finalize_target_task(upstream_tasks=[store_task])

        return flow
Example No. 21
    def flow(self):

        with Flow(self.name,
                  storage=self.storage,
                  environment=self.environment) as _flow:
            # download to cache
            nc_sources = download.map(
                self.sources,
                cache_location=unmapped(self.cache_location),
            )

            first = True
            write_tasks = []
            for source_group in chunked_iterable(nc_sources,
                                                 self.files_per_chunk):
                write_task = combine_and_write(source_group,
                                               self.target_location,
                                               self.concat_dim,
                                               first=first)
                write_tasks.append(write_task)
                first = False
            cm = consolidate_metadata(self.target_location)

        return _flow
Example No. 22
        "Meta Data": meta_data,
        "Time Series (15min)": data
    })


@task
def persist_data_in_influx(injector: Injector,
                           av_response: InterdayResponseModel,
                           secrets: Dict[str, str]):
    influx_v2_client = injector.get(InfluxDBClient)
    influx_v2_client.write_api(SYNCHRONOUS).write(
        secrets['INFLUX_V2_BUCKET'],
        record=interday_response_model_to_points(av_response))


schedule = IntervalSchedule(interval=timedelta(hours=24))

with Flow("scrap-stock", schedule) as flow:
    injector = create_secret_injector_task()
    token_renewal_result = renew_token_task(injector)
    secrets = fetch_secret_task('common', 'kv', injector)
    stocks = Parameter("stocks", default=["GOOGL", "MSFT"])
    av_response = scrap_stock.map(stocks, secrets=unmapped(secrets))
    persist_data_in_influx.map(injector=unmapped(injector),
                               av_response=av_response,
                               secrets=unmapped(secrets))

flow.storage = GitHub(repo="piokra/prefect-tutorial", path="scrap_stock.py")

flow.run()
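A trimmed sketch of the scheduling and storage wiring above (repo, path, and the task body are placeholders): once registered, an agent pulls the flow definition from GitHub and runs it every 24 hours.

from datetime import timedelta

from prefect import Flow, task
from prefect.schedules import IntervalSchedule
from prefect.storage import GitHub

@task
def say_hello():
    print("hello")

schedule = IntervalSchedule(interval=timedelta(hours=24))

with Flow("scheduled-demo", schedule=schedule) as demo_flow:
    say_hello()

demo_flow.storage = GitHub(repo="user/repo", path="flows/demo.py")  # placeholders
# demo_flow.register(project_name="demo")  # required before an agent can run it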
Example No. 23
def generate_movies(
    img: Union[str, Path],
    distributed_executor_port: Optional[Union[str, int]] = None,
    save_path: Optional[Union[str, Path]] = None,
    operating_dim: str = Dimensions.Time,
    overwrite: bool = False,
    fps: int = 12,
    quality: int = 6,
    save_format: str = "mp4",
    save_workflow: bool = False,
    normalization_func: Callable = single_channel_percentile_norm,
    normalization_kwargs: Dict[str, Any] = {},
    projection_func: Callable = single_channel_max_project,
    projection_kwargs: Dict[str, Any] = {},
    S: Optional[Union[int, slice]] = None,
    C: Optional[Union[int, slice]] = None,
    B: Union[int, slice] = 0,
) -> Path:
    """
    Generate a movie for every scene and channel pair found in a file through an
    operating dimension.

    Parameters
    ----------
    img: Union[str, Path]
        Path to a CZI file to read and generate movies for.
    distributed_executor_port: Optional[Union[str, int]]
        If provided a port to use for connecting to the distributed scheduler. All image
        computation and workflow tasks will be distributed using Dask.
        Default: None
    save_path: Optional[Union[str, Path]]
        A specific path to save the generated movies to.
        Default: A directory named after the provided file.
    operating_dim: str
        Which dimension to operate through for each frame of the movie.
        Default: Dimensions.Time ("T")
    overwrite: bool
        Should existing files found under the same directory name be overwritten.
        Default: False
    fps: int
        Frames per second of each produced movie.
        Default: 12
    quality: int
        ImageIO's compression system. 0 is high compression, 10 is no compression.
        Default: 6
    save_format: str
        Which movie format should be used for each produced file.
        Default: mp4
        Available: mov, avi, mpg, mpeg, mp4, mkv, wmv
    save_workflow: bool
        Optionally, save a PNG and PDF of the workflow that ran.
        If this is set to True, be sure you have installed graphviz and added
        its executable to your PATH.
        Default: False
    normalization_func: Callable
        A function to normalize the entire movie data prior to projection.
        Default: timelapse_tools.normalization.single_channel_percentile_norm
    normalization_kwargs: Dict[str, Any]
        Any extra arguments to pass to the normalization function.
        Default: {}
    projection_func: Callable
        A function to project the data for at each frame of the movie.
        Default: timelapse_tools.projection.single_channel_max_project
    projection_kwargs: Dict[str, Any]
        Any extra arguments to pass to the projection function.
        Default: {}
    S: Optional[Union[int, slice]]
        A specific integer or slice to use for selecting down the scenes to process.
        Default: None (process all scenes)
    C: Optional[Union[int, slice]]
        A specific integer or slice to use for selecting down the channels to process.
        Default: None (process all channels)
    B: Union[int, slice]
        A specific integer or slice to use for selecting down the B dimension to process.
        Default: 0
    Returns
    -------
    save_path: Path
        The path to the produced scene-channel pairings of movies.
    """
    if distributed_executor_port:
        from prefect.engine.executors import DaskExecutor

        executor = DaskExecutor(
            address=f"tcp://localhost:{distributed_executor_port}")
    else:
        from prefect.engine.executors import LocalExecutor

        executor = LocalExecutor()

    # Run all processing through prefect + dask for better
    # parallelization and task optimization
    with Flow("czi_to_mp4_conversion") as flow:
        # Convert img to Path
        img = Path(img).expanduser().resolve(strict=True)

        # Determine save path
        save_path = _get_save_path(save_path=save_path,
                                   overwrite=overwrite,
                                   fname=img.with_suffix("").name)

        # Setup and check image and operating dimension provided
        img_details = _img_prep(
            img=img,
            operating_dim=operating_dim,
            # Don't run if save path checking failed
            upstream_tasks=[save_path],
        )

        # Select scene data
        img_details = _select_dimension(
            img=img_details[0],
            dims=img_details[1],
            dim_name=Dimensions.Scene,
            dim_indicies_selected=S,
        )

        # Select channel data
        img_details = _select_dimension(
            img=img_details[0],
            dims=img_details[1],
            dim_name=Dimensions.Channel,
            dim_indicies_selected=C,
        )

        # Select 'B' data
        img_details = _select_dimension(
            img=img_details[0],
            dims=img_details[1],
            dim_name=Dimensions.B,
            dim_indicies_selected=B,
        )

        # Generate all the index sets we will need to process
        getitem_indicies = _generate_getitem_indicies(
            img_shape=_get_image_shape(img_details[0]), dims=img_details[1])

        # Generate all the movie selections
        to_process = _generate_process_list(img=img_details[0],
                                            getitem_indicies=getitem_indicies)

        # Generate a list of dictionaries that map dimension to selected data
        selected_indices = _generate_selected_dims_list(
            dims=img_details[1], getitem_indicies=getitem_indicies)

        # Generate movies for each
        _generate_movie.map(
            data=to_process,
            selected_indices=selected_indices,
            dims=unmapped(img_details[1]),
            operating_dim=unmapped(operating_dim),
            save_path=unmapped(save_path),
            fps=unmapped(fps),
            save_format=unmapped(save_format),
            normalization_func=unmapped(normalization_func),
            normalization_kwargs=unmapped(normalization_kwargs),
            projection_func=unmapped(projection_func),
            projection_kwargs=unmapped(projection_kwargs),
        )

    # Run the flow
    state = flow.run(executor=executor)

    # Get resulting path
    save_path = state.result[flow.get_tasks(name="_get_save_path")[0]].result

    # Save the flow viz to the same save_path
    if save_workflow:
        flow.visualize(filename=str(save_path / "workflow.png"))

    return save_path
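A hypothetical invocation of the function above (the CZI path and port are placeholders): when a port is supplied the per-movie work goes through the DaskExecutor, otherwise it falls back to the LocalExecutor.

movie_dir = generate_movies(
    img="example_timelapse.czi",      # placeholder CZI path
    distributed_executor_port=8786,   # assumes a dask-scheduler is listening locally
    operating_dim="T",
    fps=24,
    save_format="mp4",
)
print(f"Movies written to: {movie_dir}")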
Example No. 24
        result_handler=GCSResultHandler(bucket='prefect_results')) as flow:
    _url = Parameter("url", default='http://www.insidethex.co.uk/')
    _bypass = Parameter("bypass", default=False, required=False)
    _db_file = Parameter("db_file", default='xfiles_db.sqlite', required=False)

    # scrape the website
    _home_page = retrieve_url(_url)
    _episodes = create_episode_list(base_url=_url,
                                    main_html=_home_page,
                                    bypass=_bypass)
    _episode = retrieve_url.map(_episodes)
    _dialogue = scrape_dialogue.map(_episode)

    # insert into SQLite table
    _db = create_db(filename=_db_file)
    _final = insert_episode.map(episode=_dialogue, tbl=unmapped(_db))

if __name__ == '__main__':
    # debug the local execution of the flow
    import sys
    import argparse
    from prefect.utilities.debug import raise_on_exception

    # get any CLI arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--visualize', required=False, default=False)
    parser.add_argument('--deploy', required=False, default=False)
    p = parser.parse_args(sys.argv[1:])

    if p.visualize:
        # view the DAG
Example No. 25
    },
    python_dependencies=[
        "python-dotenv",
        "boto3",
        "botocore",
    ],
    ignore_healthchecks=True,
    # only an extreme poweruser should use this ^
)
run_config = DockerRun(
    env={"sample_key": "sample_value"},
    labels=["docker"],
)

with Flow(
    "Upload to S3", 
    storage=storage, 
    run_config=run_config
) as flow:
    files_to_download = Parameter(
        name="File List", 
        default=["data/test_data.csv", "data/user_data.csv", "data/event_data.csv"]
    )
    conn = connect_to_s3()
    upload_to_s3.map(
        s3_client=unmapped(conn), 
        file_path=create_filepath.map(files_to_download)
    )

# flow.run()
flow.register(project_name="AWS")
Example No. 26
def create_event_index_pipeline(
    config: EventIndexPipelineConfig,
    n_grams: int = 1,
    store_local: bool = False,
) -> Flow:
    """
    Create the Prefect Flow object to preview, run, or visualize for indexing
    all events in the database.

    Parameters
    ----------
    config: EventIndexPipelineConfig
        Configuration options for the pipeline.
    n_grams: int
        N number of terms to act as a unique entity. Default: 1
    store_local: bool
        Should the generated index be stored locally to disk or uploaded to database.
        Storing the local index is useful for testing search result rankings with the
        `search_cdp_events` bin script.
        Default: False (store to database)

    Returns
    -------
    flow: Flow
        The constructed CDP Event Index Pipeline as a Prefect Flow.
    """
    with Flow("CDP Event Index Pipeline") as flow:
        # Ensure stopwords are downloaded
        # Do this once to ensure that we don't enter a race condition
        # with multiple workers trying to download / read overtop one another
        # later on.
        try:
            from nltk.corpus import stopwords

            stopwords.words("english")
        except LookupError:
            import nltk

            nltk.download("stopwords")
            log.info("Downloaded nltk stopwords")
            from nltk.corpus import stopwords

            stopwords.words("english")

        # Get all transcripts
        all_transcripts = get_transcripts(
            credentials_file=config.google_credentials_file)

        # Select highest confidence transcript for each session
        selected_transcripts = get_highest_confidence_transcript_for_each_session(
            transcripts=all_transcripts)

        # Get all transcripts for each event (multi-session events)
        event_transcripts = get_transcripts_per_event(
            transcripts=selected_transcripts)

        # Read all transcripts for each event and generate grams
        all_event_transcript_n_grams = read_transcripts_and_generate_grams.map(
            event_transcripts=event_transcripts,
            n_grams=unmapped(n_grams),
            credentials_file=unmapped(config.google_credentials_file),
        )

        # Convert to dataframe for tfidf calc
        all_events_n_grams = convert_all_n_grams_to_dataframe(
            all_events_n_grams=all_event_transcript_n_grams, )

        # Weighted n grams by tfidf
        scored_n_grams = compute_tfidf(
            n_grams=all_events_n_grams,
            datetime_weighting_days_decay=config.datetime_weighting_days_decay,
        )

        # Route to local storage task or remote bulk upload
        if store_local:
            store_local_index(n_grams_df=scored_n_grams, n_grams=n_grams)

        # Route to remote database storage
        else:
            chunked_scored_n_grams = chunk_n_grams(scored_n_grams)
            store_n_gram_chunk.map(
                n_gram_chunk=chunked_scored_n_grams,
                credentials_file=unmapped(config.google_credentials_file),
            )

    return flow
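Hedged usage sketch, assuming `config` is an EventIndexPipelineConfig instance as in the signature above:

flow = create_event_index_pipeline(config, n_grams=2, store_local=True)
flow.visualize()    # preview the task graph (requires graphviz)
state = flow.run()  # execute locally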
Example No. 27
with Flow(name="Test-Get-Imbalances", result=result_h) as tsx_imb_fl:
    
    tsx_url = Parameter("tsx_url", default="https://api.tmxmoney.com/mocimbalance/en/TSX/moc.html")
    imb_tbl_nm = Parameter("imb_tbl_nm", default="moc_tst")
    n_conn = Parameter("n_conn", default=1) 
    
    # Scrape the website
    tsx_imb_df = get_tsx_moc_imb(tsx_url)

    # Get the connection string from prefect cloud 
    conn_str = PrefectSecret("moc_pgdb_conn")
    
    # Partition the df to 
    tsx_imb_df_lst = partition_df(tsx_imb_df, n_conn)

    df_shape = df_to_db.map(tsx_imb_df_lst, tbl_name=unmapped(imb_tbl_nm), conn_str=unmapped(conn_str))

if __name__ == "__main__":

    # Inputs
    tsx_url = 'https://api.tmxmoney.com/mocimbalance/en/TSX/moc.html'
    backup_url = "https://web.archive.org/web/20200414202757/https://api.tmxmoney.com/mocimbalance/en/TSX/moc.html"

    # Script
    from prefect.engine.executors import LocalExecutor

    tsx_imb_fl.visualize()
    fl_state = tsx_imb_fl.run(
        parameters=dict(
            tsx_url=backup_url,
            n_conn=4
Example No. 28
def _define_model_selection_flow():
    """Define flow that runs model selection.

    Specifically data filtering, partitioning and model selection
    and optional persistence on a given dataset

    Returns
    -------
    prefect.Flow
    """

    from prefect import task, Flow, Parameter, unmapped

    with Flow("model selection") as flow:
        df = Parameter("data")
        grid_search = Parameter("grid_search")
        target_col_name = Parameter("target_col_name")
        country_code_column = Parameter("country_code_column")
        include_rules = Parameter("include_rules")
        exclude_rules = Parameter("exclude_rules")
        parallel_over_columns = Parameter("parallel_over_columns")
        partition_columns = Parameter("partition_columns")
        frequency = Parameter("frequency")
        output_path = Parameter("output_path")
        persist_cv_data = Parameter("persist_cv_data")
        persist_cv_results = Parameter("persist_cv_results")
        persist_model_reprs = Parameter("persist_model_reprs")
        persist_best_model = Parameter("persist_best_model")
        persist_partition = Parameter("persist_partition")
        persist_model_selector_results = Parameter(
            "persist_model_selector_results")
        df_filtered = task(filter_data)(df=df,
                                        include_rules=include_rules,
                                        exclude_rules=exclude_rules)

        partitions = task(partition_data)(df=df_filtered,
                                          partition_by=parallel_over_columns)

        parallel_over_dicts, partition_dfs = partitions["labels"], partitions[
            "data"]

        train_data = task(prepare_data_for_training).map(
            df=partition_dfs,
            frequency=unmapped(frequency),
            partition_columns=unmapped(partition_columns),
            parallel_over_columns=unmapped(parallel_over_columns),
            country_code_column=unmapped(country_code_column),
        )
        results = task(select_model).map(
            df=train_data,
            target_col_name=unmapped(target_col_name),
            grid_search=unmapped(grid_search),
            partition_columns=unmapped(partition_columns),
            parallel_over_dict=parallel_over_dicts,
            frequency=unmapped(frequency),
            country_code_column=unmapped(country_code_column),
        )

        write_ok = task(persist_experts_in_physical_partition).map(
            results=results,
            folder_path=unmapped(output_path),
            persist_cv_results=unmapped(persist_cv_results),
            persist_cv_data=unmapped(persist_cv_data),
            persist_model_reprs=unmapped(persist_model_reprs),
            persist_best_model=unmapped(persist_best_model),
            persist_partition=unmapped(persist_partition),
            persist_model_selector_results=unmapped(
                persist_model_selector_results),
        )

    flow.set_reference_tasks([write_ok])

    return flow
Example No. 29

class DoNotLikeEven(Exception):
    pass


@task(name="multiply input if even",
      max_retries=1,
      retry_delay=timedelta(seconds=5))
def transform(x: int, factor: int) -> int:
    """Multiply the input by `factor`"""
    if (x % 2) == 0:
        raise DoNotLikeEven(f'Do not like even numbers and received {x}')
    return x * factor


@task(trigger=some_successful(at_least=1, at_most=6),
      state_handlers=[slack_notifier])
def load(data: list):
    """Print the data to indicate it was received"""
    print("Here's your output data: {}".format(data))


# Set dependency graph
with Flow('ETL') as flow:
    e = extract()
    t = transform.map(e, unmapped(factor))
    l = load(t)

# with prefect.context(secrets=dict(SLACK_WEBHOOK_URL="https://hooks.slack.com/services/XXX" )):
#     flow.run()
#     quit()
Example No. 30
def download_cdp_dataset(args: Args):
    # Try running the download pipeline
    try:
        # Get instance config
        instance_config = getattr(configs, args.instance_name.upper())

        # Create connection to instance
        cdp_instance = CDPInstance(instance_config)

        # Get speaker annotated transcripts
        sats = cdp_instance.database.select_rows_as_list(
            "transcript", [("confidence", 0.97)])

        # Spawn local dask cluster
        cluster = LocalCluster()

        # Log dashboard link
        log.info(f"Dashboard available at: {cluster.dashboard_link}")

        # Setup workflow
        with Flow("get_dataset") as flow:
            # Download videos
            video_paths = _download_video.map(
                [sat["event_id"] for sat in sats],
                unmapped(cdp_instance.database), unmapped(args.save_dir),
                unmapped(args.overwrite))

            # Split audio from video
            audio_paths = _split_audio_from_video.map(video_paths,
                                                      unmapped(args.overwrite))

            # Download transcripts
            transcript_paths = _download_transcript.map(
                [sat["event_id"] for sat in sats],
                unmapped(cdp_instance.database),
                unmapped(cdp_instance.file_store), unmapped(args.save_dir),
                unmapped(args.overwrite))

            # Create large audio manifest
            events = _generate_initial_download_manifest(
                [sat["event_id"] for sat in sats], video_paths, audio_paths,
                transcript_paths, args.save_dir)

            # Generate sentence splits
            manifests = _generate_splits.map(events, unmapped(args.overwrite))

            # Generate splits manifest
            _generate_splits_manifest(manifests, unmapped(args.save_dir))

        # Run the flow
        state = flow.run(executor=DaskExecutor(cluster.scheduler_address))

        # Log resulting manifest
        manifest_save_path = (state.result[flow.get_tasks(
            name="_generate_splits_manifest")[0]].result)
        log.info(f"Dataset manifest stored to: {manifest_save_path}")

    # Catch any exception
    except Exception as e:
        log.error("=============================================")
        if args.debug:
            log.error("\n\n" + traceback.format_exc())
            log.error("=============================================")
        log.error("\n\n" + str(e) + "\n")
        log.error("=============================================")
        sys.exit(1)