Exemplos de Stage em Python, exemplos de parallelpipe.Stage em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: pipes.py Projeto: wri/glad_tiles_pipeline

def preprocessed_tile_pipe(tile_ids, **kwargs):
    """
    Pipeline to download and process GLAD alerts of previous years.

    The stages listed below are chained in order and the pipe is drained
    inside this function; every result is only written to the debug log.

    :param tile_ids: List of Tile IDs to process
    :param kwargs: Dictionary with keyword arguments. Must contain "workers";
        the whole dictionary is forwarded to most of the stages.
    :return: None
    """
    n_workers = kwargs["workers"]

    stages = [
        Stage(download_preprocessed_tiles_years, name="day_conf",
              **kwargs).setup(workers=n_workers),
        Stage(download_preprocessed_tiles_year, name="download",
              **kwargs).setup(workers=n_workers),
        Stage(change_pixel_depth, name="pixel_depth",
              **kwargs).setup(workers=n_workers),
        Stage(encode_date_conf, name="encode_day_conf",
              **kwargs).setup(workers=n_workers),
        # Important: the collect_* stages are pinned to a single worker
        # (presumably so one process sees every item -- confirm before
        # changing the worker count).
        Stage(collect_day_conf_pairs).setup(workers=1),
        Stage(combine_date_conf_pairs, name="day_conf",
              **kwargs).setup(workers=n_workers),
        Stage(collect_day_conf, **kwargs).setup(workers=1),  # Important
        Stage(merge_years, name="day_conf", **kwargs).setup(workers=n_workers),
        Stage(upload_preprocessed_tiles_s3, **kwargs).setup(workers=n_workers),
    ]

    # Chain the stages left to right, starting from the raw tile ids.
    pipe = tile_ids
    for stage in stages:
        pipe = pipe | stage

    for result in pipe.results():
        logging.debug("Preprocess output: " + str(result))
    logging.info("Preprocess - Done")

Exemplo n.º 2

0

Exibir arquivo

Arquivo: pipes.py Projeto: wri/glad_tiles_pipeline

def date_conf_merge_pipe(tile_ids, **kwargs):
    """
    Pipeline to process latest GLAD alerts.

    :param tile_ids: List of Tile IDs to process
    :param kwargs: Dictionary with keyword arguments. Must contain "workers";
        the whole dictionary is forwarded to most of the stages.
    :return: List with every output yielded by the pipe
    """
    n_workers = kwargs["workers"]

    stages = [
        Stage(download_latest_tiles, name="download",
              **kwargs).setup(workers=n_workers),
        Stage(change_pixel_depth, name="pixel_depth",
              **kwargs).setup(workers=n_workers),
        Stage(upload_raw_tile_s3, name="pixel_depth",
              **kwargs).setup(workers=n_workers),
        Stage(encode_date_conf, name="encode_day_conf",
              **kwargs).setup(workers=n_workers),
        # Important: the collect_* stages are pinned to a single worker
        # (presumably so one process sees every item -- confirm before
        # changing the worker count).
        Stage(collect_day_conf_pairs).setup(workers=1),
        Stage(combine_date_conf_pairs, name="day_conf",
              **kwargs).setup(workers=n_workers),
        Stage(collect_day_conf_all_years, **kwargs).setup(workers=1),  # Important!
        Stage(merge_years, name="day_conf", **kwargs).setup(workers=n_workers),
        Stage(upload_day_conf_s3_archive, **kwargs).setup(workers=n_workers),
    ]

    # Chain the stages left to right, starting from the raw tile ids.
    pipe = tile_ids
    for stage in stages:
        pipe = pipe | stage

    collected = []
    for result in pipe.results():
        collected.append(result)
        logging.debug("Date Conf  output: " + str(result))
    logging.info("Date Conf - Done")

    return collected

Exemplo n.º 3

0

Exibir arquivo

Arquivo: test_parallelpipe.py Projeto: gtsystem/parallelpipe

    def test_three_reduce(self):
        """Producer/Mapper/Reducer configuration.

        Runs a three-stage pipe and checks both the number of results and
        their combined sum.
        """
        producer = Stage(t1, range(1000)).setup(workers=4, qsize=10)
        mapper = Stage(t2, 5).setup(workers=4, qsize=1000)
        reducer = Stage(t3, sum).setup(workers=2, qsize=3)
        pipe = producer | mapper | reducer
        res = list(pipe.results())

        expected = sum(range(5, 1005)) * 4

        # The reducer runs with 2 workers and 2 results are expected.
        # assertEqual is used because the assertEquals alias was removed in
        # Python 3.12.
        self.assertEqual(len(res), 2)
        self.assertEqual(sum(res), expected)

Exemplo n.º 4

0

Exibir arquivo

Arquivo: test_parallelpipe.py Projeto: gtsystem/parallelpipe

    def test_two_class_instance(self):
        """Producer/Consumer configuration.

        One of the tasks is actually a bound method of a class instance
        (``T2(5).produce``).
        """
        job = T2(5).produce
        producer = Stage(t1, range(1000)).setup(workers=4, qsize=10)
        consumer = Stage(job).setup(workers=4, qsize=1000)
        pipe = producer | consumer
        res = list(pipe.results())

        # assertEqual is used because the assertEquals alias was removed in
        # Python 3.12.
        self.assertEqual(max(res), 1004)
        self.assertEqual(min(res), 5)
        # we are running 4 parallel producers
        self.assertEqual(len(res), 1000 * 4)

Exemplo n.º 5

0

Exibir arquivo

Arquivo: test_parallelpipe.py Projeto: gtsystem/parallelpipe

    def test_two_reduce(self):
        """Producer/Reducer configuration.

        Checks the single reduced value through ``results()`` and then again
        through ``execute()`` on the same pipe.
        """
        producer = Stage(t1, range(1000)).setup(workers=4, qsize=10)
        reducer = Stage(t3, sum).setup(workers=1, qsize=3)
        pipe = producer | reducer
        res = list(pipe.results())

        expected = sum(range(1000)) * 4

        # assertEqual is used because the assertEquals alias was removed in
        # Python 3.12.
        self.assertEqual(len(res), 1)
        self.assertEqual(res[0], expected)

        # let's try execute here..

        res = pipe.execute()
        self.assertEqual(res, expected)

Exemplo n.º 6

0

Exibir arquivo

Arquivo: pipes.py Projeto: wri/glad_tiles_pipeline

def download_climate_data(tile_ids, **kwargs):
    """
    Run the emissions and climate-mask download stages for the given tiles.

    Every result yielded by the pipe is only written to the debug log.

    :param tile_ids: Tile IDs fed into the pipe
    :param kwargs: Dictionary with keyword arguments. Must contain "workers";
        the whole dictionary is forwarded to both stages.
    :return: None
    """
    n_workers = kwargs["workers"]

    emissions_stage = Stage(
        download_emissions, name="emissions", return_input=True, **kwargs
    ).setup(workers=n_workers)
    climate_mask_stage = Stage(
        download_climate_mask, name="climate_mask", return_input=True, **kwargs
    ).setup(workers=n_workers)

    pipe = tile_ids | emissions_stage | climate_mask_stage

    for result in pipe.results():
        logging.debug("Download climate data output: " + str(result))
    logging.info("Download climate data - Done")

Exemplo n.º 7

0

Exibir arquivo

Arquivo: test_parallelpipe.py Projeto: gtsystem/parallelpipe

    def test_exception_propagation(self):
        """A failure inside a stage must surface as a TaskException.

        First the mapper is configured to fail, then the producer.
        """
        # assertRaisesRegexp was removed in Python 3.12, while
        # assertRaisesRegex does not exist on Python 2.7; use whichever the
        # running unittest provides.
        assert_raises_regex = (
            getattr(self, "assertRaisesRegex", None) or self.assertRaisesRegexp
        )

        producer = Stage(t1, range(1000)).setup(workers=2, qsize=10)
        mapper = Stage(t2, 5, 200).setup(workers=6, qsize=1000)
        reducer = Stage(t3, sum).setup(workers=2, qsize=3)
        pipe = producer | mapper | reducer

        with assert_raises_regex(TaskException, "failed at 200"):
            # Drain the pipe; the results themselves are irrelevant.
            for _ in pipe.results():
                pass

        # Same mapper and reducer, but now the producer is the failing stage.
        producer = Stage(t1, range(1000), 10).setup(workers=2, qsize=10)
        pipe = producer | mapper | reducer

        with assert_raises_regex(TaskException, "failed at 10"):
            for _ in pipe.results():
                pass

Exemplo n.º 8

0

Exibir arquivo

Arquivo: pipes.py Projeto: wri/glad_tiles_pipeline

def rgb_pipe(**kwargs):
    """
    Encode, project and upload the resampled tiles found under ``root``.

    Every result yielded by the pipe is only written to the debug log.

    :param kwargs: Dictionary with keyword arguments. Must contain "root" and
        "workers"; the whole dictionary is forwarded to the upload stage.
    :return: None
    """
    root = kwargs["root"]
    n_workers = kwargs["workers"]

    # Materialize the collected pairs before they are fed into the pipe.
    tile_pairs = list(collect_resampled_tiles(root))

    encode_stage = Stage(encode_rgb).setup(workers=n_workers)
    project_stage = Stage(project).setup(workers=n_workers)
    upload_stage = Stage(upload_rgb_wm_s3, **kwargs).setup(workers=n_workers)

    pipe = tile_pairs | encode_stage | project_stage | upload_stage

    for result in pipe.results():
        logging.debug("RGB output: " + str(result))
    logging.info("RGB - Done")

Exemplo n.º 9

0

Exibir arquivo

Arquivo: test_parallelpipe.py Projeto: gtsystem/parallelpipe

    def test_two(self):
        """Producer/Consumer configuration.

        The consumer is exercised twice: behind a 4-worker producer stage and
        directly behind a plain ``range(1000)`` iterable.
        """
        producer = Stage(t1, range(1000)).setup(workers=4, qsize=10)
        consumer = Stage(t2, 5).setup(workers=4, qsize=1000)
        pipe = producer | consumer
        res = list(pipe.results())

        # assertEqual is used because the assertEquals alias was removed in
        # Python 3.12.
        self.assertEqual(max(res), 1004)
        self.assertEqual(min(res), 5)
        # we are running 4 parallel producers
        self.assertEqual(len(res), 1000 * 4)

        pipe = range(1000) | consumer
        res = list(pipe.results())

        self.assertEqual(max(res), 1004)
        self.assertEqual(min(res), 5)
        # the iterable feeds the pipe directly, so 1000 results are expected
        self.assertEqual(len(res), 1000)

Exemplo n.º 10

0

Exibir arquivo

Arquivo: pipes.py Projeto: wri/glad_tiles_pipeline

def resample_date_conf_pipe(tiles, **kwargs):
    """
    Chain one ``resample`` stage per zoom level, from ``max_zoom`` down to 0.

    The first stage (at ``max_zoom``) uses the "near" resample method; every
    lower zoom level uses "mode". Results are only written to the debug log.

    :param tiles: Tiles fed into the pipe
    :param kwargs: Dictionary with keyword arguments. Must contain "workers"
        and "max_zoom"; the whole dictionary is forwarded to every stage.
    :return: None
    """
    n_workers = kwargs["workers"]
    top_zoom = kwargs["max_zoom"]

    def _resample_stage(method, zoom):
        # One resample stage for the given method and zoom level.
        return Stage(
            resample,
            name="day_conf",
            resample_method=method,
            zoom=zoom,
            **kwargs
        ).setup(workers=n_workers)

    pipe = tiles | _resample_stage("near", top_zoom)
    for zoom in reversed(range(top_zoom)):
        pipe = pipe | _resample_stage("mode", zoom)

    for result in pipe.results():
        logging.debug("Resample Day Conf output: " + str(result))
    logging.info("Resample Day Conf - Done")

Exemplo n.º 11

0

Exibir arquivo

Arquivo: pipes.py Projeto: wri/glad_tiles_pipeline

def tilecache_pipe(**kwargs):
    """
    Run the tilecache stages for the RGB tiles under ``root``, then upload.

    The pipe is drained first (results are only written to the debug log);
    ``upload_tilecache_s3`` is called afterwards.

    :param kwargs: Dictionary with keyword arguments. Must contain "root",
        "workers", "min_zoom" and "max_tilecache_zoom"; the whole dictionary
        is forwarded to the stages and to ``upload_tilecache_s3``.
    :return: None
    """
    root = kwargs["root"]
    n_workers = kwargs["workers"]

    # Materialize the collected tiles; the list is used twice below.
    zoom_tiles = list(collect_rgb_tiles(root))
    tile_ids = collect_rgb_tile_ids(zoom_tiles)

    stages = [
        Stage(generate_vrt, kwargs["min_zoom"], kwargs["max_tilecache_zoom"],
              **kwargs).setup(workers=n_workers),
        Stage(generate_tilecache_mapfile, **kwargs).setup(workers=n_workers),
        Stage(generate_tilecache_config, **kwargs).setup(workers=n_workers),
        Stage(generate_tile_list, tile_ids=tile_ids,
              **kwargs).setup(workers=n_workers),
        Stage(save_tile_lists, **kwargs).setup(workers=n_workers),
        Stage(generate_tiles, **kwargs).setup(workers=n_workers),
    ]

    # Chain the stages left to right, starting from the collected tiles.
    pipe = zoom_tiles
    for stage in stages:
        pipe = pipe | stage

    for result in pipe.results():
        logging.debug("Tilecache output: " + str(result))

    upload_tilecache_s3(**kwargs)

    logging.info("Tilecache - Done")

Exemplo n.º 12

0

Exibir arquivo

Arquivo: test_download_tiles.py Projeto: wri/glad_tiles_pipeline

def test_download_preprocessed_tiles_years(mock_sp, mock_path):
    """
    Exercise ``download_preprocessed_tiles_years`` through a one-stage pipe.

    The pipe is run twice with different return values configured on the
    mocked ``check_call``: the first run may only yield the expected output
    path, the second run must yield nothing.
    """
    preprocessed_years = [2015, 2016, 2017]
    tile_ids = ["050W_00N_040W_10N"]
    root = "/home/thomas/projects/gfw-sync/glad_tiles_pipeline/tests/data"
    name = "download"
    year_str = "_".join(map(str, preprocessed_years))

    def build_pipe():
        # Fresh single-stage pipe over the test tile ids.
        return tile_ids | Stage(
            download_preprocessed_tiles_years,
            preprocessed_years=preprocessed_years,
            root=root,
            name=name,
        )

    mock_path.mkdir.return_value = "home/user/data"

    expected = output_file(
        root, "tiles", tile_ids[0], "date_conf", year_str, "day_conf.tif"
    )

    # NOTE(review): the mock *returns* the exception class, it does not raise
    # it -- confirm this is the intended setup.
    mock_sp.check_call.return_value = sp.CalledProcessError

    for result in build_pipe().results():
        assert result == expected

    mock_sp.check_call.return_value = True

    # Draining the pipe into a list makes "nothing was returned" explicit.
    leftovers = list(build_pipe().results())
    assert leftovers == []

Exemplo n.º 13

0

Exibir arquivo

Arquivo: pipes.py Projeto: wri/glad_tiles_pipeline

def intensity_pipe(tiles, **kwargs):
    """
    Prepare intensity tiles and resample them from ``max_zoom`` down to 0.

    The resample stage at ``max_zoom`` uses the "near" method; every lower
    zoom level uses "bilinear". Results are only written to the debug log.

    :param tiles: Tiles fed into the pipe
    :param kwargs: Dictionary with keyword arguments. Must contain "workers"
        and "max_zoom"; the whole dictionary is forwarded to most stages.
    :return: None
    """
    n_workers = kwargs["workers"]
    top_zoom = kwargs["max_zoom"]

    def _resample_stage(method, zoom):
        # One resample stage for the given method and zoom level.
        return Stage(
            resample,
            name="intensity",
            resample_method=method,
            zoom=zoom,
            **kwargs
        ).setup(workers=n_workers)

    unset_stage = Stage(unset_no_data_value).setup(workers=n_workers)
    prep_stage = Stage(prep_intensity, name="day_conf",
                       **kwargs).setup(workers=n_workers)

    pipe = tiles | unset_stage | prep_stage | _resample_stage("near", top_zoom)
    for zoom in reversed(range(top_zoom)):
        pipe = pipe | _resample_stage("bilinear", zoom)

    for result in pipe.results():
        logging.debug("Intensity output: " + str(result))
    logging.info("Intensity - Done")

Exemplo n.º 14

0

Exibir arquivo

def upload_tiles(
    tile_cache: str,
    dataset: str,
    version: str,
    bucket: str = "gfw-tiles",
    implementation: str = "default",
    cores: int = CORES,
) -> None:
    """Upload a local tile cache to S3.

    Everything yielded by ``get_tiles(tile_cache)`` is piped through a
    ``copy_tiles`` stage running with ``cores`` workers; each result is
    logged at debug level.
    """

    LOGGER.info(
        f"Upload tile cache to {dataset}/{version}/{implementation} using {cores} processes"
    )

    # Source of the pipe: the tiles found in the local cache.
    tiles = get_tiles(tile_cache)

    copy_stage = Stage(
        copy_tiles, tile_cache, bucket, dataset, version, implementation
    ).setup(workers=cores)

    pipe = tiles | copy_stage

    # Drain the pipe so that every stage actually runs.
    for result in pipe.results():
        LOGGER.debug(result)

Exemplo n.º 15

0

Exibir arquivo

Arquivo: raster_pipe.py Projeto: wri/gfw_pixetl

    def create_tiles(
        self, overwrite: bool
    ) -> Tuple[List[Tile], List[Tile], List[Tile], List[Tile]]:
        """Raster Pipe.

        Collects the tiles, pipes them through transform, upload and work-dir
        cleanup, and returns the four tile lists produced by
        ``self._process_pipe``: processed, skipped, failed and existing tiles.
        """

        LOGGER.info("Start Raster Pipe")

        collected = self.collect_tiles(overwrite=overwrite)

        # Use at least one worker, even when there is nothing to process.
        GLOBALS.workers = max(self.tiles_to_process, 1)

        transform_stage = Stage(self.transform).setup(workers=GLOBALS.workers)
        pipe = collected | transform_stage | self.upload_file | self.delete_work_dir

        processed, skipped, failed, existing = self._process_pipe(pipe)

        LOGGER.info("Finished Raster Pipe")
        return processed, skipped, failed, existing

Exemplo n.º 16

0

Exibir arquivo

Arquivo: test_parallelpipe.py Projeto: gtsystem/parallelpipe

    def test_one(self):
        """Only producer configuration.

        Also checks that the same stage can be run a second time and that a
        single-result task works through ``execute()``.
        """
        producer = Stage(t1, range(1000)).setup(workers=4, qsize=10)
        res = list(producer.results())

        # assertEqual is used because the assertEquals alias was removed in
        # Python 3.12.
        self.assertEqual(max(res), 999)
        self.assertEqual(min(res), 0)
        # we are running 4 parallel producers
        self.assertEqual(len(res), 1000 * 4)

        # let's reuse a pipe again
        res = list(producer.results())

        self.assertEqual(max(res), 999)
        self.assertEqual(min(res), 0)
        # we are running 4 parallel producers
        self.assertEqual(len(res), 1000 * 4)

        # task with one result
        producer = Stage(t3, range(1000), sum).setup(workers=4, qsize=10)
        res = producer.execute()
        self.assertEqual(res, sum(range(1000)))

Exemplo n.º 17

0

Exibir arquivo

Arquivo: pipes.py Projeto: wri/glad_tiles_pipeline

def csv_export_pipe(**kwargs):
    """
    Export the preprocessed day/conf tiles to CSV files.

    The pipe first writes and uploads an "output" CSV, then converts the
    data to x/y/z records and saves one "db/<zoom>" CSV per zoom level from
    ``max_zoom`` down to 0. Results are only written to the debug log.

    :param kwargs: Dictionary with keyword arguments. Must contain "root",
        "years", "workers" and "max_zoom"; the whole dictionary is forwarded
        to most stages.
    :return: None
    """
    root = kwargs["root"]
    years = [str(year) for year in kwargs["years"]]
    n_workers = kwargs["workers"]
    top_zoom = kwargs["max_zoom"]

    day_conf_tiles = get_preprocessed_tiles(root, include_years=years)

    # Data frame columns to write and the header names used for them.
    columns_csv = ["lon", "lat", "confidence", "year", "julian_day", "area",
                   "val1", "val2"]
    header_csv = ["long", "lat", "confidence", "year", "julian_day", "area",
                  "emissions", "climate_mask"]

    columns_xyz = ["x", "y", "z", "alert_count", "alert_date", "confidence"]
    header_xyz = ["x", "y", "z", "alert_count", "alert_date", "confidence"]

    def _save_xyz_stage(zoom):
        # Stage writing the x/y/z CSV of one zoom level.
        return Stage(
            save_csv,
            name="db/{}".format(zoom),
            columns=columns_xyz,
            header=header_xyz,
            return_input=True,
            **kwargs
        ).setup(workers=n_workers)

    stages = [
        Stage(match_emissions, name="emissions",
              **kwargs).setup(workers=n_workers),
        Stage(match_climate_mask, name="climate_mask",
              **kwargs).setup(workers=n_workers),
        Stage(get_dataframe).setup(workers=n_workers),
        Stage(decode_day_conf).setup(workers=n_workers),
        Stage(save_csv,
              name="output",
              columns=columns_csv,
              header=header_csv,
              return_input=True,
              **kwargs).setup(workers=n_workers),
        Stage(upload_csv_s3, name="output", **kwargs).setup(workers=n_workers),
        Stage(convert_julian_date).setup(workers=n_workers),
        Stage(convert_latlon_xyz, **kwargs).setup(workers=n_workers),
        Stage(group_by_xyz).setup(workers=n_workers),
        _save_xyz_stage(top_zoom),
    ]

    pipe = day_conf_tiles
    for stage in stages:
        pipe = pipe | stage

    # Each lower zoom level is derived from the level above it.
    for zoom in reversed(range(top_zoom)):
        pipe = pipe | Stage(convert_to_parent_xyz).setup(workers=n_workers)
        pipe = pipe | Stage(group_by_xyz).setup(workers=n_workers)
        pipe = pipe | _save_xyz_stage(zoom)

    for result in pipe.results():
        logging.debug("Export CSV output: " + str(result))
    logging.info("Export CSV - Done")

Exemplo n.º 18

0

Exibir arquivo

Arquivo: raster2points.py Projeto: azavea/raster2points

def raster2df(*src_rasters,
              col_names=None,
              max_block_size=4096,
              calc_area=False,
              workers=1):
    """
    Converts rasters into a Pandas DataFrame.
    Input rasters must match cell size and extent.
    The first raster determines the number of output rows.
    Only cells which are above the given threshold / not NoData are processed.
    The tool calculates lat lon for every grid cell and extracts the cell value.
    If more than one input raster is provided, the tool adds additional
    columns with the corresponding values.

    The opened sources are always closed before returning, also when
    processing fails.

    :param src_rasters: Input rasters (one or many)
    :param col_names: Column names for input raster values (optional).
        When given, columns "val0", "val1", ... are renamed to these names
        in order (assumes the block processor names value columns that way
        -- confirm against _process_blocks).
    :param max_block_size: maximum block size to process at once
    :param calc_area: Calculate geodesic area
    :param workers: number of parallel workers
    :return: Pandas data frame
    """

    if col_names:
        assert len(src_rasters) == len(
            col_names
        ), "Number of named columns does not match number of input rasters. Abort."

    sources = _assert_sources(src_rasters)

    try:
        src = sources[0]
        affine = src.transform
        step_height, step_width = _get_steps(src, max_block_size)

        kwargs = {
            "col_size": affine[0],
            "row_size": affine[4],
            "step_width": step_width,
            "step_height": step_height,
            "width": src.width,
            "height": src.height,
            "calc_area": calc_area,
        }

        cols = range(0, src.width, step_width)
        rows = range(0, src.height, step_height)

        # Every (col, row) offset pair identifies one block to process.
        blocks = itertools.product(cols, rows)

        pipe = blocks | Stage(_process_blocks, sources,
                              **kwargs).setup(workers=workers)

        data_frame = pd.DataFrame()
        for df in pipe.results():
            if data_frame.empty:
                data_frame = df[0]  # unpack data frame from tuple
            else:
                data_frame = pd.concat([data_frame, df[0]])

        if col_names:
            # Rename all value columns in one call: renaming one column at a
            # time would rename a column twice whenever a requested name
            # equals a later "valN" placeholder.
            renames = {
                "val{}".format(i): col_name
                for i, col_name in enumerate(col_names)
            }
            # index=str is kept from the original call: it maps every index
            # label through str().
            data_frame = data_frame.rename(index=str, columns=renames)
    finally:
        # Release the raster handles even if processing raised.
        for source in sources:
            source.close()

    return data_frame