示例#1
0
def preprocessed_tile_pipe(tile_ids, **kwargs):
    """
    Pipeline to download/process GLAD alerts of previous years.

    :param tile_ids: List of Tile IDs to process
    :param kwargs: Dictionary with keyword arguments (must contain "workers")
    :return: None
    """
    workers = kwargs["workers"]
    pipe = (
        tile_ids
        | Stage(download_preprocessed_tiles_years,
                name="day_conf", **kwargs).setup(workers=workers)
        | Stage(download_preprocessed_tiles_year,
                name="download", **kwargs).setup(workers=workers)
        | Stage(change_pixel_depth,
                name="pixel_depth", **kwargs).setup(workers=workers)
        | Stage(encode_date_conf,
                name="encode_day_conf", **kwargs).setup(workers=workers)
        # !Important: collector stages must run with a single worker
        | Stage(collect_day_conf_pairs).setup(workers=1)
        | Stage(combine_date_conf_pairs,
                name="day_conf", **kwargs).setup(workers=workers)
        | Stage(collect_day_conf, **kwargs).setup(workers=1)  # Important
        | Stage(merge_years, name="day_conf", **kwargs).setup(workers=workers)
        | Stage(upload_preprocessed_tiles_s3, **kwargs).setup(workers=workers))

    for output in pipe.results():
        # Lazy %-args defer formatting until the record is actually emitted
        logging.debug("Preprocess output: %s", output)
    logging.info("Preprocess - Done")

    return
示例#2
0
def date_conf_merge_pipe(tile_ids, **kwargs):
    """
    Pipeline to process latest GLAD alerts.

    :param tile_ids: List of Tile IDs to process
    :param kwargs: Dictionary with keyword arguments (must contain "workers")
    :return: List of processed date_conf tiles
    """
    workers = kwargs["workers"]

    pipe = (
        tile_ids
        | Stage(download_latest_tiles,
                name="download", **kwargs).setup(workers=workers)
        | Stage(change_pixel_depth,
                name="pixel_depth", **kwargs).setup(workers=workers)
        # NOTE(review): stage name "pixel_depth" duplicates the previous
        # stage's name — confirm this is intended and not a copy-paste slip
        | Stage(upload_raw_tile_s3,
                name="pixel_depth", **kwargs).setup(workers=workers)
        | Stage(encode_date_conf,
                name="encode_day_conf", **kwargs).setup(workers=workers)
        # Important! Collector stages must run with a single worker
        | Stage(collect_day_conf_pairs).setup(workers=1)
        | Stage(combine_date_conf_pairs,
                name="day_conf", **kwargs).setup(workers=workers)
        | Stage(collect_day_conf_all_years, **kwargs).setup(
            workers=1)  # Important!
        | Stage(merge_years, name="day_conf", **kwargs).setup(workers=workers)
        | Stage(upload_day_conf_s3_archive, **kwargs).setup(workers=workers))

    date_conf_tiles = list()
    for output in pipe.results():
        date_conf_tiles.append(output)
        # Lazy %-args defer formatting until the record is actually emitted
        logging.debug("Date Conf  output: %s", output)
    logging.info("Date Conf - Done")

    return date_conf_tiles
    def test_three_reduce(self):
        """Producer/Mapper/Reducer configuration."""
        producer = Stage(t1, range(1000)).setup(workers=4, qsize=10)
        mapper = Stage(t2, 5).setup(workers=4, qsize=1000)
        reducer = Stage(t3, sum).setup(workers=2, qsize=3)
        pipe = producer | mapper | reducer
        res = list(pipe.results())

        expected = sum(range(5, 1005)) * 4

        # assertEquals is a deprecated alias (removed in Python 3.12)
        self.assertEqual(len(res), 2)
        self.assertEqual(sum(res), expected)
    def test_two_class_instance(self):
        """Producer/Consumer configuration. One of the tasks is actually a
        method of a class instance."""
        job = T2(5).produce
        producer = Stage(t1, range(1000)).setup(workers=4, qsize=10)
        consumer = Stage(job).setup(workers=4, qsize=1000)
        pipe = producer | consumer
        res = list(pipe.results())

        # assertEquals is a deprecated alias (removed in Python 3.12)
        self.assertEqual(max(res), 1004)
        self.assertEqual(min(res), 5)
        self.assertEqual(len(res),
                         1000 * 4)  # we are running 4 parallel producers
    def test_two_reduce(self):
        """Producer/Reducer configuration."""
        producer = Stage(t1, range(1000)).setup(workers=4, qsize=10)
        reducer = Stage(t3, sum).setup(workers=1, qsize=3)
        pipe = producer | reducer
        res = list(pipe.results())

        expected = sum(range(1000)) * 4

        # assertEquals is a deprecated alias (removed in Python 3.12)
        self.assertEqual(len(res), 1)
        self.assertEqual(res[0], expected)

        # execute() should return the single reduced value directly
        res = pipe.execute()
        self.assertEqual(res, expected)
示例#6
0
def download_climate_data(tile_ids, **kwargs):
    """
    Pipeline to download emissions and climate mask data for the given tiles.

    :param tile_ids: List of Tile IDs to process
    :param kwargs: Dictionary with keyword arguments (must contain "workers")
    :return: None
    """
    workers = kwargs["workers"]

    pipe = (tile_ids
            | Stage(download_emissions,
                    name="emissions",
                    return_input=True,
                    **kwargs).setup(workers=workers)
            | Stage(download_climate_mask,
                    name="climate_mask",
                    return_input=True,
                    **kwargs).setup(workers=workers))

    for output in pipe.results():
        # Lazy %-args defer formatting until the record is actually emitted
        logging.debug("Download climate data output: %s", output)
    logging.info("Download climate data - Done")
    def test_exception_propagation(self):
        """The mapper will fail this time."""
        producer = Stage(t1, range(1000)).setup(workers=2, qsize=10)
        mapper = Stage(t2, 5, 200).setup(workers=6, qsize=1000)
        reducer = Stage(t3, sum).setup(workers=2, qsize=3)
        pipe = producer | mapper | reducer

        # assertRaisesRegexp is a deprecated alias (removed in Python 3.12)
        with self.assertRaisesRegex(TaskException, "failed at 200"):
            for res in pipe.results():
                pass

        # The producer fails first this time
        producer = Stage(t1, range(1000), 10).setup(workers=2, qsize=10)
        pipe = producer | mapper | reducer

        with self.assertRaisesRegex(TaskException, "failed at 10"):
            for res in pipe.results():
                pass
示例#8
0
def rgb_pipe(**kwargs):
    """
    Pipeline to encode resampled tiles as RGB, reproject and upload them.

    :param kwargs: Dictionary with keyword arguments
        (must contain "root" and "workers")
    :return: None
    """
    root = kwargs["root"]
    workers = kwargs["workers"]

    # list(...) replaces the manual append loop
    tile_pairs = list(collect_resampled_tiles(root))

    pipe = (tile_pairs
            | Stage(encode_rgb).setup(workers=workers)
            | Stage(project).setup(workers=workers)
            | Stage(upload_rgb_wm_s3, **kwargs).setup(workers=workers))
    for output in pipe.results():
        # Lazy %-args defer formatting until the record is actually emitted
        logging.debug("RGB output: %s", output)
    logging.info("RGB - Done")

    return
    def test_two(self):
        """Producer/Consumer configuration."""
        producer = Stage(t1, range(1000)).setup(workers=4, qsize=10)
        consumer = Stage(t2, 5).setup(workers=4, qsize=1000)
        pipe = producer | consumer
        res = list(pipe.results())

        # assertEquals is a deprecated alias (removed in Python 3.12)
        self.assertEqual(max(res), 1004)
        self.assertEqual(min(res), 5)
        self.assertEqual(len(res),
                         1000 * 4)  # we are running 4 parallel producers

        # A plain iterable can feed a stage directly
        pipe = range(1000) | consumer
        res = list(pipe.results())

        self.assertEqual(max(res), 1004)
        self.assertEqual(min(res), 5)
        self.assertEqual(len(res),
                         1000)  # single plain-iterable source, not 4 producers
示例#10
0
def resample_date_conf_pipe(tiles, **kwargs):
    """
    Pipeline to resample day_conf tiles for all zoom levels.

    Resamples with "near" at max_zoom, then with "mode" for every
    lower zoom level down to 0.

    :param tiles: Iterable of tiles to process
    :param kwargs: Dictionary with keyword arguments
        (must contain "workers" and "max_zoom")
    :return: None
    """
    workers = kwargs["workers"]
    max_zoom = kwargs["max_zoom"]
    pipe = tiles | Stage(resample,
                         name="day_conf",
                         resample_method="near",
                         zoom=max_zoom,
                         **kwargs).setup(workers=workers)
    for i in range(max_zoom - 1, -1, -1):
        pipe = pipe | Stage(resample,
                            name="day_conf",
                            resample_method="mode",
                            zoom=i,
                            **kwargs).setup(workers=workers)

    for output in pipe.results():
        # Lazy %-args defer formatting until the record is actually emitted
        logging.debug("Resample Day Conf output: %s", output)
    logging.info("Resample Day Conf - Done")

    return
示例#11
0
def tilecache_pipe(**kwargs):
    """
    Pipeline to generate and upload the tile cache.

    :param kwargs: Dictionary with keyword arguments (must contain "root",
        "workers", "min_zoom" and "max_tilecache_zoom")
    :return: None
    """
    root = kwargs["root"]
    workers = kwargs["workers"]

    # list(...) replaces the manual append loop
    zoom_tiles = list(collect_rgb_tiles(root))

    tile_ids = collect_rgb_tile_ids(zoom_tiles)

    pipe = (
        zoom_tiles
        | Stage(generate_vrt, kwargs["min_zoom"], kwargs["max_tilecache_zoom"],
                **kwargs).setup(workers=workers)
        | Stage(generate_tilecache_mapfile, **kwargs).setup(workers=workers)
        | Stage(generate_tilecache_config, **kwargs).setup(workers=workers)
        | Stage(generate_tile_list,
                tile_ids=tile_ids, **kwargs).setup(workers=workers)
        | Stage(save_tile_lists, **kwargs).setup(workers=workers)
        | Stage(generate_tiles, **kwargs).setup(workers=workers))

    for output in pipe.results():
        # Lazy %-args defer formatting until the record is actually emitted
        logging.debug("Tilecache output: %s", output)

    upload_tilecache_s3(**kwargs)

    logging.info("Tilecache - Done")

    return
def test_download_preprocessed_tiles_years(mock_sp, mock_path):
    """When the (mocked) subprocess call signals failure, the stage yields
    the expected output path; when it succeeds, the stage yields nothing."""

    preprocessed_years = [2015, 2016, 2017]
    tile_ids = ["050W_00N_040W_10N"]
    root = "/home/thomas/projects/gfw-sync/glad_tiles_pipeline/tests/data"
    name = "download"
    year_str = "_".join(str(year) for year in preprocessed_years)

    mock_path.mkdir.return_value = "home/user/data"

    expected = output_file(
        root, "tiles", tile_ids[0], "date_conf", year_str, "day_conf.tif"
    )

    # Simulate a failing subprocess call
    mock_sp.check_call.return_value = sp.CalledProcessError

    pipe = tile_ids | Stage(
        download_preprocessed_tiles_years,
        preprocessed_years=preprocessed_years,
        root=root,
        name=name,
    )

    for result in pipe.results():
        assert result == expected

    # TODO: Is there a better way to check if nothing was returned?
    # Simulate a successful subprocess call
    mock_sp.check_call.return_value = True

    pipe = tile_ids | Stage(
        download_preprocessed_tiles_years,
        preprocessed_years=preprocessed_years,
        root=root,
        name=name,
    )

    produced_any = False
    for _ in pipe.results():
        produced_any = True
    assert produced_any is False
示例#13
0
def intensity_pipe(tiles, **kwargs):
    """
    Pipeline to compute intensity tiles and resample them for all zoom levels.

    Resamples with "near" at max_zoom, then with "bilinear" for every
    lower zoom level down to 0.

    :param tiles: Iterable of tiles to process
    :param kwargs: Dictionary with keyword arguments
        (must contain "workers" and "max_zoom")
    :return: None
    """
    workers = kwargs["workers"]
    max_zoom = kwargs["max_zoom"]
    pipe = (tiles
            | Stage(unset_no_data_value).setup(workers=workers)
            | Stage(prep_intensity,
                    name="day_conf", **kwargs).setup(workers=workers)
            | Stage(resample,
                    name="intensity",
                    resample_method="near",
                    zoom=max_zoom,
                    **kwargs).setup(workers=workers))
    for i in range(max_zoom - 1, -1, -1):
        pipe = pipe | Stage(resample,
                            name="intensity",
                            resample_method="bilinear",
                            zoom=i,
                            **kwargs).setup(workers=workers)

    for output in pipe.results():
        # Lazy %-args defer formatting until the record is actually emitted
        logging.debug("Intensity output: %s", output)
    logging.info("Intensity - Done")

    return
示例#14
0
def upload_tiles(
    tile_cache: str,
    dataset: str,
    version: str,
    bucket: str = "gfw-tiles",
    implementation: str = "default",
    cores: int = CORES,
) -> None:
    """Upload a local tile cache to S3.

    Args:
        tile_cache: Path to the local tile cache.
        dataset: Target dataset name.
        version: Target dataset version.
        bucket: Target S3 bucket.
        implementation: Tile cache implementation name.
        cores: Number of parallel upload workers.
    """

    # Lazy %-args defer formatting until the record is actually emitted
    LOGGER.info(
        "Upload tile cache to %s/%s/%s using %s processes",
        dataset,
        version,
        implementation,
        cores,
    )

    # pipe files
    pipe = get_tiles(tile_cache) | Stage(
        copy_tiles, tile_cache, bucket, dataset, version, implementation
    ).setup(workers=cores)

    # collect results
    for output in pipe.results():
        LOGGER.debug(output)
示例#15
0
    def create_tiles(
        self, overwrite: bool
    ) -> Tuple[List[Tile], List[Tile], List[Tile], List[Tile]]:
        """Raster Pipe.

        Collects tiles, transforms and uploads them, then cleans up
        the work directory.

        :param overwrite: Passed through to collect_tiles; presumably forces
            reprocessing of existing tiles — confirm against collect_tiles.
        :return: Tuple of (tiles, skipped_tiles, failed_tiles, existing_tiles)
            as produced by _process_pipe.
        """

        LOGGER.info("Start Raster Pipe")

        tiles = self.collect_tiles(overwrite=overwrite)

        # One worker per tile to process, but always at least one
        GLOBALS.workers = max(self.tiles_to_process, 1)

        pipe = (
            tiles
            | Stage(self.transform).setup(workers=GLOBALS.workers)
            | self.upload_file
            | self.delete_work_dir
        )

        tiles, skipped_tiles, failed_tiles, existing_tiles = self._process_pipe(pipe)

        LOGGER.info("Finished Raster Pipe")
        return tiles, skipped_tiles, failed_tiles, existing_tiles
示例#16
0
    def test_one(self):
        """Only producer configuration."""
        producer = Stage(t1, range(1000)).setup(workers=4, qsize=10)
        res = list(producer.results())

        # assertEquals is a deprecated alias (removed in Python 3.12)
        self.assertEqual(max(res), 999)
        self.assertEqual(min(res), 0)
        self.assertEqual(len(res),
                         1000 * 4)  # we are running 4 parallel producers

        # let's reuse a pipe again
        res = list(producer.results())

        self.assertEqual(max(res), 999)
        self.assertEqual(min(res), 0)
        self.assertEqual(len(res),
                         1000 * 4)  # we are running 4 parallel producers

        # task with one (reduced) result
        producer = Stage(t3, range(1000), sum).setup(workers=4, qsize=10)
        res = producer.execute()
        self.assertEqual(res, sum(range(1000)))
示例#17
0
def csv_export_pipe(**kwargs):
    """
    Pipeline to export alerts as CSV files: one full-attribute CSV per tile,
    plus per-zoom-level XYZ aggregates for the database.

    :param kwargs: Dictionary with keyword arguments (must contain "root",
        "years", "workers" and "max_zoom")
    :return: None
    """
    root = kwargs["root"]
    years = [str(year) for year in kwargs["years"]]
    workers = kwargs["workers"]
    max_zoom = kwargs["max_zoom"]

    day_conf_tiles = get_preprocessed_tiles(root, include_years=years)

    # DataFrame column names and the header actually written to the CSV
    columns_csv = [
        "lon",
        "lat",
        "confidence",
        "year",
        "julian_day",
        "area",
        "val1",
        "val2",
    ]

    header_csv = [
        "long",
        "lat",
        "confidence",
        "year",
        "julian_day",
        "area",
        "emissions",
        "climate_mask",
    ]

    columns_xyz = ["x", "y", "z", "alert_count", "alert_date", "confidence"]
    header_xyz = ["x", "y", "z", "alert_count", "alert_date", "confidence"]

    pipe = (day_conf_tiles
            | Stage(match_emissions,
                    name="emissions", **kwargs).setup(workers=workers)
            | Stage(match_climate_mask,
                    name="climate_mask", **kwargs).setup(workers=workers)
            | Stage(get_dataframe).setup(workers=workers)
            | Stage(decode_day_conf).setup(workers=workers)
            | Stage(save_csv,
                    name="output",
                    columns=columns_csv,
                    header=header_csv,
                    return_input=True,
                    **kwargs).setup(workers=workers)
            | Stage(upload_csv_s3,
                    name="output", **kwargs).setup(workers=workers)
            | Stage(convert_julian_date).setup(workers=workers)
            | Stage(convert_latlon_xyz, **kwargs).setup(workers=workers)
            | Stage(group_by_xyz).setup(workers=workers)
            | Stage(save_csv,
                    name="db/{}".format(max_zoom),
                    columns=columns_xyz,
                    header=header_xyz,
                    return_input=True,
                    **kwargs).setup(workers=workers))

    # Aggregate up the tile pyramid: one save_csv stage per zoom level
    for i in range(max_zoom - 1, -1, -1):
        pipe = (pipe
                | Stage(convert_to_parent_xyz).setup(workers=workers)
                | Stage(group_by_xyz).setup(workers=workers)
                | Stage(save_csv,
                        name="db/{}".format(i),
                        columns=columns_xyz,
                        header=header_xyz,
                        return_input=True,
                        **kwargs).setup(workers=workers))

    for output in pipe.results():
        # Lazy %-args defer formatting until the record is actually emitted
        logging.debug("Export CSV output: %s", output)
    logging.info("Export CSV - Done")
示例#18
0
def raster2df(*src_rasters,
              col_names=None,
              max_block_size=4096,
              calc_area=False,
              workers=1):
    """
    Converts raster into Panda DataFrame.

    Input rasters must match cell size and extent.
    The first raster determines number of output rows.
    Only cells which are above given Threshold/not NoData are processed.
    The tool calculates lat lon for every grid cell and extracts the cell
    value. If more than one input raster is provided, the tool adds
    additional columns with corresponding values.

    :param src_rasters: Input rasters (one or many)
    :param col_names: Column names for input raster values
        (optional, default: val0, val1, ...)
    :param max_block_size: maximum block size to process at once
    :param calc_area: Calculate geodesic area
    :param workers: number of parallel workers
    :return: Pandas data frame
    """

    if col_names:
        assert len(src_rasters) == len(
            col_names
        ), "Number of named columns does not match number of input rasters. Abort."

    sources = _assert_sources(src_rasters)

    try:
        src = sources[0]
        affine = src.transform
        step_height, step_width = _get_steps(src, max_block_size)

        kwargs = {
            "col_size": affine[0],
            "row_size": affine[4],
            "step_width": step_width,
            "step_height": step_height,
            "width": src.width,
            "height": src.height,
            "calc_area": calc_area,
        }

        # Blocks are (col_offset, row_offset) pairs covering the raster
        cols = range(0, src.width, step_width)
        rows = range(0, src.height, step_height)
        blocks = itertools.product(cols, rows)

        pipe = blocks | Stage(_process_blocks, sources,
                              **kwargs).setup(workers=workers)

        # Collect all block frames and concatenate once: repeated pd.concat
        # inside the loop copies the accumulated frame on every iteration
        # (quadratic in the number of blocks).
        frames = [df[0] for df in pipe.results()]  # unpack frame from tuple
        data_frame = pd.concat(frames) if frames else pd.DataFrame()

        if col_names:
            renames = {
                "val{}".format(i): col_name
                for i, col_name in enumerate(col_names)
            }
            data_frame = data_frame.rename(index=str, columns=renames)
    finally:
        # Always release raster handles, even if processing raised
        for src in sources:
            src.close()

    return data_frame