Example No. 1
    def _get_dss(
        self,
        dc: Datacube,
        product: str,
        msg: Callable[[str], Any],
        temporal_range: Optional[DateTimeRange] = None,
        tiles: Optional[TilesRange2d] = None,
    ):
        """
        This returns a tuple containing:
        - a generator of datasets
        - the number of datasets in the generator
        - a config dictionary containing the product, temporal range, tiles, and the datacube query used
        """

        cfg: Dict[str, Any] = dict(
            grid=self._grid,
            freq=self._frequency,
        )

        query = dict(product=product)

        if tiles is not None:
            (x0, x1), (y0, y1) = tiles
            msg(f"Limit search to tiles: x:[{x0}, {x1}) y:[{y0}, {y1})")
            cfg["tiles"] = tiles
            query["geopolygon"] = gs_bounds(self._gridspec, tiles)

        if temporal_range is not None:
            query.update(
                temporal_range.dc_query(pad=0.6)
            )  # pad a bit more than half a day on each side
            cfg["temporal_range"] = temporal_range.short

        cfg["query"] = sanitize_query(query)

        if DatasetCache.exists(self._output) and self._overwrite is False:
            raise ValueError(f"File database already exists: {self._output}")

        msg("Connecting to the database, counting datasets")
        n_dss = dataset_count(dc.index, **query)
        if n_dss == 0:
            msg("Found no datasets to process")
            return False

        msg(f"Processing {n_dss:,d} datasets")

        if "time" in query:
            dss = chopped_dss(dc, freq="w", **query)
        else:
            if len(query) == 1:
                dss = all_datasets(dc, **query)
            else:
                # note: this blocks for large result sets
                dss = dc.find_datasets_lazy(**query)

        return dss, n_dss, cfg
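
A minimal sketch of consuming the tuple this helper returns, assuming the enclosing class is the task writer constructed in Example No. 3 (called SaveTasks here for illustration) and that a configured Datacube connection is available; the grid and product names are placeholders:

    from datacube import Datacube

    # Hypothetical usage: SaveTasks, the grid and the product name are assumptions.
    dc = Datacube()
    tasks = SaveTasks(output="tasks.db", grid="africa_30", frequency="annual")

    result = tasks._get_dss(dc, product="ga_ls8c_ard_3", msg=print)
    if result is False:
        print("Nothing to do")
    else:
        dss, n_dss, cfg = result
        print(f"{n_dss:,d} datasets matched query {cfg['query']}")
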
Example No. 2
    def __init__(self,
                 cache: Union[str, DatasetCache],
                 product: Optional[OutputProduct] = None):
        if isinstance(cache, str):
            cache = DatasetCache.open_ro(cache)

        # TODO: verify these things are set in the file
        cfg = cache.get_info_dict('stats/config')
        grid = cfg['grid']
        gridspec = cache.grids[grid]

        self._product = product
        self._dscache = cache
        self._cfg = cfg
        self._grid = grid
        self._gridspec = gridspec
        self._all_tiles = sorted(idx for idx, _ in cache.tiles(grid))
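
Since `cache` may be either a path or an already-open `DatasetCache`, both call styles work; a brief sketch (the class name `TaskReader` and the file name are assumptions based on how this constructor reads a task cache):

    # Hypothetical usage; TaskReader and the file name are assumptions.
    reader = TaskReader("tasks.db")            # path is opened read-only internally

    cache = DatasetCache.open_ro("tasks.db")   # or pass an open cache directly
    reader = TaskReader(cache, product=None)
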
Example No. 3
    def __init__(self,
                 output: str,
                 grid: str,
                 frequency: str = 'annual',
                 overwrite: bool = False,
                 complevel: int = 6):

        if DatasetCache.exists(output) and overwrite is False:
            raise ValueError(f"File database already exists: {output}")

        grid, gridspec = parse_gridspec_with_name(grid)

        self._output = output
        self._overwrite = overwrite
        self._complevel = complevel
        self._grid = grid
        self._gridspec = gridspec
        self._frequency = frequency
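
A sketch of constructing this task writer (again assuming a `SaveTasks` class name and a grid name that `parse_gridspec_with_name` recognises), including the overwrite guard:

    # Hypothetical instantiation; the class name and grid are assumptions.
    tasks = SaveTasks(output="ls8_2020.db", grid="africa_30",
                      frequency="annual", complevel=6)

    # If ls8_2020.db already exists on disk, construction refuses to proceed
    # unless overwrite=True is passed.
    try:
        SaveTasks(output="ls8_2020.db", grid="africa_30")
    except ValueError as e:
        print(e)
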
Example No. 4
    def __init__(self,
                 cache: Union[str, DatasetCache],
                 product: Optional[OutputProduct] = None):
        self._cache_path = None
        if isinstance(cache, str):
            if cache.startswith("s3://"):
                self._cache_path = s3_download(cache)
                cache = self._cache_path
            cache = DatasetCache.open_ro(cache)

        # TODO: verify these things are set in the file
        cfg = cache.get_info_dict("stats/config")
        grid = cfg["grid"]
        gridspec = cache.grids[grid]

        self._product = product
        self._dscache = cache
        self._cfg = cfg
        self._grid = grid
        self._gridspec = gridspec
        self._all_tiles = sorted(idx for idx, _ in cache.tiles(grid))
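
This variant additionally accepts `s3://` URIs, downloading the file database first; a sketch of both branches (the class name, bucket and paths are illustrative, and AWS credentials must be configured for the S3 case):

    # Hypothetical usage; TaskReader and both paths are assumptions.
    local = TaskReader("tasks.db")                   # plain local file database
    remote = TaskReader("s3://my-bucket/tasks.db")   # fetched via s3_download first

    print(remote._grid, len(remote._all_tiles))      # attributes set in __init__
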
Example No. 5
    def save(
        self,
        dc: Datacube,
        product: str,
        temporal_range: Union[str, DateTimeRange, None] = None,
        tiles: Optional[TilesRange2d] = None,
        msg: Optional[Callable[[str], Any]] = None,
        debug: bool = False,
    ) -> bool:

        dt_range = SimpleNamespace(start=None, end=None)

        def _update_start_end(x, out):
            if out.start is None:
                out.start = x
                out.end = x
            else:
                out.start = min(out.start, x)
                out.end = max(out.end, x)

        def persist(ds: Dataset) -> CompressedDataset:
            _ds = compress_ds(ds)
            _update_start_end(_ds.time, dt_range)
            return _ds

        def msg_default(msg):
            pass

        if msg is None:
            msg = msg_default

        if isinstance(temporal_range, str):
            temporal_range = DateTimeRange(temporal_range)

        cfg: Dict[str, Any] = dict(
            grid=self._grid,
            freq=self._frequency,
        )

        query = dict(product=product)

        if tiles is not None:
            (x0, x1), (y0, y1) = tiles
            msg(f"Limit search to tiles: x:[{x0}, {x1}) y:[{y0}, {y1})")
            cfg["tiles"] = tiles
            query["geopolygon"] = gs_bounds(self._gridspec, tiles)

        # TODO: properly handle the UTC offset when limiting the query to a given temporal_range.
        #       Basically need to pad the query by 12 hours, then trim datasets post-query.
        if temporal_range is not None:
            query.update(temporal_range.dc_query(
                pad=0.6))  # pad a bit more than half a day on each side
            cfg["temporal_range"] = temporal_range.short

        cfg["query"] = sanitize_query(query)

        if DatasetCache.exists(self._output) and self._overwrite is False:
            raise ValueError(f"File database already exists: {self._output}")

        msg("Connecting to the database, counting datasets")
        n_dss = dataset_count(dc.index, **query)
        if n_dss == 0:
            msg("Found no datasets to process")
            return False

        msg(f"Processing {n_dss:,d} datasets")

        msg("Training compression dictionary")
        zdict = dictionary_from_product_list(dc, [product],
                                             samples_per_product=100)
        msg(".. done")

        cache = DatasetCache.create(
            self._output,
            zdict=zdict,
            complevel=self._complevel,
            truncate=self._overwrite,
        )
        cache.add_grid(self._gridspec, self._grid)
        cache.append_info_dict("stats/", dict(config=cfg))

        cells: Dict[Tuple[int, int], Any] = {}
        if "time" in query:
            dss = chopped_dss(dc, freq="w", **query)
        else:
            if len(query) == 1:
                dss = all_datasets(dc, **query)
            else:
                # note: this blocks for large result sets
                dss = dc.find_datasets_lazy(**query)

        dss = cache.tee(dss)
        dss = bin_dataset_stream(self._gridspec, dss, cells, persist=persist)
        dss = tqdm(dss, total=n_dss)

        rr = ds_stream_test_func(dss)
        msg(rr.text)

        if tiles is not None:
            # prune out tiles that were not requested
            cells = {
                tidx: dss
                for tidx, dss in cells.items() if is_tile_in(tidx, tiles)
            }

        n_tiles = len(cells)
        msg(f"Total of {n_tiles:,d} spatial tiles")

        if self._frequency == "all":
            tasks = bin_full_history(cells,
                                     start=dt_range.start,
                                     end=dt_range.end)
        elif self._frequency == "seasonal":
            tasks = bin_seasonal(cells, months=3, anchor=12)
        elif temporal_range is not None:
            tasks = bin_generic(cells, [temporal_range])
        else:
            tasks = bin_annual(cells)

        tasks_uuid = {k: [ds.id for ds in dss] for k, dss in tasks.items()}

        msg(f"Saving tasks to disk ({len(tasks)})")
        cache.add_grid_tiles(self._grid, tasks_uuid)
        msg(".. done")

        csv_path = self.out_path(".csv")
        msg(f"Writing summary to {csv_path}")
        with open(csv_path, "wt") as f:
            f.write('"T","X","Y","datasets","days"\n')

            for p, x, y in sorted(tasks):
                dss = tasks[(p, x, y)]
                n_dss = len(dss)
                n_days = len(set(ds.time.date() for ds in dss))
                line = f'"{p}", {x:+05d}, {y:+05d}, {n_dss:4d}, {n_days:4d}\n'
                f.write(line)

        msg("Dumping GeoJSON(s)")
        grid_info = compute_grid_info(
            cells, resolution=max(self._gridspec.tile_size) / 4)
        tasks_geo = gjson_from_tasks(tasks, grid_info)
        for temporal_range, gjson in tasks_geo.items():
            fname = self.out_path(f"-{temporal_range}.geojson")
            msg(f"..writing to {fname}")
            with open(fname, "wt") as f:
                json.dump(gjson, f)

        if debug:
            pkl_path = self.out_path("-cells.pkl")
            msg(f"Saving debug info to: {pkl_path}")
            with open(pkl_path, "wb") as fb:
                pickle.dump(cells, fb)

            pkl_path = self.out_path("-tasks.pkl")
            msg(f"Saving debug info to: {pkl_path}")
            with open(pkl_path, "wb") as fb:
                pickle.dump(tasks, fb)

        return True
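
A sketch of driving this `save` method end to end, assuming the enclosing class is the task writer from Example No. 3; the product name, grid, tile range and temporal-range string are placeholders:

    from datacube import Datacube

    # Hypothetical driver; class, product, grid and ranges are assumptions.
    dc = Datacube()
    tasks = SaveTasks(output="ls8_2020.db", grid="africa_30",
                      frequency="annual", overwrite=True)

    ok = tasks.save(
        dc,
        product="ga_ls8c_ard_3",
        temporal_range="2020--P1Y",     # string is parsed into a DateTimeRange
        tiles=((10, 20), (30, 40)),     # x:[10, 20) y:[30, 40)
        msg=print,
        debug=False,
    )
    print("tasks saved" if ok else "no datasets found")
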
Example No. 6
    def save(
        self,
        dc: Datacube,
        product: str,
        temporal_range: Union[str, DateTimeRange, None] = None,
        tiles: Optional[TilesRange2d] = None,
        predicate: Optional[Callable[[Dataset], bool]] = None,
        msg: Optional[Callable[[str], Any]] = None,
        debug: bool = False,
        dss=None,
        n_dss=None,
    ) -> bool:
        """
        :param product: Product name to consume
        :param temporal_range: Optionally  limit query in time
        :param tiles: Optionally limit query to a range of tiles
        :param predicate: If supplied filter Datasets as they come in with custom filter, Dataset->Bool
        :param msg: Observe messages if needed via callback
        :param debug: Dump some intermediate state to files for debugging
        :param dss: A generator of datasets to use
        :param n_dss: The number of datasets in the generator
        """

        dt_range = SimpleNamespace(start=None, end=None)

        def _update_start_end(x, out):
            if out.start is None:
                out.start = x
                out.end = x
            else:
                out.start = min(out.start, x)
                out.end = max(out.end, x)

        def persist(ds: Dataset) -> CompressedDataset:
            _ds = compress_ds(ds)
            _update_start_end(_ds.time, dt_range)
            return _ds

        def msg_default(msg):
            pass

        if msg is None:
            msg = msg_default

        if isinstance(temporal_range, str):
            temporal_range = DateTimeRange(temporal_range)

        if dss is None:
            dss, n_dss, cfg = self._get_dss(dc, product, msg, temporal_range, tiles)
        else:
            cfg: Dict[str, Any] = dict(
                grid=self._grid,
                freq=self._frequency,
            )

            if temporal_range is not None:
                cfg["temporal_range"] = temporal_range.short

            if tiles is not None:
                cfg["tiles"] = tiles

        if DatasetCache.exists(self._output) and self._overwrite is False:
            raise ValueError(f"File database already exists: {self._output}")

        msg(f"Processing {n_dss:,d} datasets")

        msg("Training compression dictionary")
        dss_slice = list(islice(dss, 0, 100))
        samples = dss_slice.copy()
        random.shuffle(samples)
        zdict = DatasetCache.train_dictionary(samples, 8 * 1024)
        dss = chain(dss_slice, dss)
        msg(".. done")

        cache = DatasetCache.create(
            self._output,
            zdict=zdict,
            complevel=self._complevel,
            truncate=self._overwrite,
        )
        cache.add_grid(self._gridspec, self._grid)
        cache.append_info_dict("stats/", dict(config=cfg))

        cells: Dict[Tuple[int, int], Any] = {}

        if predicate is not None:
            dss = filter(predicate, dss)
        dss = cache.tee(dss)
        dss = bin_dataset_stream(self._gridspec, dss, cells, persist=persist)
        dss = tqdm(dss, total=n_dss)

        rr = ds_stream_test_func(dss)
        msg(rr.text)

        if tiles is not None:
            # prune out tiles that were not requested
            cells = {
                tidx: cell for tidx, cell in cells.items() if is_tile_in(tidx, tiles)
            }

        if temporal_range is not None:
            # Prune Datasets outside of temporal range (after correcting for UTC offset)
            for cell in cells.values():
                utc_offset = cell.utc_offset
                cell.dss = [
                    ds for ds in cell.dss if (ds.time + utc_offset) in temporal_range
                ]

        n_tiles = len(cells)
        msg(f"Total of {n_tiles:,d} spatial tiles")

        if self._frequency == "all":
            tasks = bin_full_history(cells, start=dt_range.start, end=dt_range.end)
        elif self._frequency == "semiannual":
            tasks = bin_seasonal(cells, months=6, anchor=1)
        elif self._frequency == "seasonal":
            tasks = bin_seasonal(cells, months=3, anchor=12)
        elif self._frequency == "annual-fy":
            tasks = bin_seasonal(cells, months=12, anchor=7)
        elif self._frequency == "annual":
            tasks = bin_annual(cells)
        elif temporal_range is not None:
            tasks = bin_generic(cells, [temporal_range])
        else:
            tasks = bin_annual(cells)

        tasks_uuid = {k: [ds.id for ds in dss] for k, dss in tasks.items()}

        msg(f"Saving tasks to disk ({len(tasks)})")
        cache.add_grid_tiles(self._grid, tasks_uuid)
        msg(".. done")

        self._write_info(tasks, msg, cells, debug)

        return True
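
Because this variant also takes a `predicate`, datasets can be filtered as they stream in; a sketch that keeps only mostly-clear scenes (the `eo:cloud_cover` property and every name outside the snippet are assumptions):

    from datacube import Datacube

    # Hypothetical predicate; the metadata property path is an assumption.
    def mostly_clear(ds) -> bool:
        cc = ds.metadata_doc.get("properties", {}).get("eo:cloud_cover")
        return cc is not None and cc < 30

    dc = Datacube()
    tasks = SaveTasks(output="clear_2020.db", grid="africa_30",
                      frequency="seasonal", overwrite=True)
    tasks.save(
        dc,
        product="ga_ls8c_ard_3",
        temporal_range="2020--P1Y",
        predicate=mostly_clear,         # drops cloudy scenes before binning
        msg=print,
    )
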