Example #1
    def _get_dss(
        self,
        dc: Datacube,
        product: str,
        msg: Callable[[str], Any],
        temporal_range: Optional[DateTimeRange] = None,
        tiles: Optional[TilesRange2d] = None,
    ):
        """
        This returns a tuple containing:
        - a generator of datasets
        - the number of datasets in the generator
        - a config dictionary containing the product, temporal range, tiles, and the datacube query used
        """

        cfg: Dict[str, Any] = dict(
            grid=self._grid,
            freq=self._frequency,
        )

        query = dict(product=product)

        if tiles is not None:
            (x0, x1), (y0, y1) = tiles
            msg(f"Limit search to tiles: x:[{x0}, {x1}) y:[{y0}, {y1})")
            cfg["tiles"] = tiles
            query["geopolygon"] = gs_bounds(self._gridspec, tiles)

        if temporal_range is not None:
            query.update(
                temporal_range.dc_query(pad=0.6)
            )  # pad a bit more than half a day on each side
            cfg["temporal_range"] = temporal_range.short

        cfg["query"] = sanitize_query(query)

        if DatasetCache.exists(self._output) and self._overwrite is False:
            raise ValueError(f"File database already exists: {self._output}")

        msg("Connecting to the database, counting datasets")
        n_dss = dataset_count(dc.index, **query)
        if n_dss == 0:
            msg("Found no datasets to process")
            return False

        msg(f"Processing {n_dss:,d} datasets")

        if "time" in query:
            dss = chopped_dss(dc, freq="w", **query)
        elif len(query) == 1:
            dss = all_datasets(dc, **query)
        else:
            # note: this blocks for large result sets
            dss = dc.find_datasets_lazy(**query)

        return dss, n_dss, cfg
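
For context, a minimal sketch of how the returned value might be consumed; saver, "my_product" and handle_dataset are illustrative placeholders, not names from the snippet above:

# `saver` is an already-constructed instance of the enclosing class,
# `dc` an open Datacube connection.
result = saver._get_dss(dc, "my_product", print)
if result is False:
    print("nothing to process")
else:
    dss, n_dss, cfg = result           # lazy generator, count, config dict
    print(f"expecting {n_dss:,d} datasets, query: {cfg['query']}")
    for ds in dss:                     # datasets are produced lazily
        handle_dataset(ds)             # placeholder for downstream work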
Example #2
def collect_uris(prod_index, products, expressions):
    """
    Collect all URIs of datasets from products
    matching search expressions.
    """
    dc = Datacube(index=prod_index)
    for prod in products:
        for dataset in dc.find_datasets_lazy(product=prod, **expressions):
            yield normalize_uri(dataset.local_uri)
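
A usage sketch, assuming an existing datacube installation; the app name, product name, time range and output path are placeholders:

from datacube import Datacube

dc = Datacube(app="collect-uris-example")       # placeholder app name
uris = collect_uris(dc.index, ["my_product"],   # placeholder product
                    {"time": ("2020-01-01", "2020-12-31")})
with open("uris.txt", "wt") as f:               # placeholder output path
    for uri in uris:
        f.write(uri + "\n")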
Example #3
def chopped_dss(dc: Datacube, freq: str = "m", **query):
    """Emulate streaming interface for datacube queries.

    Basic idea is to perform a lot of smaller queries (shorter time
    periods)
    """
    qq = Query(**query)

    for q in chop_query_by_time(qq, freq=freq):
        dss = dc.find_datasets_lazy(**q.search_terms)
        yield from dss
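
Because each sub-query is issued only as the generator advances, datasets start flowing before the whole search completes. A minimal sketch; the product name and time range are placeholders:

from datacube import Datacube

dc = Datacube(app="chopped-dss-example")        # placeholder app name
count = 0
# one sub-query per month ("m"); pass freq="w" to chop by week instead
for ds in chopped_dss(dc, freq="m",
                      product="my_product",     # placeholder product
                      time=("2020-01-01", "2020-06-30")):
    count += 1
print(f"streamed {count} datasets")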
Example #4
    def save(
        self,
        dc: Datacube,
        product: str,
        temporal_range: Union[str, DateTimeRange, None] = None,
        tiles: Optional[TilesRange2d] = None,
        msg: Optional[Callable[[str], Any]] = None,
        debug: bool = False,
    ) -> bool:

        dt_range = SimpleNamespace(start=None, end=None)

        def _update_start_end(x, out):
            if out.start is None:
                out.start = x
                out.end = x
            else:
                out.start = min(out.start, x)
                out.end = max(out.end, x)

        def persist(ds: Dataset) -> CompressedDataset:
            _ds = compress_ds(ds)
            _update_start_end(_ds.time, dt_range)
            return _ds

        def msg_default(_msg: str) -> None:
            # no-op logger used when no callback is supplied
            pass

        if msg is None:
            msg = msg_default

        if isinstance(temporal_range, str):
            temporal_range = DateTimeRange(temporal_range)

        cfg: Dict[str, Any] = dict(
            grid=self._grid,
            freq=self._frequency,
        )

        query = dict(product=product)

        if tiles is not None:
            (x0, x1), (y0, y1) = tiles
            msg(f"Limit search to tiles: x:[{x0}, {x1}) y:[{y0}, {y1})")
            cfg["tiles"] = tiles
            query["geopolygon"] = gs_bounds(self._gridspec, tiles)

        # TODO: properly handle UTC offset when limiting the query to a given temporal_range.
        #       Basically need to pad the query by 12 hours, then trim datasets post-query.
        if temporal_range is not None:
            query.update(
                temporal_range.dc_query(pad=0.6)
            )  # pad a bit more than half a day on each side
            cfg["temporal_range"] = temporal_range.short

        cfg["query"] = sanitize_query(query)

        if DatasetCache.exists(self._output) and self._overwrite is False:
            raise ValueError(f"File database already exists: {self._output}")

        msg("Connecting to the database, counting datasets")
        n_dss = dataset_count(dc.index, **query)
        if n_dss == 0:
            msg("Found no datasets to process")
            return False

        msg(f"Processing {n_dss:,d} datasets")

        msg("Training compression dictionary")
        zdict = dictionary_from_product_list(dc, [product],
                                             samples_per_product=100)
        msg(".. done")

        cache = DatasetCache.create(
            self._output,
            zdict=zdict,
            complevel=self._complevel,
            truncate=self._overwrite,
        )
        cache.add_grid(self._gridspec, self._grid)
        cache.append_info_dict("stats/", dict(config=cfg))

        cells: Dict[Tuple[int, int], Any] = {}
        if "time" in query:
            dss = chopped_dss(dc, freq="w", **query)
        else:
            if len(query) == 1:
                dss = all_datasets(dc, **query)
            else:
                # note: this blocks for large result sets
                dss = dc.find_datasets_lazy(**query)

        dss = cache.tee(dss)
        dss = bin_dataset_stream(self._gridspec, dss, cells, persist=persist)
        dss = tqdm(dss, total=n_dss)

        rr = ds_stream_test_func(dss)
        msg(rr.text)

        if tiles is not None:
            # prune out tiles that were not requested
            cells = {
                tidx: dss
                for tidx, dss in cells.items() if is_tile_in(tidx, tiles)
            }

        n_tiles = len(cells)
        msg(f"Total of {n_tiles:,d} spatial tiles")

        if self._frequency == "all":
            tasks = bin_full_history(cells,
                                     start=dt_range.start,
                                     end=dt_range.end)
        elif self._frequency == "seasonal":
            tasks = bin_seasonal(cells, months=3, anchor=12)
        elif temporal_range is not None:
            tasks = bin_generic(cells, [temporal_range])
        else:
            tasks = bin_annual(cells)

        tasks_uuid = {k: [ds.id for ds in dss] for k, dss in tasks.items()}

        msg(f"Saving tasks to disk ({len(tasks)})")
        cache.add_grid_tiles(self._grid, tasks_uuid)
        msg(".. done")

        csv_path = self.out_path(".csv")
        msg(f"Writing summary to {csv_path}")
        with open(csv_path, "wt") as f:
            f.write('"T","X","Y","datasets","days"\n')

            for p, x, y in sorted(tasks):
                dss = tasks[(p, x, y)]
                n_dss = len(dss)
                n_days = len(set(ds.time.date() for ds in dss))
                line = f'"{p}", {x:+05d}, {y:+05d}, {n_dss:4d}, {n_days:4d}\n'
                f.write(line)

        msg("Dumping GeoJSON(s)")
        grid_info = compute_grid_info(
            cells, resolution=max(self._gridspec.tile_size) / 4)
        tasks_geo = gjson_from_tasks(tasks, grid_info)
        for temporal_range, gjson in tasks_geo.items():
            fname = self.out_path(f"-{temporal_range}.geojson")
            msg(f"..writing to {fname}")
            with open(fname, "wt") as f:
                json.dump(gjson, f)

        if debug:
            pkl_path = self.out_path("-cells.pkl")
            msg(f"Saving debug info to: {pkl_path}")
            with open(pkl_path, "wb") as fb:
                pickle.dump(cells, fb)

            pkl_path = self.out_path("-tasks.pkl")
            msg(f"Saving debug info to: {pkl_path}")
            with open(pkl_path, "wb") as fb:
                pickle.dump(tasks, fb)

        return True
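
A sketch of how this method might be invoked; how the enclosing object (task_saver below) is constructed, the product name and the temporal-range string are assumptions, not shown in the snippet above:

from datacube import Datacube

dc = Datacube(app="save-tasks-example")         # placeholder app name
# task_saver is an already-constructed instance of the enclosing class;
# its output path, grid, frequency and overwrite flag are configured elsewhere.
ok = task_saver.save(
    dc,
    "my_product",                # placeholder product name
    temporal_range="2020--P1Y",  # assumed start--period form accepted by DateTimeRange
    msg=print,                   # route progress messages to stdout
)
if not ok:
    print("no datasets matched the query")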