Example #1
    def __init__(self, layout, crs=None, extent=None, cellsize=None, dimensions=None):
        self.__jvm = gps.get_spark_context()._gateway.jvm

        if isinstance(layout, gps.LocalLayout):
            if not extent:
                raise ValueError("Must specify an extent when using LocalLayout")

            if dimensions and not cellsize:
                cellsize = ((extent.xmax - extent.xmin)/dimensions[0], (extent.ymax - extent.ymin)/dimensions[1])
                dimensions = None

            if cellsize and not dimensions:
                tilewidth = layout.tile_cols * cellsize[0]
                tileheight = layout.tile_rows * cellsize[1]
                cols = ceil((extent.xmax - extent.xmin) / tilewidth)
                rows = ceil((extent.ymax - extent.ymin) / tileheight)
                extent = gps.Extent(extent.xmin, extent.ymax - rows * tileheight, extent.xmin + cols * tilewidth, extent.ymax)
                tl = gps.TileLayout(cols, rows, layout.tile_cols, layout.tile_rows)
            else:
                raise ValueError("For LocalLayout, must specify exactly one: cellsize or dimension")
        elif isinstance(layout, gps.GlobalLayout):
            try:
                from pyproj import Proj, transform
            except ImportError:
                raise ImportError('pyproj is required for GlobalLayout')

            if not layout.zoom:
                raise ValueError("Must specify a zoom level when using GlobalLayout")

            if not crs:
                raise ValueError("Must specify a crs when using GlobalLayout")

            if isinstance(crs, int):
                crs = "{}".format(crs)

            gtcrs = self.__jvm.geopyspark.geotrellis.TileLayer.getCRS(crs).get()

            if gtcrs.epsgCode().isDefined() and gtcrs.epsgCode().get() == 3857:
                extent = WEB_MERCATOR
            elif gtcrs.epsgCode().isDefined() and gtcrs.epsgCode().get() == 4326:
                extent = LATLNG
            else:
                llex = LATLNG
                proj4str = gtcrs.toProj4String()
                target = Proj(proj4str)
                xmin, ymin = target(llex.xmin, llex.ymin)
                xmax, ymax = target(llex.xmax, llex.ymax)
                extent = gps.Extent(xmin, ymin, xmax, ymax)

            layout_rows_cols = int(pow(2, layout.zoom))
            tl = gps.TileLayout(layout_rows_cols, layout_rows_cols, layout.tile_size, layout.tile_size)
        elif isinstance(layout, gps.LayoutDefinition):
            extent = layout.extent
            tl = layout.tileLayout

        ex = self.__jvm.geotrellis.vector.Extent(float(extent.xmin), float(extent.ymin), float(extent.xmax), float(extent.ymax))
        tilelayout = self.__jvm.geotrellis.raster.TileLayout(int(tl[0]), int(tl[1]), int(tl[2]), int(tl[3]))
        self.layout = gps.LayoutDefinition(extent, tl)
        self.__layout = self.__jvm.geotrellis.spark.tiling.LayoutDefinition(ex, tilelayout)
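
The constructor above accepts three layout descriptions: a LocalLayout (an extent plus exactly one of cellsize or dimensions), a GlobalLayout (a zoom level plus a CRS), or an explicit LayoutDefinition. A minimal usage sketch, assuming this is the gps.KeyTransform constructor exercised in Example #2, that a SparkContext is already running, and that pyproj is installed for the GlobalLayout case:

import geopyspark as gps

# Hedged sketch -- layouts and values are illustrative only.
kt_local = gps.KeyTransform(gps.LocalLayout(256),
                            extent=gps.Extent(0, 0, 10, 10),
                            cellsize=(0.01, 0.01))

kt_global = gps.KeyTransform(gps.GlobalLayout(zoom=2), crs=3857)

kt_explicit = gps.KeyTransform(gps.LayoutDefinition(gps.Extent(0, 0, 1, 1),
                                                    gps.TileLayout(5, 5, 2, 2)))
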
Example #2
class KeyTransformTest(BaseTestClass):

    layout = gps.LayoutDefinition(gps.Extent(0, 0, 1, 1),
                                  gps.TileLayout(5, 5, 2, 2))

    def test_key_to_extent(self):
        kt_layout = gps.KeyTransform(self.layout)
        self.assertEqual(gps.Extent(0.0, 0.8, 0.2, 1.0),
                         kt_layout.key_to_extent(gps.SpatialKey(0, 0)))

        kt_local = gps.KeyTransform(gps.LocalLayout(2),
                                    extent=gps.Extent(0, 0, 1, 1),
                                    dimensions=(10, 10))
        self.assertEqual(gps.Extent(0.0, 0.8, 0.2, 1.0),
                         kt_local.key_to_extent(gps.SpatialKey(0, 0)))

        kt_global = gps.KeyTransform(gps.GlobalLayout(zoom=1), crs=4326)
        nw_global_extent = kt_global.key_to_extent(gps.SpatialKey(0, 0))
        self.assertTrue(
            abs(nw_global_extent.xmin + 180.0) <= 1e-4
            and abs(nw_global_extent.xmax) <= 1e-4
            and abs(nw_global_extent.ymin) <= 1e-4
            and abs(nw_global_extent.ymax - 90) <= 1e-4)

    def test_extent_to_key(self):
        kt = gps.KeyTransform(self.layout)
        self.assertTrue(
            set(kt.extent_to_keys(gps.Extent(0, 0, 0.4, 0.4))) == set(
                [gps.SpatialKey(x, y) for x in [0, 1] for y in [3, 4]]))

    def test_geom_to_key(self):
        kt = gps.KeyTransform(self.layout)
        self.assertTrue(
            kt.geometry_to_keys(Point(0.1, 0.1)) == [gps.SpatialKey(0, 4)])
Example #3
    def windows(line, ws):
        # bounds, width, height, and proj4 are taken from the enclosing scope.
        for w in ws:
            ((row_start, row_stop), (col_start, col_stop)) = w
            left = bounds.left + (bounds.right -
                                  bounds.left) * (float(col_start) / width)
            right = bounds.left + (bounds.right -
                                   bounds.left) * (float(col_stop) / width)
            bottom = bounds.top + (bounds.bottom -
                                   bounds.top) * (float(row_stop) / height)
            top = bounds.top + (bounds.bottom -
                                bounds.top) * (float(row_start) / height)
            extent = gps.Extent(left, bottom, right, top)
            instant = datetime.strptime(line['date'], '%Y%j')
            area_of_interest = box(-122.47678756713866, 37.80924146650164,
                                   -122.46288299560545, 37.80490143094975)

            new_line = line.copy()
            new_line.pop('date')
            new_line.pop('scene_id')
            new_line['window'] = w
            new_line['projected_extent'] = gps.TemporalProjectedExtent(
                extent=extent,
                instant=instant,
                proj4=proj4,
                geometries=area_of_interest)
            yield new_line
Example #4
def extent_for_cell(layout, cell):
    """
    Compute the geodetic extent of a specific tile in a layout

    Args:
        layout (``gps.LayoutDefinition``)
        cell (``gps.SpatialKey`` or ``gps.SpaceTimeKey``)
    Returns:
        ``gps.Extent``
    """
    if isinstance(cell, (gps.SpatialKey, gps.SpaceTimeKey)):
        col = cell.col
        row = cell.row
    elif isinstance(cell, tuple):
        col = cell[0]
        row = cell[1]
    else:
        raise TypeError(
            "extent_for_cell() expects SpatialKey, SpaceTimeKey, or tuple")

    w = (layout.extent.xmax -
         layout.extent.xmin) / layout.tileLayout.layoutCols
    h = (layout.extent.ymax -
         layout.extent.ymin) / layout.tileLayout.layoutRows
    x0 = layout.extent.xmin + col * w
    y0 = layout.extent.ymax - (row + 1) * h

    return gps.Extent(x0, y0, x0 + w, y0 + h)
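
A quick worked check of the arithmetic above (assuming extent_for_cell is importable as defined): with a 5x5 layout over a 10x10-unit extent, each cell spans 2 units and row 0 sits at the top edge.

import geopyspark as gps

layout = gps.LayoutDefinition(gps.Extent(0, 0, 10, 10),
                              gps.TileLayout(5, 5, 200, 200))

# Cell (col=1, row=0): x0 = 0 + 1 * 2 = 2, y0 = 10 - (0 + 1) * 2 = 8
print(extent_for_cell(layout, gps.SpatialKey(1, 0)))
# Extent(xmin=2.0, ymin=8.0, xmax=4.0, ymax=10.0)

# A plain (col, row) tuple works as well.
print(extent_for_cell(layout, (4, 4)))
# Extent(xmin=8.0, ymin=0.0, xmax=10.0, ymax=2.0)
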
Example #5
def _read_windows(uri, xcols, ycols, bands, crs_to_proj4):

    if ("GDAL_DATA" not in os.environ) and (_GDAL_DATA is not None):
        os.environ["GDAL_DATA"] = _GDAL_DATA

    with rasterio.open(uri) as dataset:
        bounds = dataset.bounds
        height = dataset.height
        width = dataset.width
        proj4 = crs_to_proj4(dataset.get_crs())
        nodata = dataset.nodata
        tile_cols = int(math.ceil(width / xcols)) * xcols
        tile_rows = int(math.ceil(height / ycols)) * ycols
        # rasterio windows are ((row_start, row_stop), (col_start, col_stop))
        windows = [((y, min(height - 1, y + ycols)),
                    (x, min(width - 1, x + xcols)))
                   for x in range(0, tile_cols, xcols)
                   for y in range(0, tile_rows, ycols)]

        for window in windows:
            ((row_start, row_stop), (col_start, col_stop)) = window

            left = bounds.left + (bounds.right -
                                  bounds.left) * (float(col_start) / width)
            right = bounds.left + (bounds.right -
                                   bounds.left) * (float(col_stop) / width)
            bottom = bounds.top + (bounds.bottom -
                                   bounds.top) * (float(row_stop) / height)
            top = bounds.top + (bounds.bottom -
                                bounds.top) * (float(row_start) / height)
            extent = gps.Extent(left, bottom, right, top)
            projected_extent = gps.ProjectedExtent(extent=extent, proj4=proj4)

            data = dataset.read(bands, window=window)
            tile = gps.Tile.from_numpy_array(data, no_data_value=nodata)
            yield (projected_extent, tile)
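
A hedged sketch of feeding the generator above into a spatial RasterLayer. The GeoTIFF path is hypothetical, crs_to_proj4 is assumed to be a simple callable that turns rasterio's CRS object into a proj4 string, and a SparkContext configured for geopyspark is assumed to be running.

import geopyspark as gps

sc = gps.get_spark_context()          # assumes the context was already set up

uri = "/tmp/scene.tif"                # hypothetical input raster
pairs = _read_windows(uri, xcols=512, ycols=512, bands=[1],
                      crs_to_proj4=lambda crs: crs.to_proj4())

rdd = sc.parallelize(list(pairs))
layer = gps.RasterLayer.from_numpy_rdd(layer_type=gps.LayerType.SPATIAL,
                                       numpy_rdd=rdd)
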
Example #6
def get_raster_layer(sc, path):
    jp2s = ["B02.jp2", "B03.jp2", "B04.jp2"]
    arrs = []
    for jp2 in jp2s:
        with rasterio.open(path + jp2) as f:
            arrs.append(f.read(1))

    data = np.array(arrs, dtype=arrs[0].dtype)

    # Create an Extent instance from rasterio's bounds
    extent = gps.Extent(*f.bounds)

    # The EPSG code can also be obtained from the information read in via rasterio
    projected_extent = gps.ProjectedExtent(extent=extent,
                                           epsg=int(
                                               f.crs.to_dict()['init'][5:]))

    # We can create a Tile instance from our multiband, raster array and the nodata value from rasterio
    tile = gps.Tile.from_numpy_array(numpy_array=data, no_data_value=f.nodata)

    # Now that we have our ProjectedExtent and Tile, we can create our RDD from them
    rdd = sc.parallelize([(projected_extent, tile)])

    # While there is a time component to the data, this was ignored for this tutorial and
    # instead the focus is just on the spatial information. Thus, we have a LayerType of SPATIAL.
    raster_layer = gps.RasterLayer.from_numpy_rdd(
        layer_type=gps.LayerType.SPATIAL, numpy_rdd=rdd)

    return raster_layer
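
A short, hedged invocation of the helper above; the SparkContext setup and the directory holding the Sentinel-2 JP2 band files are assumptions, and the follow-on tiling mirrors Example #9.

import geopyspark as gps
from pyspark import SparkContext

conf = gps.geopyspark_conf(master="local[*]", appName="sentinel-example")
sc = SparkContext(conf=conf)

# Directory (hypothetical) containing B02.jp2, B03.jp2 and B04.jp2.
raster_layer = get_raster_layer(sc, "/data/sentinel2/S2A_tile/")

# Cut the layer to a global layout in web mercator, as in Example #9.
tiled = raster_layer.tile_to_layout(gps.GlobalLayout(), target_crs=3857)
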
Example #7
    def test_key_to_extent(self):
        kt_layout = gps.KeyTransform(self.layout)
        self.assertEqual(gps.Extent(0.0, 0.8, 0.2, 1.0),
                         kt_layout.key_to_extent(gps.SpatialKey(0, 0)))

        kt_local = gps.KeyTransform(gps.LocalLayout(2),
                                    extent=gps.Extent(0, 0, 1, 1),
                                    dimensions=(10, 10))
        self.assertEqual(gps.Extent(0.0, 0.8, 0.2, 1.0),
                         kt_local.key_to_extent(gps.SpatialKey(0, 0)))

        kt_global = gps.KeyTransform(gps.GlobalLayout(zoom=1), crs=4326)
        nw_global_extent = kt_global.key_to_extent(gps.SpatialKey(0, 0))
        self.assertTrue(
            abs(nw_global_extent.xmin + 180.0) <= 1e-4
            and abs(nw_global_extent.xmax) <= 1e-4
            and abs(nw_global_extent.ymin) <= 1e-4
            and abs(nw_global_extent.ymax - 90) <= 1e-4)
Example #8
def get_slice_indexes_and_extent(nc_file, geojson_shape):
    """
    Calculates x/y slice indexes in the nc file for the given shape.
    :param nc_file: NetCDF File
    :param geojson_shape: Requested shape
    :return: x/y-indexes of shape bounding box, geopyspark extent of bounding box,
                geojson features as polygons in x/y coordinates
    """
    lat_array = nc_file['lat'][:]
    lon_array = nc_file['lon'][:]

    # Transform the geojson into shapes. We need the shapes represented both as
    # indices into the lat-/lon-arrays (to read only the required slices from NetCDF)
    # and as x-/y-values (to mask the constructed layout).
    x_coords = nc_file['rlon'][:]
    y_coords = nc_file['rlat'][:]
    mask_shapes_indices = []
    mask_shapes_xy = []
    for feature in geojson_shape:
        # Get each vertex's index in the lat- and lon-arrays
        vertex_indices = np.array(
            list(
                get_indexes(lat_array, lon_array, lon_array.shape, vertex[1],
                            vertex[0])
                for vertex in feature['geometry']['coordinates'][0]))
        mask_shapes_indices.append(vertex_indices)

        # Get the corresponding x and y values
        vertex_xs = x_coords[vertex_indices[:, 1]]
        vertex_ys = y_coords[vertex_indices[:, 0]]

        # Transform into a polygon
        polygon = Polygon(zip(vertex_xs, vertex_ys))
        mask_shapes_xy.append(polygon)

    # Get the slices to read from NetCDF
    y_slice_start = int(min(s[:, 0].min() for s in mask_shapes_indices))
    x_slice_start = int(min(s[:, 1].min() for s in mask_shapes_indices))
    y_slice_stop = int(max(s[:, 0].max() for s in mask_shapes_indices))
    x_slice_stop = int(max(s[:, 1].max() for s in mask_shapes_indices))

    x_min = float(min(s.bounds[0] for s in mask_shapes_xy))
    y_min = float(min(s.bounds[1] for s in mask_shapes_xy))
    x_max = float(max(s.bounds[2] for s in mask_shapes_xy))
    y_max = float(max(s.bounds[3] for s in mask_shapes_xy))
    extent = gps.Extent(x_min, y_min, x_max, y_max)

    return x_slice_start, x_slice_stop, y_slice_start, y_slice_stop, extent, mask_shapes_xy
Example #9
def map_ndvi(M, img, bounds, crs):
    # Start a spark context if needed
    init_sc()
    # Color ramp for NDVI
    ndvi_breaks_dict = {
        0.05: 0xffffe5aa,
        0.1: 0xf7fcb9ff,
        0.2: 0xd9f0a3ff,
        0.3: 0xaddd8eff,
        0.4: 0x78c679ff,
        0.5: 0x41ab5dff,
        0.6: 0x238443ff,
        0.7: 0x006837ff,
        1.0: 0x004529ff
    }
    ndvi_color_map = gps.ColorMap.from_break_map(ndvi_breaks_dict)

    # Convert the CRS into a proj4 string
    srs = osr.SpatialReference()
    srs.ImportFromWkt(crs.wkt)
    proj4 = srs.ExportToProj4()

    # Create the projected extent
    projected_extent = gps.ProjectedExtent(gps.Extent(bounds.left,
                                                      bounds.bottom,
                                                      bounds.right,
                                                      bounds.top),
                                           proj4=proj4)

    tiles = sc.parallelize([(projected_extent,
                             gps.Tile.from_numpy_array(img,
                                                       no_data_value=0.0))])
    raster_layer = gps.geotrellis.RasterLayer.from_numpy_rdd(
        gps.LayerType.SPATIAL, tiles)
    tiled_raster_layer = raster_layer.tile_to_layout(
        gps.GlobalLayout(),
        target_crs=3857,
        partition_strategy=gps.HashPartitionStrategy(40))
    pyramid = tiled_raster_layer.pyramid(
        resample_method=gps.ResampleMethod.BILINEAR)

    tms = gps.TMS.build(pyramid, ndvi_color_map)
    M.add_layer(TMSRasterData(tms), name="ndvi")
Example #10
    def key_to_extent(self, key, *args):
        """Returns the Extent corresponding to a given key.

        Args:
            key (:class:`~geopyspark.geotrellis.SpatialKey` or :class:`~geopyspark.geotrellis.SpaceTimeKey` or tuple or int): The
                key to find the extent for.  A ``(col, row)`` tuple is also accepted.  If of type int, this parameter is
                the column of the key, and a single additional int must be passed in ``args`` to serve as the row of the key.

        Returns:
            :class:`~geopyspark.geotrellis.Extent`
        """
        if isinstance(key, (gps.SpatialKey, gps.SpaceTimeKey)):
            skey = self.__jvm.geotrellis.spark.SpatialKey(key.col, key.row)
        elif isinstance(key, tuple):
            skey = self.__jvm.geotrellis.spark.SpatialKey(key[0], key[1])
        elif isinstance(key, int) and len(args) == 1 and isinstance(args[0], int):
            skey = self.__jvm.geotrellis.spark.SpatialKey(key, args[0])
        else:
            raise ValueError("Please supply either gps.SpatialKey, gps.SpaceTimeKey, (int, int), or two ints")
        ex = self.__layout.mapTransform().apply(skey)
        return gps.Extent(ex.xmin(), ex.ymin(), ex.xmax(), ex.ymax())
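
The method accepts three key spellings; a brief sketch, assuming it is the key_to_extent method of gps.KeyTransform (as in Example #2) and that a SparkContext is running:

import geopyspark as gps

kt = gps.KeyTransform(gps.LayoutDefinition(gps.Extent(0, 0, 1, 1),
                                           gps.TileLayout(5, 5, 2, 2)))

kt.key_to_extent(gps.SpatialKey(0, 0))   # a key object
kt.key_to_extent((0, 0))                 # a (col, row) tuple
kt.key_to_extent(0, 0)                   # separate col and row ints
# All three return Extent(xmin=0.0, ymin=0.8, xmax=0.2, ymax=1.0)
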
Example #11
def buffered_cell_extent(layout, px_buffer, cell):
    """
    Compute the extent of a cell in a layout with a buffer

    This function computes the extent of a cell and adds ``px_buffer`` worth of area
    on all sides.  That is, if the tile dimension in a given layout is n x n pixels,
    then this function returns the extent for an ``(n + 2 * px_buffer)``-pixel square region
    centered on the given cell.

    Args:
        layout (``gps.LayoutDefinition``)
        px_buffer (int): number of pixels to pad the border of the extent with
        cell (``gps.SpatialKey`` or ``gps.SpaceTimeKey``): identifier of the desired
            layout cell

    Returns:
        ``gps.Extent``
    """
    ex = extent_for_cell(layout, cell)
    cx, cy = cell_size(layout)
    return gps.Extent(ex.xmin - cx * px_buffer, ex.ymin - cy * px_buffer,
                      ex.xmax + cx * px_buffer, ex.ymax + cy * px_buffer)
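
A worked sketch, assuming the extent_for_cell and cell_size helpers referenced above are available and that cell_size(layout) returns the (x, y) size of one pixel in layout units:

import geopyspark as gps

layout = gps.LayoutDefinition(gps.Extent(0, 0, 10, 10),
                              gps.TileLayout(5, 5, 200, 200))

# Each cell covers 2 x 2 units at 200 x 200 pixels, so one pixel is 0.01 units;
# a 10-pixel buffer widens the cell extent by 0.1 on every side.
print(buffered_cell_extent(layout, 10, gps.SpatialKey(0, 0)))
# Extent(xmin=-0.1, ymin=7.9, xmax=2.1, ymax=10.1)
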
Example #12
def convert_extent(extent_sc: JavaObject) -> geopyspark.Extent:
    return geopyspark.Extent(extent_sc.xmin(), extent_sc.ymin(),
                             extent_sc.xmax(), extent_sc.ymax())
Example #13
    def test_extent_to_key(self):
        kt = gps.KeyTransform(self.layout)
        self.assertTrue(
            set(kt.extent_to_keys(gps.Extent(0, 0, 0.4, 0.4))) == set(
                [gps.SpatialKey(x, y) for x in [0, 1] for y in [3, 4]]))
Example #14
def execute(spark, logger, s3_bucket, run_id, aoi_name, complete_catalog,
            probability_images, seed, config_filename):
    """The primary script

    Args:
        spark (``pyspark.sql.SparkSession``)
        logger (``py4j.JavaObject``)
        s3_bucket (str): Name of the S3 bucket to search for configuration objects
            and save results to
        run_id (str): The identifier of the current run
        aoi_name (str): The identifier for the current area of interest
        complete_catalog (bool): If True, classify and write probability images
            for every cell in the image catalog rather than a sample
        probability_images (int): The number of tiles to save the generated
            probability images for
        seed (int): A random seed used to sample the probability images, for
            reproducibility
        config_filename (str): Name of the YAML configuration object in
            ``s3_bucket``

    Required external inputs:
        <s3_bucket>/cvmapper_config.yaml
            under ``learner`` key:
                    prefix: The S3 prefix under which CSVs can be read and written
                    pool: Name of CSV file under s3_bucket/prefix giving the
                        comprehensive list of active grid cells
                    incoming_names: Name of CSV file under s3_bucket/prefix giving
                        list of cells used for training/validation
                    image_catalog: Name of CSV file under s3_bucket giving catalog
                        of imagery
                    image_output_pattern: URI pattern used for output of probability
                        images.  Must contain two '{}' tokens to be replaced by the
                        column and row for the relevant cell
                    outgoing: S3 URI to save the CSV of worst-performing cells to

        location pool:
            A CSV of ``name``, ``col``, ``row`` for each grid cell under
            consideration.  Identified by ``pool`` parameter above.

        incoming names:
            CSV containing (at least) ``name``, ``iteration``, and ``usage``
            columns.  Every name in this file must also be contained in the image
            pool.  Location of this file given in YAML file.

        image catalog:
            A CSV minimally containing ``col``, ``row``, ``season``, and ``uri``
            columns.  Season is either 'GS' or 'OS'.  Every grid cell in the
            location pool must be contained here, and must have an entry for both
            seasons.  URI points to TIFF that completely covers listed cell with
            valid image data (no NODATA values).

    Note:

        Grid cells are defined according to the master_layout object, which
        specifies a rectangular extent in long/lat coords.  This extent is
        subdivided into cells (in this case, 13792 columns and 14477 rows).
        Each cell is then given a pixel resolution (in this case 200x200, but
        whatever is chosen must match the resolution of the label images
        provided in the ``s3://<s3_bucket>/<prefix>/<name>_<col>_<row>.tif``
        files identified by the incoming names CSV).  When we refer to tiles,
        we mean image chips of the stated resolution, indexed by
        ``gps.SpatialKey`` objects.  The key is a col/row pair where row=0,
        col=0 corresponds to the chip in the upper left corner of the bounding
        extent.

    Note:

        Grid cell names for the output probability images
        (`image_output_pattern`) are relative to a different, coarser layout.
        These grid cell ids need not be clearly defined, since the output of
        this process is simply a bucket of COGs for display using another
        tool.  However, see the `coarse_layout` definition below for specific
        details of the layout.

    """
    params = parse_yaml_from_s3(s3_bucket, config_filename)['learner']
    label_path = parse_yaml_from_s3(
        s3_bucket, config_filename)['labeller']['consensus_directory'][1:-1]
    s3_prefix = params['prefix']
    s3_prefix = s3_prefix[0:-1] if s3_prefix.endswith('/') else s3_prefix

    catalog_prefix = params['image_catalog']
    catalog_prefix_fix = params['image_catalog_fix']

    feature_names = functools.reduce(lambda a, b: a + b, [[
        "{}_raw_{}".format(season, n), "{}_avg_{}".format(season, n),
        "{}_std_{}".format(season, n)
    ] for season in ["GS", "OS"] for n in range(1, 5)])

    master_layout = gps.LayoutDefinition(
        gps.Extent(-17.541, -35.46, 51.459, 37.54),
        gps.TileLayout(13800, 14600, 200, 200))
    master_metadata = gps.Metadata(
        gps.Bounds(gps.SpatialKey(0, 0), gps.SpatialKey(13800, 14600)),
        "+proj=longlat +datum=WGS84 +no_defs ", gps.CellType.INT8,
        master_layout.extent, master_layout)

    ####################################
    logger.warn("Reading source tables")

    checkpoint = time.time()
    f_pool = spark\
         .read\
         .option('inferSchema', True)\
         .option('header', True)\
         .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['pool']))\
         .repartition('col', 'row')

    qs_in = spark \
        .read \
        .option('inferSchema', True) \
        .option('header', True) \
        .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['qs'])) \
        .repartition('col', 'row')

    incoming = spark.read\
                    .option('header', True)\
                    .schema(StructType([
                        StructField('name', StringType()),
                        StructField('run', IntegerType()),
                        StructField('iteration', IntegerType()),
                        StructField('processed', BooleanType()),
                        StructField('usage', StringType()),
                        StructField('label', StringType())
                    ]))\
                    .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['incoming_names']))

    # merge incoming_names and incoming_names_static
    incoming = incoming.union(spark.read \
        .option('header', True) \
        .schema(StructType([
        StructField('name', StringType()),
        StructField('run', IntegerType()),
        StructField('iteration', IntegerType()),
        StructField('processed', BooleanType()),
        StructField('usage', StringType()),
        StructField('label', StringType())
    ])) \
        .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['incoming_names_static'])))

    incoming = incoming.filter(incoming['run'] == params['runid']).filter(
        incoming['label'] == True)
    test_names = f_pool.join(incoming.select('name'), 'name',
                             'left_anti').withColumn("usage", lit("test"))
    all_names = f_pool.join(incoming.select('name', 'usage'),
                            f_pool.name == incoming.name,
                            how='left')\
                      .select(f_pool.name.alias('name'), 'col', 'row', 'usage')
    num_test_images = test_names.count()

    image_catalog = spark.read\
                          .option('inferSchema', True)\
                          .option('header', True)\
                          .csv('s3n://{}/{}'.format(s3_bucket, catalog_prefix))\
                          .repartition('col', 'row')
    all_image_uris = image_catalog\
                     .filter(image_catalog['season'] == 'GS')\
                     .alias('gs')\
                     .join(image_catalog.filter(image_catalog['season'] == 'OS').alias('os'),
                           (col('gs.col') == col('os.col')) & (col('gs.row') == col('os.row')))\
                     .select(col('gs.col'), col('gs.row'), col('gs.uri').alias('GS'), col('os.uri').alias('OS'))
    logger.warn(
        "Elapsed time for reading source tables: {}s".format(time.time() -
                                                             checkpoint))
    ####################################
    logger.warn("Reading training labels & building training features")

    checkpoint = time.time()
    training_data = gather_data(all_image_uris,
                                all_names.filter(all_names.usage == 'train'),
                                master_metadata,
                                feature_names,
                                s3_bucket,
                                label_path,
                                include_masks=True)
    training_data.show()
    logger.warn(
        "Elapsed time for reading training labels and feature building: {}s".
        format(time.time() - checkpoint))

    ####################################
    logger.warn("Balancing data")

    checkpoint = time.time()
    balanced_data = balance_samples(spark, training_data, 'mask')
    balanced_data.show()
    logger.warn("Elapsed time for balancing data: {}s".format(time.time() -
                                                              checkpoint))

    ####################################
    logger.warn("Training model")

    checkpoint = time.time()
    pipeline = ml_pipeline(feature_names, 'mask')
    model = pipeline.fit(balanced_data)
    print(model)
    logger.warn("Elapsed time for training the model: {}s".format(time.time() -
                                                                  checkpoint))

    ####################################
    logger.warn("Validating model results")

    checkpoint = time.time()
    validation_data = gather_data(
        all_image_uris,
        all_names.filter(all_names.usage == 'validate'),
        master_metadata,
        feature_names,
        s3_bucket,
        label_path,
        include_masks=True)

    valid_fit = model.transform(validation_data).select(
        'prediction', 'probability', 'mask')

    metrics = MulticlassMetrics(
        valid_fit.rdd.map(lambda r: (r.prediction, r.mask)))
    # left to right, top to bottom
    confusion_matrix = metrics.confusionMatrix().toArray().flatten().tolist()
    tss = 1.0 * confusion_matrix[3] / (confusion_matrix[3] + confusion_matrix[2]) + \
          1.0 * confusion_matrix[0] / (confusion_matrix[0] + confusion_matrix[1]) - 1
    binmetrics = BinaryClassificationMetrics(
        valid_fit.rdd.map(lambda r: (float(r['probability'][1]), r['mask'])))

    last_iteration = incoming.agg(F.max('iteration')).collect()[0][0]
    report = pd.DataFrame({
        'run': [run_id],
        'iteration': [last_iteration + 1],
        'tss': [tss],
        'accuracy': [metrics.accuracy],
        'precision': [metrics.precision(1.0)],
        'recall': [metrics.recall(1.0)],
        'fpr': [metrics.falsePositiveRate(1.0)],
        'tpr': [metrics.truePositiveRate(1.0)],
        'AUC': [binmetrics.areaUnderROC],
        'aoi': [aoi_name],
        'iteration_time': [datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f')]
    })
    # TODO: allow target location to be derived from params (local or s3)
    # added because of an error where incoming_metrics.csv contained different iteration number (10)
    # than expected by DB (4). Ryan's guess is that this is due to multiple test clusters overwriting csv
    # print("############Old Iteration Metrics  to overwrite###########")
    # incoming_previous = pd.read_csv(os.path.join("s3://",s3_bucket,s3_prefix,params['metrics']))
    # print(incoming_previous.to_string())
    # print("############New Iteration Metrics to use to overwrite###########")
    # print(report.to_string())
    pd_df_to_s3_csv(report, s3_bucket,
                    os.path.join(s3_prefix, params['metrics']))
    logger.warn(
        "Elapsed time for validating and saving metrics to s3: {}s".format(
            time.time() - checkpoint))

    ####################################
    logger.warn("Classifying test data")

    checkpoint = time.time()
    filtered_names = test_names.filter(test_names.usage == "test")
    # filtered_names.cache()
    # filtered_names.show()
    test_features = gather_data(all_image_uris, filtered_names,
                                master_metadata, feature_names, s3_bucket)

    test_features_sample = test_features.sample(True, 0.1)

    fitted = model.transform(test_features_sample).select(
        'spatial_key', 'column_index', 'row_index', 'probability',
        'prediction')
    # fitted.cache()
    # fitted.show()
    grouped = fitted.groupBy('spatial_key')

    # don't want to use following UDF, but indication is that there is a bug in pyspark preventing vector accesses:
    # https://stackoverflow.com/questions/44425159/access-element-of-a-vector-in-a-spark-dataframe-logistic-regression-probability
    # (This did not work without the UDF!)
    firstelement = F.udf(lambda v: float(v[0]), FloatType())
    # added this UDF to select the probability of field rather than no field to write to probability images
    secondelement = F.udf(lambda v: float(v[1]), FloatType())

    logger.warn(
        "Elapsed time for classifying test grids: {}s".format(time.time() -
                                                              checkpoint))

    ####################################
    if probability_images > 0 or complete_catalog:
        logger.warn("Write catalog of {} probability images".format(
            probability_images))
        checkpoint = time.time()

        if complete_catalog:

            # new catalog
            image_catalog_fix = spark.read \
                .option('inferSchema', True) \
                .option('header', True) \
                .csv('s3n://{}/{}'.format(s3_bucket, catalog_prefix_fix)) \
                .repartition('col', 'row')
            all_image_uris_fix = image_catalog_fix \
                .filter(image_catalog_fix['season'] == 'GS') \
                .alias('gs') \
                .join(image_catalog_fix.filter(image_catalog_fix['season'] == 'OS').alias('os'),
                      (col('gs.col') == col('os.col')) & (col('gs.row') == col('os.row'))) \
                .select(col('gs.col'), col('gs.row'), col('gs.uri').alias('GS'), col('os.uri').alias('OS'))

            #recollect all pixels for all testing images
            compreh_names = f_pool.join(qs_in,
                                        ['name', 'col', 'row', 'name_col_row'],
                                        'outer')
            features_compreh = gather_data(all_image_uris_fix, compreh_names,
                                           master_metadata, feature_names,
                                           s3_bucket)
            fitted_compreh = model.transform(features_compreh)\
                 .select('spatial_key', 'column_index', 'row_index', 'probability', 'prediction')
            grouped_compreh = fitted_compreh.groupBy('spatial_key')
            # added to test sampling
            assembled = grouped_compreh.agg(
                assembleTile('column_index', 'row_index',
                             secondelement('probability'),
                             master_layout.tileLayout.tileCols,
                             master_layout.tileLayout.tileRows,
                             'float32').alias('probability'))
            layer = gps.TiledRasterLayer.from_rasterframe(assembled.asRF())

        else:
            ####################################
            logger.warn("Identify worst performing cells")
            checkpoint = time.time()
            # TODO: Determine which images to take
            certainty = grouped \
                .agg(F.avg(F.pow(firstelement(fitted.probability) - lit(0.5), 2.0)).alias('certainty')).cache()
            certainty.show()

            worst_keys_rdd = certainty \
                .sort('certainty') \
                .select('spatial_key') \
                .limit(round(certainty.count() * 0.05)) \
                .rdd.takeSample(False, (params['number_outgoing_names']))
            worst_keys = spark.createDataFrame(worst_keys_rdd)
            outgoing_names = worst_keys \
                .join(f_pool, (col('spatial_key.col') == col('col')) & (col('spatial_key.row') == col('row'))) \
                .select('name') \
                .withColumn('run', lit(run_id)) \
                .withColumn('iteration', lit(last_iteration + 1)) \
                .withColumn('processed', lit(False)) \
                .withColumn('usage', lit('train')) \
                .toPandas()
            uri = urlparse.urlparse(params['outgoing'])
            pd_df_to_s3_csv(outgoing_names, uri.netloc, uri.path[1:])
            logger.warn(
                "Elapsed time for sorting certainty, converting to Pandas Dataframe, and saving to s3: {}s"
                .format(time.time() - checkpoint))

            ###########################################
            checkpoint = time.time()
            # sampling testing images (num = probability_images)
            filtered_names_sample = filtered_names\
                .sample(False, min(1.0, float(probability_images) / float(num_test_images)), seed=seed)\
                .join(image_catalog.filter(image_catalog['season'] == 'GS'), ['col', 'row'])\
                .select('scene_id')\
                .dropDuplicates()\
                .join(image_catalog.filter(image_catalog['season'] == 'GS'), 'scene_id')\
                .join(f_pool.join(qs_in, ['name', 'col', 'row', 'name_col_row'], 'outer'), ['col','row'])\
                .select('name', 'col', 'row', 'name_col_row')

            #re-collect all pixels within sampled images
            features_images = gather_data(all_image_uris,
                                          filtered_names_sample,
                                          master_metadata, feature_names,
                                          s3_bucket)
            #reclassify sampled testing images
            fitted_images = model.transform(features_images)\
                    .select('spatial_key', 'column_index', 'row_index', 'probability', 'prediction')
            grouped_sample = fitted_images.join(
                filtered_names_sample, (col('spatial_key.col') == col('col')) &
                (col('spatial_key.row') == col('row'))).groupby('spatial_key')
            assembled = grouped_sample.agg(
                assembleTile('column_index', 'row_index',
                             secondelement('probability'),
                             master_layout.tileLayout.tileCols,
                             master_layout.tileLayout.tileRows,
                             'float32').alias('probability'))
            layer = gps.TiledRasterLayer.from_rasterframe(assembled.asRF())

        coarse_layout = gps.LayoutDefinition(
            gps.Extent(-17.541, -35.46, 51.459, 37.54),
            gps.TileLayout(1380, 1460, 2000, 2000))
        # we multiply by 100 to select digits that will be kept after converting from float to int.
        # range of int8 is to 128, so we can only preserve 2 sig figs
        output_tiles = (layer*100).convert_data_type(gps.CellType.INT8)\
                            .tile_to_layout(coarse_layout)\
                            .to_geotiff_rdd(storage_method=gps.StorageMethod.TILED)

        cog_location = '/tmp/image_{}_{}.tif' if 'image_output_pattern' not in params else params[
            'image_output_pattern']
        output_tiles.foreach(lambda pair: write_bytes_to_s3(
            cog_location.format(pair[0].col, pair[0].row, aoi_name, run_id,
                                str(last_iteration + 1)), pair[1]))
        logger.warn(
            "Elapsed time for writing catalog of probability images: {}s".
            format(time.time() - checkpoint))
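
For orientation, a hedged sketch of the ``learner`` section that execute() reads from <s3_bucket>/cvmapper_config.yaml, written as the dict parse_yaml_from_s3 is assumed to return; the keys mirror the params[...] lookups in the code above, while the values are purely illustrative.

# Illustrative only -- keys mirror the params[...] lookups in execute() above.
learner_params = {
    'prefix': 'cvmapper/run-042',                        # S3 prefix for CSV inputs/outputs
    'pool': 'location_pool.csv',                         # comprehensive list of grid cells
    'qs': 'qs_cells.csv',                                # extra cells joined to the pool
    'incoming_names': 'incoming_names.csv',              # training/validation cells
    'incoming_names_static': 'incoming_names_static.csv',
    'image_catalog': 'image_catalog.csv',                # per-season imagery URIs
    'image_catalog_fix': 'image_catalog_fix.csv',
    'image_output_pattern': 's3://my-bucket/probability/image_{}_{}.tif',
    'outgoing': 's3://my-bucket/cvmapper/outgoing_names.csv',
    'metrics': 'incoming_metrics.csv',
    'number_outgoing_names': 50,
    'runid': 1,
}
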
Example #15
def load_test_collection(
    collection_id: str,
    collection_metadata: GeopysparkCubeMetadata,
    extent,
    srs: str,
    from_date: str,
    to_date: str,
    bands=None,
    correlation_id: str = "NA",
) -> Dict[int, geopyspark.TiledRasterLayer]:
    """
    Load synthetic data as test collection
    :param collection_id:
    :param collection_metadata:
    :param extent:
    :param srs:
    :param from_date:
    :param to_date:
    :param bands:
    :param correlation_id:
    :return:
    """
    # TODO: support more test collections
    assert collection_id == "TestCollection-LonLat4x4"
    grid_size: float = 1.0
    tile_size = 4

    # TODO: support other srs'es?
    assert srs == "EPSG:4326"

    # Get bounds of tiling layout
    extent = geopyspark.Extent(extent.xmin(), extent.ymin(), extent.xmax(),
                               extent.ymax())
    col_min = int(math.floor(extent.xmin / grid_size))
    row_min = int(math.floor(extent.ymin / grid_size))
    col_max = int(math.ceil(extent.xmax / grid_size) - 1)
    row_max = int(math.ceil(extent.ymax / grid_size) - 1)

    # Simulate sparse range of observation dates
    from_date = rfc3339.parse_datetime(rfc3339.datetime(from_date))
    to_date = rfc3339.parse_datetime(rfc3339.datetime(to_date))
    dates = dates_between(from_date, to_date)

    # Build RDD of tiles with requested bands.
    tile_builder = TestCollectionLonLat(tile_size=tile_size,
                                        grid_size=grid_size)
    bands = bands or [b.name for b in collection_metadata.bands]
    rdd_data = [(SpaceTimeKey(col, row, date),
                 tile_builder.get_tile(bands=bands,
                                       col=col,
                                       row=row,
                                       date=date))
                for col in range(col_min, col_max + 1)
                for row in range(row_min, row_max + 1) for date in dates]
    rdd = SparkContext.getOrCreate().parallelize(rdd_data)

    metadata = Metadata(
        bounds=Bounds(SpaceTimeKey(col_min, row_min, min(dates)),
                      SpaceTimeKey(col_max, row_max, max(dates))),
        crs="+proj=longlat +datum=WGS84 +no_defs ",
        cell_type=CellType.FLOAT64,
        extent=extent,
        layout_definition=LayoutDefinition(
            extent=geopyspark.Extent(col_min * grid_size, row_min * grid_size,
                                     (col_max + 1) * grid_size,
                                     (row_max + 1) * grid_size),
            tileLayout=TileLayout(layoutCols=col_max - col_min + 1,
                                  layoutRows=row_max - row_min + 1,
                                  tileCols=tile_size,
                                  tileRows=tile_size)))
    layer = TiledRasterLayer.from_numpy_rdd(LayerType.SPACETIME, rdd, metadata)
    return {0: layer}