def test_mask(self):
    from pyspark.sql import Row
    from pyrasterframes.rf_types import Tile, CellType

    np.random.seed(999)
    # importantly, exclude 0 from the range because that's the nodata value for the `data_tile`'s cell type
    ma = np.ma.array(np.random.randint(1, 10, (5, 5), dtype='int8'),
                     mask=np.random.rand(5, 5) > 0.7)
    expected_data_values = ma.compressed().size
    expected_no_data_values = ma.size - expected_data_values
    self.assertTrue(expected_data_values > 0, "Make sure random seed is cooperative")
    self.assertTrue(expected_no_data_values > 0, "Make sure random seed is cooperative")

    data_tile = Tile(np.ones(ma.shape, ma.dtype), CellType.uint8())

    df = self.spark.createDataFrame([Row(t=data_tile, m=Tile(ma))]) \
        .withColumn('masked_t', rf_mask('t', 'm'))

    result = df.select(rf_data_cells('masked_t')).first()[0]
    self.assertEqual(
        result, expected_data_values,
        f"Masked tile should have {expected_data_values} data values but found: {df.select('masked_t').first()[0].cells}. "
        f"Original data: {data_tile.cells} "
        f"Masked by {ma}")

    nd_result = df.select(rf_no_data_cells('masked_t')).first()[0]
    self.assertEqual(nd_result, expected_no_data_values)

    # deserialization of the tile is correct
    self.assertEqual(
        df.select('masked_t').first()[0].cells.compressed().size,
        expected_data_values)

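# A minimal NumPy-only sketch of the masking semantics exercised above, assuming
# rf_mask transfers the mask tile's nodata mask onto the data tile. The helper
# name below is hypothetical and not part of the RasterFrames API.
@staticmethod
def _mask_sketch(data_cells, mask_cells):
    # cells that are nodata in the mask become nodata in the result
    return np.ma.masked_array(data_cells, mask=np.ma.getmaskarray(mask_cells))
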
def test_mask_bits(self):
    t = Tile(42 * np.ones((4, 4), 'uint16'), CellType.uint16())
    # with a variety of known values
    mask = Tile(np.array([
        [1, 1, 2720, 2720],
        [1, 6816, 6816, 2756],
        [2720, 2720, 6900, 2720],
        [2720, 6900, 6816, 1]
    ]), CellType('uint16raw'))

    df = self.spark.createDataFrame([Row(t=t, mask=mask)])

    # removes fill value 1
    mask_fill_df = df.select(rf_mask_by_bit('t', 'mask', 0, True).alias('mbb'))
    mask_fill_tile = mask_fill_df.first()['mbb']

    self.assertTrue(mask_fill_tile.cell_type.has_no_data())

    self.assertEqual(
        mask_fill_df.select(rf_data_cells('mbb')).first()[0],
        16 - 4)

    # mask out 6816, 6900
    mask_med_hi_cir = df.withColumn('mask_cir_mh',
                                    rf_mask_by_bits('t', 'mask', 11, 2, [2, 3])) \
        .first()['mask_cir_mh'].cells

    self.assertEqual(mask_med_hi_cir.mask.sum(), 5)

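# A NumPy-only sketch of the bit-field masking logic tested above, assuming
# rf_mask_by_bits(t, m, start, num, values) masks cells whose extracted field
# (m >> start) & (2**num - 1) falls in `values`. Hypothetical helper, not the
# RasterFrames API. E.g. 6816 >> 11 == 3, so bits 11-12 select it via [2, 3].
@staticmethod
def _mask_by_bits_sketch(data_cells, mask_cells, start_bit, num_bits, values):
    # isolate the bit field, then mask data cells where the field matches
    field = (mask_cells >> start_bit) & ((1 << num_bits) - 1)
    return np.ma.masked_array(data_cells, mask=np.isin(field, values))
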
def test_local_min_max_clamp(self):
    tile = Tile(np.random.randint(-20, 20, (10, 10)), CellType.int8())
    min_tile = Tile(np.random.randint(-20, 0, (10, 10)), CellType.int8())
    max_tile = Tile(np.random.randint(0, 20, (10, 10)), CellType.int8())

    df = self.spark.createDataFrame([Row(t=tile, mn=min_tile, mx=max_tile)])

    assert_equal(
        df.select(rf_local_min('t', 'mn')).first()[0].cells,
        np.clip(tile.cells, None, min_tile.cells))

    assert_equal(
        df.select(rf_local_min('t', -5)).first()[0].cells,
        np.clip(tile.cells, None, -5))

    assert_equal(
        df.select(rf_local_max('t', 'mx')).first()[0].cells,
        np.clip(tile.cells, max_tile.cells, None))

    assert_equal(
        df.select(rf_local_max('t', 5)).first()[0].cells,
        np.clip(tile.cells, 5, None))

    assert_equal(
        df.select(rf_local_clamp('t', 'mn', 'mx')).first()[0].cells,
        np.clip(tile.cells, min_tile.cells, max_tile.cells))

def test_rf_where(self):
    cond = Tile(np.random.binomial(1, 0.35, (10, 10)), CellType.uint8())
    x = Tile(np.random.randint(-20, 10, (10, 10)), CellType.int8())
    y = Tile(np.random.randint(0, 30, (10, 10)), CellType.int8())

    df = self.spark.createDataFrame([Row(cond=cond, x=x, y=y)])
    result = df.select(rf_where('cond', 'x', 'y')).first()[0].cells
    assert_equal(result, np.where(cond.cells, x.cells, y.cells))

def test_mask_by_values(self):
    tile = Tile(np.random.randint(1, 100, (5, 5)), CellType.uint8())
    mask_tile = Tile(np.array(range(1, 26), 'uint8').reshape(5, 5))
    expected_diag_nd = Tile(np.ma.masked_array(tile.cells, mask=np.eye(5)))

    df = self.spark.createDataFrame([Row(t=tile, m=mask_tile)]) \
        .select(rf_mask_by_values('t', 'm', [1, 7, 13, 19, 25]))  # values on the diagonal
    result0 = df.first()
    # assert_equal(result0[0].cells, expected_diag_nd)
    self.assertTrue(result0[0] == expected_diag_nd)

def test_rf_rescale_per_tile(self):
    x1 = Tile(np.random.randint(-20, 42, (10, 10)), CellType.int8())
    x2 = Tile(np.random.randint(20, 242, (10, 10)), CellType.int8())
    df = self.spark.createDataFrame([Row(x=x1), Row(x=x2)])

    result = df.select(rf_rescale('x').alias('x_prime')) \
        .agg(rf_agg_stats('x_prime').alias('stat')) \
        .select('stat.min', 'stat.max') \
        .first()
    self.assertEqual(result[0], 0.0)
    self.assertEqual(result[1], 1.0)

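# For reference, per-tile rescaling as asserted above can be sketched in NumPy
# as (x - min) / (max - min), mapping each tile's own extrema onto [0, 1]. This
# is an assumption about rf_rescale's single-argument form, not its implementation.
@staticmethod
def _rescale_per_tile_sketch(cells):
    lo, hi = cells.min(), cells.max()
    return (cells.astype('float64') - lo) / (hi - lo)
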
def test_rf_local_is_in(self):
    from pyspark.sql.functions import lit, array, col
    from pyspark.sql import Row

    nd = 5
    t = Tile(np.array([[1, 3, 4], [nd, 0, 3]]),
             CellType.uint8().with_no_data_value(nd))
    # note the convert is due to issue #188
    df = self.spark.createDataFrame([Row(t=t)]) \
        .withColumn('a', array(lit(3), lit(4))) \
        .withColumn('in2', rf_convert_cell_type(
            rf_local_is_in(col('t'), array(lit(0), lit(4))), 'uint8')) \
        .withColumn('in3', rf_convert_cell_type(rf_local_is_in('t', 'a'), 'uint8')) \
        .withColumn('in4', rf_convert_cell_type(
            rf_local_is_in('t', array(lit(0), lit(4), lit(3))), 'uint8')) \
        .withColumn('in_list', rf_convert_cell_type(rf_local_is_in(col('t'), [4, 1]), 'uint8'))

    result = df.first()
    self.assertEqual(result['in2'].cells.sum(), 2)
    assert_equal(result['in2'].cells, np.isin(t.cells, np.array([0, 4])))

    self.assertEqual(result['in3'].cells.sum(), 3)
    self.assertEqual(result['in4'].cells.sum(), 4)
    self.assertEqual(
        result['in_list'].cells.sum(), 2,
        "Tile value {} should contain two 1s as: [[1, 0, 1], [0, 0, 0]]"
        .format(result['in_list'].cells))

def test_agg_local_mean(self):
    from pyspark.sql import Row
    from pyrasterframes.rf_types import Tile

    # this is really testing the nodata propagation in the agg local summation
    ct = CellType.int8().with_no_data_value(4)
    df = self.spark.createDataFrame([
        Row(tile=Tile(np.array([[1, 2, 3, 4, 5, 6]]), ct)),
        Row(tile=Tile(np.array([[1, 2, 4, 3, 5, 6]]), ct)),
    ])

    result = df.agg(rf_agg_local_mean('tile').alias('mean')).first().mean
    expected = Tile(np.array([[1.0, 2.0, 3.0, 3.0, 5.0, 6.0]]), CellType.float64())
    self.assertEqual(result, expected)

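# A sketch of the nodata propagation this test exercises: stack the tiles as
# masked arrays and take a cellwise mean that skips masked cells. This assumes
# rf_agg_local_mean averages only the data cells at each position; the helper
# below is hypothetical, not part of the RasterFrames API.
@staticmethod
def _agg_local_mean_sketch(masked_cell_arrays):
    stacked = np.ma.stack(masked_cell_arrays)
    return stacked.mean(axis=0)  # np.ma mean ignores masked (nodata) cells
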
def test_extract_bits(self):
    one = np.ones((6, 6), 'uint8')
    t = Tile(84 * one)
    df = self.spark.createDataFrame([Row(t=t)])

    result_py_literals = df.select(rf_local_extract_bits('t', 2, 3)).first()[0]
    # expected value: 84 => 0b1010100; bits 2..4 => 0b101 => 5
    assert_equal(result_py_literals.cells, 5 * one)

    result_cols = df.select(rf_local_extract_bits('t', lit(2), lit(3))).first()[0]
    assert_equal(result_cols.cells, 5 * one)

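# The bit arithmetic above, spelled out: assuming rf_local_extract_bits(t, start, num)
# computes (cells >> start) & (2**num - 1), then for 84 == 0b1010100, taking 3 bits
# starting at position 2 gives (84 >> 2) & 0b111 == 0b101 == 5. Hypothetical helper.
@staticmethod
def _extract_bits_sketch(cells, start_bit, num_bits):
    # shift the field of interest to the low-order bits, then mask it off
    return (cells >> start_bit) & ((1 << num_bits) - 1)
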
def test_rf_standardize_per_tile(self):
    # 10k samples, so the stats should be pretty stable
    x = Tile(np.random.randint(-20, 0, (100, 100)), CellType.int8())
    df = self.spark.createDataFrame([Row(x=x)])

    result = df.select(rf_standardize('x').alias('z')) \
        .select(rf_agg_stats('z').alias('z_stat')) \
        .select('z_stat.mean', 'z_stat.variance') \
        .first()

    self.assertAlmostEqual(result[0], 0.0)
    self.assertAlmostEqual(result[1], 1.0)

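# Standardization as checked above can be sketched as the usual z-score,
# (x - mean) / stddev, which by construction has mean 0 and variance 1. This is
# an assumption about rf_standardize's per-tile behavior, not its actual source.
@staticmethod
def _standardize_sketch(cells):
    return (cells - cells.mean()) / cells.std()
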
def test_mask_and_deser(self):
    # duplicates much of test_mask_bits, but adds a check that the masked
    # result deserializes correctly
    t = Tile(42 * np.ones((4, 4), 'uint16'), CellType.uint16())
    # with a variety of known values
    mask = Tile(np.array([
        [1, 1, 2720, 2720],
        [1, 6816, 6816, 2756],
        [2720, 2720, 6900, 2720],
        [2720, 6900, 6816, 1]
    ]), CellType('uint16raw'))

    df = self.spark.createDataFrame([Row(t=t, mask=mask)])

    # removes fill value 1
    mask_fill_df = df.select(rf_mask_by_bit('t', 'mask', 0, True).alias('mbb'))
    mask_fill_tile = mask_fill_df.first()['mbb']

    self.assertTrue(mask_fill_tile.cell_type.has_no_data())

    # Unsure why this fails. mask_fill_tile.cells is all 42, unmasked.
    self.assertEqual(
        mask_fill_tile.cells.mask.sum(), 4,
        f'Expected {16 - 4} data values but got the masked tile: '
        f'{mask_fill_tile}')

def test_rf_rescale(self):
    from pyspark.sql.functions import min as F_min
    from pyspark.sql.functions import max as F_max

    x1 = Tile(np.random.randint(-60, 12, (10, 10)), CellType.int8())
    x2 = Tile(np.random.randint(15, 122, (10, 10)), CellType.int8())
    df = self.spark.createDataFrame([Row(x=x1), Row(x=x2)])

    # Note there will be some clipping
    rescaled = df.select(rf_rescale('x', -20, 50).alias('x_prime'), 'x')
    result = rescaled \
        .agg(
            F_max(rf_tile_min('x_prime')),
            F_min(rf_tile_max('x_prime'))
        ).first()

    self.assertGreater(
        result[0], 0.0,
        f'Expected max tile_min to be > 0 (strictly); but it is '
        f'{rescaled.select("x", "x_prime", rf_tile_min("x_prime")).take(2)}')
    self.assertLess(
        result[1], 1.0,
        f'Expected min tile_max to be < 1 (strictly); but it is '
        f'{rescaled.select(rf_tile_max("x_prime")).take(2)}')

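# With explicit bounds, the expected behavior sketched in NumPy: shift and scale
# by the supplied min/max, then clip into [0, 1] (hence the strict inequalities
# asserted above). An assumption about rf_rescale(col, min, max), not its source.
@staticmethod
def _rescale_clipped_sketch(cells, lo, hi):
    return np.clip((cells.astype('float64') - lo) / (hi - lo), 0.0, 1.0)
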
def test_rf_local_data_and_no_data(self):
    from pyspark.sql import Row
    from pyrasterframes.rf_types import Tile

    nd = 5
    t = Tile(np.array([[1, 3, 4], [nd, 0, 3]]),
             CellType.uint8().with_no_data_value(nd))
    # note the convert is due to issue #188
    df = self.spark.createDataFrame([Row(t=t)]) \
        .withColumn('lnd', rf_convert_cell_type(rf_local_no_data('t'), 'uint8')) \
        .withColumn('ld', rf_convert_cell_type(rf_local_data('t'), 'uint8'))

    result = df.first()
    result_nd = result['lnd']
    assert_equal(result_nd.cells, t.cells.mask)

    result_d = result['ld']
    assert_equal(result_d.cells, np.invert(t.cells.mask))

def test_rf_interpret_cell_type_as(self):
    from pyspark.sql import Row
    from pyrasterframes.rf_types import Tile

    df = self.spark.createDataFrame([
        Row(t=Tile(np.array([[1, 3, 4], [5, 0, 3]]),
                   CellType.uint8().with_no_data_value(5)))
    ])
    df = df.withColumn('tile', rf_interpret_cell_type_as('t', 'uint8ud3'))  # threes become ND

    result = df.select(
        rf_tile_sum(rf_local_equal('t', lit(3))).alias('threes')).first()['threes']
    self.assertEqual(result, 2)

    result_5 = df.select(
        rf_tile_sum(rf_local_equal('t', lit(5))).alias('fives')).first()['fives']
    self.assertEqual(result_5, 0)

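# A sketch of the reinterpretation above: the raw cell values are unchanged and
# only the nodata sentinel moves (here from 5 to 3), so former nodata cells (5)
# become data and cells equal to 3 become nodata. This assumes bit-for-bit
# reinterpretation; the helper is hypothetical, not the RasterFrames API.
@staticmethod
def _interpret_nodata_sketch(cells, new_nd):
    raw = np.ma.getdata(cells)  # drop the old mask; keep raw values
    return np.ma.masked_equal(raw, new_nd)
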