def init_sc():
    global sc
    if not sc:
        conf = gps.geopyspark_conf(appName="geopyspark-example", master="local[*]")
        conf.set(key='spark.ui.enabled', value='true')
        sc = SparkContext(conf=conf)
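# A minimal usage sketch for init_sc() above (not part of the original): the
# module-level `sc` global must exist before the function runs, and `gps` /
# SparkContext are assumed to be imported as in the later snippets.
sc = None

init_sc()
print(sc.applicationId)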
class BaseTestClass(unittest.TestCase):
    if 'TRAVIS' in os.environ:
        master_str = "local[2]"
    else:
        master_str = "local[*]"

    conf = geopyspark_conf(master=master_str, appName="test")
    conf.set('spark.kryoserializer.buffer.max', value='1G')
    conf.set('spark.ui.enabled', True)

    if 'TRAVIS' in os.environ:
        conf.set(key='spark.driver.memory', value='1G')
        conf.set(key='spark.executor.memory', value='1G')

    pysc = SparkContext(conf=conf)

    dir_path = file_path("all-ones.tif")
    rdd = get(LayerType.SPATIAL, dir_path, max_tile_size=1024)
    value = rdd.to_numpy_rdd().collect()[0]

    projected_extent = value[0]
    extent = projected_extent.extent

    expected_tile = value[1].cells
    (_, rows, cols) = expected_tile.shape

    layout = TileLayout(1, 1, cols, rows)
def rdd_tms_server(catalog_path, resource_name):
    conf = gps.geopyspark_conf(master="local[*]", appName="master")
    pysc = SparkContext(conf=conf)

    print("Tiles Catalog Path")
    catalog_path = "file://" + catalog_path
    data_name = resource_name

    print("Color Setting")
    color_dict = {}
    for i in range(120000):
        color_dict[i] = int(random_color(), 16)
    cm = gps.ColorMap.build(color_dict)

    print('TMS Setting')
    tms = gps.TMS.build(source=(catalog_path, data_name), display=cm)

    print('Set up TMS server')
    tms.bind(host="0.0.0.0", requested_port=8085)
    print(tms.url_pattern)

    # Keep the process alive (sleep for a year) so the TMS endpoint stays up.
    time.sleep(365 * 24 * 60 * 60)

    print('Shutdown TMS server')
    tms.unbind()
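# rdd_tms_server() above calls random_color(), which is not defined in the
# snippet. A minimal sketch of such a helper, assuming it should return an
# RGBA hex string that int(value, 16) can parse:
import random

def random_color():
    # Random fully-opaque RGBA color, e.g. "3FA0C2FF".
    return "%02X%02X%02XFF" % (random.randint(0, 255),
                               random.randint(0, 255),
                               random.randint(0, 255))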
def _setup_local_spark(print: Callable = print, verbosity=0, master_url="local[2]",
                       app_name="openEO-GeoPySpark-Driver", additional_jar_dirs=[]):
    print("Setting up local Spark")

    if 'PYSPARK_PYTHON' not in os.environ:
        os.environ['PYSPARK_PYTHON'] = sys.executable

    _ensure_geopyspark(print=print)
    from geopyspark import geopyspark_conf
    from pyspark import SparkContext

    conf = geopyspark_conf(master=master_url, appName=app_name, additional_jar_dirs=additional_jar_dirs)
    conf.set('spark.kryoserializer.buffer.max', value='1G')
    # Only show spark progress bars for high verbosity levels
    conf.set('spark.ui.showConsoleProgress', verbosity >= 3)
    conf.set('spark.ui.enabled', True)
    # TODO: allow finetuning the config more?

    print("SparkContext.getOrCreate with {c!r}".format(c=conf.getAll()))
    context = SparkContext.getOrCreate(conf)
    return context
def setup_local_spark():
    from pyspark import find_spark_home, SparkContext

    spark_python = os.path.join(find_spark_home._find_spark_home(), 'python')
    py4j = glob(os.path.join(spark_python, 'lib', 'py4j-*.zip'))[0]
    sys.path[:0] = [spark_python, py4j]
    _log.debug('sys.path: {p!r}'.format(p=sys.path))

    if 'TRAVIS' in os.environ:
        master_str = "local[2]"
    else:
        master_str = "local[*]"

    from geopyspark import geopyspark_conf
    conf = geopyspark_conf(master=master_str, appName="openeo-geotrellis-local")
    conf.set('spark.kryoserializer.buffer.max', value='1G')
    conf.set('spark.ui.enabled', True)
    # Some options to allow attaching a Java debugger to running Spark driver
    conf.set('spark.driver.extraJavaOptions', '-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=5009')

    if 'TRAVIS' in os.environ:
        conf.set(key='spark.driver.memory', value='2G')
        conf.set(key='spark.executor.memory', value='2G')

    if 'PYSPARK_PYTHON' not in os.environ:
        os.environ['PYSPARK_PYTHON'] = sys.executable

    _log.info('Creating Spark context with config:')
    for k, v in conf.getAll():
        _log.info("Spark config: {k!r}: {v!r}".format(k=k, v=v))
    pysc = SparkContext.getOrCreate(conf)
    _log.info('Created Spark Context {s}'.format(s=pysc))
    _log.info('Spark web UI: http://localhost:{p}/'.format(p=pysc.getConf().get('spark.ui.port') or 4040))

    return pysc
def test():
    conf = gps.geopyspark_conf(master="local[*]", appName="master")
    pysc = SparkContext(conf=conf)

    layer_metadata = gps.read_layer_metadata(uri="file:///usr/local/large_scale_hydro/catalog",
                                             layer_name="demo-dem",
                                             layer_zoom=0)
    layer_extent = layer_metadata.extent
    print(layer_extent)

    poly = None
    # Creates a Polygon from Geojson
    json_path = '/usr/local/large_scale_hydro/result/polygon.geojson'
    with open(json_path) as f:
        js = json.load(f)
        features = js['features']
        if features[0]['geometry']['type'] == 'Polygon':
            polygon = features[0]['geometry']['coordinates']
            points = polygon[0]
            input_array = []
            for point in points:
                input_array.append(tuple(point))
            poly = Polygon(input_array)

    tiled_raster_layer = gps.query(uri="file:///usr/local/large_scale_hydro/catalog",
                                   layer_name="demo-dem",
                                   layer_zoom=0,
                                   query_geom=poly)
    print(tiled_raster_layer.count())
    print(tiled_raster_layer.layer_metadata.extent)
    tiled_raster_layer.save_stitched('/usr/local/large_scale_hydro/result/result_geojson.tif')
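# Illustrative shape of the polygon.geojson file that test() above expects: a
# FeatureCollection whose first feature is a Polygon (coordinates are placeholders).
example_geojson = {
    "type": "FeatureCollection",
    "features": [{
        "type": "Feature",
        "properties": {},
        "geometry": {
            "type": "Polygon",
            "coordinates": [[[102.0, 0.5], [103.0, 0.5], [103.0, 1.5],
                             [102.0, 1.5], [102.0, 0.5]]]
        }
    }]
}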
def data_search(catalog_path, json_path, dem_tif_path, dir_tif_path, acc_tif_path):
    catalog_path = "file://" + catalog_path
    conf = gps.geopyspark_conf(master="local[*]", appName="master")
    pysc = SparkContext(conf=conf)

    polys = None
    # Creates a MultiPolygon from Geojson
    with open(json_path) as f:
        js = json.load(f)
        FeatureObj = None
        if js['type'] == 'FeatureCollection':
            FeatureObj = js['features'][0]
        elif js['type'] == 'Feature':
            FeatureObj = js

        if FeatureObj['geometry']['type'] == 'MultiPolygon':
            polygons = FeatureObj['geometry']['coordinates']
            polygons_array = []
            for polygon in polygons:
                input_array = []
                for point in polygon[0]:
                    input_array.append(tuple(point))
                polygons_array.append(Polygon(input_array))
            polys = MultiPolygon(polygons_array)
        elif FeatureObj['geometry']['type'] == 'Polygon':
            polygon = FeatureObj['geometry']['coordinates']
            points = polygon[0]
            input_array = []
            for point in points:
                input_array.append(tuple(point))
            polys = Polygon(input_array)

    print("Get DEM")
    tiled_raster_layer = gps.query(uri=catalog_path, layer_name="dem", layer_zoom=0, query_geom=polys)
    print(tiled_raster_layer.count())
    print(tiled_raster_layer.layer_metadata.extent)
    tiled_raster_layer.save_stitched(dem_tif_path)

    print("Get Direction")
    tiled_raster_layer = gps.query(uri=catalog_path, layer_name="direction", layer_zoom=0, query_geom=polys)
    # tiled_raster_layer = gps.query(uri=catalog_path, layer_name="dir", layer_zoom=0, query_geom=polys)
    print(tiled_raster_layer.count())
    print(tiled_raster_layer.layer_metadata.extent)
    tiled_raster_layer.save_stitched(dir_tif_path)

    print("Get Accumulation")
    tiled_raster_layer = gps.query(uri=catalog_path, layer_name="accumulation", layer_zoom=0, query_geom=polys)
    # tiled_raster_layer = gps.query(uri=catalog_path, layer_name="acc", layer_zoom=0, query_geom=polys)
    print(tiled_raster_layer.count())
    print(tiled_raster_layer.layer_metadata.extent)
    tiled_raster_layer.save_stitched(acc_tif_path)
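# data_search() above also handles MultiPolygon geometries; an illustrative
# feature of that shape (placeholder coordinates). Note that only the exterior
# ring (polygon[0]) of each polygon is read.
example_multipolygon_feature = {
    "type": "Feature",
    "geometry": {
        "type": "MultiPolygon",
        "coordinates": [
            [[[102.0, 2.0], [103.0, 2.0], [103.0, 3.0], [102.0, 3.0], [102.0, 2.0]]],
            [[[100.0, 0.0], [101.0, 0.0], [101.0, 1.0], [100.0, 1.0], [100.0, 0.0]]]
        ]
    }
}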
def __init__(self):
    master_str = "local[*]"
    conf = geopyspark_conf(master=master_str, appName="test")
    conf.set('spark.kryoserializer.buffer.max', value='1G')
    conf.set('spark.ui.enabled', True)

    if ConfigParams().is_ci_context:
        conf.set(key='spark.driver.memory', value='2G')
        conf.set(key='spark.executor.memory', value='2G')

    self.pysc = SparkContext.getOrCreate(conf)

    self.first = np.zeros((1, 4, 4))
    self.first.fill(1)

    self.second = np.zeros((1, 4, 4))
    self.second.fill(2)

    self.extent = {'xmin': 0.0, 'ymin': 0.0, 'xmax': 4.0, 'ymax': 4.0}
    self.layout = {'layoutCols': 1, 'layoutRows': 1, 'tileCols': 4, 'tileRows': 4}

    self.now = datetime.datetime.strptime("2017-09-25T11:37:00Z", '%Y-%m-%dT%H:%M:%SZ')
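# A sketch (not from the original) of how fixtures like the ones set in
# __init__ above are typically turned into a geopyspark layer; the names below
# are local stand-ins for self.pysc, self.first, and self.extent.
cells = np.zeros((1, 4, 4))
cells.fill(1)
tile = gps.Tile.from_numpy_array(cells, no_data_value=-1.0)
extent = gps.Extent(xmin=0.0, ymin=0.0, xmax=4.0, ymax=4.0)
projected_extent = gps.ProjectedExtent(extent=extent, epsg=3857)
numpy_rdd = pysc.parallelize([(projected_extent, tile)])
raster_layer = gps.RasterLayer.from_numpy_rdd(gps.LayerType.SPATIAL, numpy_rdd)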
def _local_spark_conf() -> SparkConf:
    import geopyspark as gps

    conf = gps.geopyspark_conf(master="local[*]", appName="benchmark.py")
    conf.set('spark.yarn.keytab', "/home/bossie/Documents/VITO/vdboschj.keytab")
    conf.set('spark.yarn.principal', "vdboschj")

    return conf
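# Hypothetical use of _local_spark_conf() above:
from pyspark import SparkContext

sc = SparkContext.getOrCreate(conf=_local_spark_conf())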
def _setup_local_spark(out: TerminalReporter, verbosity=0):
    # TODO make a "spark_context" fixture instead of doing this through pytest_configure
    out.write_line("[conftest.py] Setting up local Spark")

    travis_mode = 'TRAVIS' in os.environ
    master_str = "local[2]"

    if 'PYSPARK_PYTHON' not in os.environ:
        os.environ['PYSPARK_PYTHON'] = sys.executable

    from geopyspark import geopyspark_conf
    from pyspark import SparkContext

    conf = geopyspark_conf(master=master_str, appName="OpenEO-GeoPySpark-Driver-Tests")
    conf.set('spark.kryoserializer.buffer.max', value='1G')
    conf.set(key='spark.kryo.registrator', value='geopyspark.geotools.kryo.ExpandedKryoRegistrator')
    conf.set(
        key='spark.kryo.classesToRegister',
        value='org.openeo.geotrellisaccumulo.SerializableConfiguration,'
              'ar.com.hjg.pngj.ImageInfo,'
              'ar.com.hjg.pngj.ImageLineInt,'
              'geotrellis.raster.RasterRegion$GridBoundsRasterRegion'
    )
    # Only show spark progress bars for high verbosity levels
    conf.set('spark.ui.showConsoleProgress', verbosity >= 3)

    if travis_mode:
        conf.set(key='spark.driver.memory', value='2G')
        conf.set(key='spark.executor.memory', value='2G')
        conf.set('spark.ui.enabled', False)
    else:
        conf.set('spark.ui.enabled', True)

    out.write_line("[conftest.py] SparkContext.getOrCreate with {c!r}".format(c=conf.getAll()))
    context = SparkContext.getOrCreate(conf)
    out.write_line("[conftest.py] JVM info: {d!r}".format(d={
        f: context._jvm.System.getProperty(f)
        for f in [
            "java.version", "java.vendor", "java.home",
            "java.class.version",
            # "java.class.path",
        ]
    }))

    out.write_line("[conftest.py] Validating the Spark context")
    dummy = context._jvm.org.openeo.geotrellis.OpenEOProcesses()
    answer = context.parallelize([9, 10, 11, 12]).sum()
    out.write_line("[conftest.py] " + repr((answer, dummy)))

    return context
def _setup_local_spark(out: TerminalReporter, verbosity=0):
    # TODO make a "spark_context" fixture instead of doing this through pytest_configure
    out.write_line("Setting up local Spark")

    travis_mode = 'TRAVIS' in os.environ
    master_str = "local[2]" if travis_mode else "local[*]"

    from geopyspark import geopyspark_conf
    from pyspark import SparkContext

    conf = geopyspark_conf(master=master_str, appName="OpenEO-GeoPySpark-Driver-Tests")
    conf.set('spark.kryoserializer.buffer.max', value='1G')
    # Only show spark progress bars for high verbosity levels
    conf.set('spark.ui.showConsoleProgress', verbosity >= 3)

    if travis_mode:
        conf.set(key='spark.driver.memory', value='2G')
        conf.set(key='spark.executor.memory', value='2G')
        conf.set('spark.ui.enabled', False)
    else:
        conf.set('spark.ui.enabled', True)

    out.write_line("SparkContext.getOrCreate with {c!r}".format(c=conf.getAll()))
    context = SparkContext.getOrCreate(conf)
    out.write_line("JVM info: {d!r}".format(d={
        f: context._jvm.System.getProperty(f)
        for f in [
            "java.version", "java.vendor", "java.home",
            "java.class.version",
            # "java.class.path",
        ]
    }))

    out.write_line("Validating the Spark context")
    dummy = context._jvm.org.openeo.geotrellis.OpenEOProcesses()
    answer = context.parallelize([9, 10, 11, 12]).sum()
    out.write_line(repr((answer, dummy)))

    return context
def main(data_path, area_of_interest, outfile):
    # Create the SparkContext
    conf = gps.geopyspark_conf(appName="geodatafw", master="local[*]")
    sc = SparkContext(conf=conf)

    # Create raster_layer object from Sentinel 2 data
    raster_layer = get_raster_layer(sc, data_path)

    # Tile the rasters within the layer and reproject them to Web Mercator.
    tiled_layer = raster_layer.tile_to_layout(layout=gps.GlobalLayout(), target_crs=3857)

    # Mask the tiles within the layer with the area of interest
    masked = tiled_layer.mask(geometries=area_of_interest)

    # We will now pyramid the masked TiledRasterLayer so that we can use it in a TMS server later.
    # pyramided_mask = masked.pyramid()

    # Save each layer of the pyramid locally so that it can be accessed at a later time.
    # for pyramid in pyramided_mask.levels.values():
    gps.write(uri='file://%s' % outfile, layer_name='munsmo', tiled_raster_layer=masked)
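# Hypothetical invocation of main() above; the data path and output location
# are placeholders, and the bounding box reuses the Philadelphia area of
# interest from the next snippet.
from shapely.geometry import box

main(data_path='/tmp/sentinel2_scene.tif',
     area_of_interest=box(-75.229225, 40.003686, -75.107345, 40.084375),
     outfile='/tmp/munsmo-catalog')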
def test(work_path):
    process_path = work_path + "/process"
    if not os.path.exists(process_path):
        os.makedirs(process_path)

    conf = gps.geopyspark_conf(master="local[*]", appName="master")
    pysc = SparkContext(conf=conf)

    polys = None
    # Creates a Polygon or MultiPolygon from Geojson
    json_path = '/usr/local/large_scale_hydro/result/polygon.geojson'
    with open(json_path) as f:
        js = json.load(f)
        FeatureObj = None
        if js['type'] == 'FeatureCollection':
            FeatureObj = js['features'][0]
        elif js['type'] == 'Feature':
            FeatureObj = js

        if FeatureObj['geometry']['type'] == 'MultiPolygon':
            polygons = FeatureObj['geometry']['coordinates']
            polygons_array = []
            for polygon in polygons:
                input_array = []
                for point in polygon[0]:
                    input_array.append(tuple(point))
                polygons_array.append(Polygon(input_array))
            polys = MultiPolygon(polygons_array)
        elif FeatureObj['geometry']['type'] == 'Polygon':
            polygon = FeatureObj['geometry']['coordinates']
            points = polygon[0]
            input_array = []
            for point in points:
                input_array.append(tuple(point))
            polys = Polygon(input_array)

    dem_tif_path = process_path + '/dem.tif'
    print("Get DEM")
    tiled_raster_layer = gps.query(uri="file:///usr/local/large_scale_hydro/catalog",
                                   layer_name="dem", layer_zoom=0, query_geom=polys)
    print(tiled_raster_layer.count())
    print(tiled_raster_layer.layer_metadata.extent)
    tiled_raster_layer.save_stitched(dem_tif_path)

    dir_tif_path = process_path + '/dir.tif'
    print("Get Direction")
    tiled_raster_layer = gps.query(uri="file:///usr/local/large_scale_hydro/catalog",
                                   layer_name="direction", layer_zoom=0, query_geom=polys)
    print(tiled_raster_layer.count())
    print(tiled_raster_layer.layer_metadata.extent)
    tiled_raster_layer.save_stitched(dir_tif_path)

    acc_tif_path = process_path + '/acc.tif'
    print("Get Accumulation")
    tiled_raster_layer = gps.query(uri="file:///usr/local/large_scale_hydro/catalog",
                                   layer_name="accumulation", layer_zoom=0, query_geom=polys)
    print(tiled_raster_layer.count())
    print(tiled_raster_layer.layer_metadata.extent)
    tiled_raster_layer.save_stitched(acc_tif_path)

    lake_tif_path = process_path + '/lakes.tif'
    print("Get Lakes")
    tiled_raster_layer = gps.query(uri="file:///usr/local/large_scale_hydro/catalog",
                                   layer_name="lakes", layer_zoom=0, query_geom=polys)
    print(tiled_raster_layer.count())
    print(tiled_raster_layer.layer_metadata.extent)
    tiled_raster_layer.save_stitched(lake_tif_path)
import geopyspark as gps
from pyspark import SparkContext
from shapely.geometry import box

# Create the SparkContext
conf = gps.geopyspark_conf(appName="geopyspark-example", master="local[*]")
sc = SparkContext(conf=conf)

# Read in the NLCD tif that has been saved locally.
# This tif represents the state of Pennsylvania.
raster_layer = gps.geotiff.get(layer_type=gps.LayerType.SPATIAL,
                               uri='/tmp/NLCD2011_LC_Pennsylvania.tif',
                               num_partitions=100)

# Tile the rasters within the layer and reproject them to Web Mercator.
tiled_layer = raster_layer.tile_to_layout(layout=gps.GlobalLayout(), target_crs=3857)

# Creates a Polygon that covers roughly the north-west section of Philadelphia.
# This is the region that will be masked.
area_of_interest = box(-75.229225, 40.003686, -75.107345, 40.084375)

# Mask the tiles within the layer with the area of interest
masked = tiled_layer.mask(geometries=area_of_interest)

# We will now pyramid the masked TiledRasterLayer so that we can use it in a TMS server later.
pyramided_mask = masked.pyramid()

# Save each layer of the pyramid locally so that it can be accessed at a later time.
for pyramid in pyramided_mask.levels.values():
    # The destination URI and layer name here are illustrative.
    gps.write(uri='file:///tmp/pa-nlcd-2011', layer_name='north-west-philly', tiled_raster_layer=pyramid)
new_line.pop('uri')
return new_line


def make_tiles(line):
    projected_extent = line[0]
    bands = sorted(line[1], key=lambda l: l['band'])
    array = np.array([l['data'] for l in bands])
    tile = gps.Tile.from_numpy_array(array, no_data_value=0)
    return (projected_extent, tile)


if __name__ == "__main__":
    sc = SparkContext(conf=gps.geopyspark_conf(appName="Landsat").set("spark.ui.enabled", True))

    csv_data = [{
        'uri': 's3://landsat-pds/L8/107/035/LC81070352015218LGN00/LC81070352015218LGN00_B9.TIF',
        'scene_id': 'LC81070352015218LGN00',
        'date': '2015218',
        'band': '9'
    }, {
        'uri': 's3://landsat-pds/L8/107/035/LC81070352015218LGN00/LC81070352015218LGN00_B9.TIF',
        'scene_id': 'LC81070352015218LGN00',
        'date': '2015218',
        'band': '9'
    }, {
        'uri': 's3://landsat-pds/L8/107/035/LC81070352015218LGN00/LC81070352015218LGN00_B9.TIF',
def overlay_geopyspark_conf(appName='ClimateOverlay', memory='12G'):
    conf = gps.geopyspark_conf(appName=appName)
    # 'spark.master.memory' is not a Spark setting; driver memory is presumably what was intended.
    conf.set('spark.driver.memory', memory)
    conf.set('spark.ui.enabled', True)
    sc = SparkContext(conf=conf)
    return sc
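# Hypothetical usage of overlay_geopyspark_conf() above:
sc = overlay_geopyspark_conf(appName='ClimateOverlay', memory='12G')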
import logging
from threading import Thread

import requests
from flask import Flask, request
# for strange reasons, one has to import netCDF4 before geopyspark on some systems, otherwise reading nc-files fails.
import netCDF4
import geopyspark as gps
from pyspark import SparkContext

from cuizinart_pyspark import slice
from pyspark_settings import SPARK_MASTER, CUIZINART_URL, NC_OUTPUT_PATH, CUIZINART_PYSPARK_PASSWORD

logger = logging.getLogger('pyspark')

app = Flask('cuizinart_pyspark')

conf = gps.geopyspark_conf(appName='gwf', master=SPARK_MASTER)
sc = SparkContext(conf=conf)


@app.route('/process_query', methods=['POST'])
def process_query():
    request_json = request.get_json()

    request_id = request_json['request_id']
    user_email = request_json['user_email']
    product = request_json['product']
    geojson_shape = request_json['geojson_shape']
    start_time = request_json['start_time']
    end_time = request_json['end_time']
    request_vars = request_json['request_vars']
    horizons = request_json['horizons']
    issues = request_json['issues']
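# Illustrative request body for /process_query, derived from the fields the
# handler above reads; all values are placeholders.
example_request = {
    'request_id': 'req-0001',
    'user_email': 'user@example.com',
    'product': 'some-product',
    'geojson_shape': {'type': 'Polygon', 'coordinates': [[[0, 0], [1, 0], [1, 1], [0, 0]]]},
    'start_time': '2000-01-01T00:00:00',
    'end_time': '2000-01-02T00:00:00',
    'request_vars': ['var1'],
    'horizons': [6],
    'issues': ['00:00'],
}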
import datetime

import geopyspark as gps
import numpy as np
from pyspark import SparkContext
from shapely.geometry import MultiPolygon, box

conf = gps.geopyspark_conf(master="local[*]", appName="layers")
pysc = SparkContext(conf=conf)

uri = "file:/data/workspace/geotrellis-landsat-tutorial/data/1k_tiles/"
layer_name = "landsat1K"

metadata = gps.read_layer_metadata(uri=uri, layer_name=layer_name, layer_zoom=0)
print(metadata)

# Get list of tiles
print(metadata.bounds)

# Read the first tile
tile = gps.read_value(uri=uri,
                      layer_name=layer_name,
                      layer_zoom=0,
                      col=metadata.bounds.minKey.col,
                      row=metadata.bounds.minKey.row)
print(tile)

# Read the layer
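# The snippet above stops at "# Read the layer"; a minimal sketch of that step,
# assuming gps.query with no query geometry returns the whole layer:
layer = gps.query(uri=uri, layer_name=layer_name, layer_zoom=0)
print(layer.count())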
import geopyspark
import numpy
from pyspark.sql import SparkSession
from pyrasterframes import *
from pyrasterframes.rasterfunctions import *

conf = geopyspark.geopyspark_conf(appName="POC")
session = SparkSession.builder.config(conf=conf).getOrCreate().withRasterFrames()

uri = "file:/data/workspace/geotrellis-landsat-tutorial/data/1k_tiles/"
layer_name = "landsat1K"

layer = geopyspark.query(uri=uri, layer_name=layer_name, layer_zoom=0)
rf = layer.to_rasterframe(3)
rf.show(2)

# Show CRS
rf.tileLayerMetadata()['crs']

# Convert Tile data to array
rf.select(tileToDoubleArray("tile_1")).show(10, 80)

# Global aggregation statistics
rf.agg(aggNoDataCells("tile_1"), aggDataCells("tile_1"), aggMean("tile_1")).show(5, False)

# Tile aggregation statistics
rf.select(tileMean("tile_1"), tileMin("tile_1"), tileMax("tile_1")).show(5)