def CreateLoadArray(self, tempRastName, attribute_name, rasterArrayType):
    """
    Create the one-dimensional loading array for a raster.

    Input:
        tempRastName = The name of the array to create
        attribute_name = The attribute declaration(s) to create on
        rasterArrayType = The dimensionality of the raster (<= 2 for 2D, 3 for 3D)
    Output: None
    Raises: ValueError if rasterArrayType is not a supported shape
    """
    import scidb
    sdb = scidb.iquery()

    # 2D rasters get (y, x) dimension attributes; 3D rasters add a z (band) attribute.
    if rasterArrayType <= 2:
        theQuery = "create array %s <y1:int64, x1:int64, %s> [xy=0:*,?,?]" % (
            tempRastName, attribute_name)
    elif rasterArrayType == 3:
        theQuery = "create array %s <z1:int64, y1:int64, x1:int64, %s> [xy=0:*,?,?]" % (
            tempRastName, attribute_name)
    else:
        # Previously this fell through and raised a NameError on theQuery below.
        raise ValueError("Unsupported rasterArrayType: %s" % rasterArrayType)

    try:
        sdb.query(theQuery)
    except Exception:
        # Array likely exists from a previous run: remove it and retry once.
        sdb.query("remove(%s)" % tempRastName)
        sdb.query(theQuery)
def GetSciDBInstances(self):
    """
    Count the running SciDB instances and store the result on the object.

    Input: None
    Output: None (sets self.SciDB_Instances)
    """
    from scidb import iquery
    connection = iquery()
    instanceListing = connection.queryAFL("list('instances')")
    # The first line of the listing is a header, so it is excluded from the count.
    self.SciDB_Instances = len(instanceListing.splitlines()) - 1
def GetNumberofSciDBInstances():
    """
    Query SciDB for its running instances.

    Input: None
    Output: A list of instance indices, i.e. range(0, instance count)
    """
    import scidb
    connection = scidb.iquery()
    listing = connection.queryAFL("list('instances')")
    # Subtract one for the header line of the instance listing.
    instanceCount = len(listing.splitlines()) - 1
    return list(range(instanceCount))
def __init__(self, boundaryPath, rasterPath, SciDBArray):
    """
    Initialize a ZonalStats object.

    Input:
        boundaryPath = Path of the boundary vector dataset
        rasterPath = Path of the raster dataset
        SciDBArray = Name of the SciDB array
    Output: An instance of the ZonalStats class
    """
    import scidb

    # Open the SciDB connection and discover the available instances.
    self.sdb = scidb.iquery()
    self.__SciDBInstances()

    self.vectorPath = boundaryPath
    self.geoTiffPath = rasterPath
    self.SciDBArrayName = SciDBArray
def CreateDestinationArray(self, rasterArrayName, height, width, chunk, overlap):
    """
    Function creates the final destination array. Updated to handle 3D arrays.

    Input:
        rasterArrayName = The name of the array to create
        height = The height of the array to create
        width = The width of the array to create
        chunk = The chunk to create with
        overlap = The overlap of the array
    Output: None
    """
    import scidb
    sdb = scidb.iquery()

    if self.RasterArrayShape <= 2:
        myQuery = "create array %s <%s> [y=0:%s,%s,%s; x=0:%s,%s,%s]" % (
            rasterArrayName, self.AttributeString,
            height - 1, chunk, overlap,
            width - 1, chunk, overlap)
    else:
        # 3D: the band dimension uses chunk size 1 with no overlap; the
        # spatial dimensions take the requested chunk size and overlap.
        # NOTE: the previous format string misaligned its arguments —
        # height-1 filled the band-overlap slot and shifted the y/x bounds —
        # which is inconsistent with the sibling CreateDestinationArray.
        myQuery = "create array %s <%s> [band=0:%s,1,0; y=0:%s,%s,%s; x=0:%s,%s,%s]" % (
            rasterArrayName, self.AttributeString,
            self.numbands - 1,
            height - 1, chunk, overlap,
            width - 1, chunk, overlap)

    try:
        sdb.query(myQuery)
    except Exception:
        # Remove the array if it already exists then rerun the query
        print("***** Array %s already exists. Removing ****" % rasterArrayName)
        sdb.query("remove(%s)" % rasterArrayName)
        sdb.query(myQuery)

    del sdb
def CreateDestinationArray(self, rasterArrayName, height, width, chunk):
    """
    Function creates the final destination array. Updated to handle 3D arrays.

    Input:
        rasterArrayName = Name of the destination array
        height = The height of the raster
        width = The width of the raster
        chunk = The size of the chunks
    Output: None
    """
    import scidb
    sdb = scidb.iquery()

    # 2D arrays are [y, x]; 3D arrays add a band dimension with chunk size 1.
    # No overlap on any dimension in this variant.
    if self.RasterArrayShape <= 2:
        myQuery = "create array %s <%s> [y=0:%s,%s,0; x=0:%s,%s,0]" % (
            rasterArrayName, self.AttributeString,
            height - 1, chunk, width - 1, chunk)
    else:
        myQuery = "create array %s <%s> [band=0:%s,1,0; y=0:%s,%s,0; x=0:%s,%s,0]" % (
            rasterArrayName, self.AttributeString, self.numbands - 1,
            height - 1, chunk, width - 1, chunk)

    # Attempt to create array, removing the previous one if it exists.
    # (Leftover "here2"/"here3" debug prints removed.)
    try:
        sdb.query(myQuery)
    except Exception:
        print("***** Array %s already exists. Removing ****" % rasterArrayName)
        sdb.query("remove(%s)" % rasterArrayName)
        sdb.query(myQuery)

    del sdb
def GDALReader(inParams):
    """
    This is the main worker function.
    Split up Loading and Redimensioning. Only Loading is multiprocessing

    Input:
        inParams = A tuple or list containing the following:
            theMetadata = Metadata for the reading
            theInstance = Instance to read from
            theRasterPath = Path to the raster to read
            theSciDBOutPath = Out path for SciDB processing
            theSciDBLoadPath = Load path for SciDB processing
            bandIndex = Index of the band to process on
    Output:
        A tuple in the following format:
        (metadata for the raster, write time for the raster, load time for the raster)
    """
    # Unpack the worker parameters (passed as a single sequence so this
    # function can be used with pool.map-style dispatch).
    theMetadata = inParams[0]
    theInstance = inParams[1]
    theRasterPath = inParams[2]
    theSciDBOutPath = inParams[3]
    theSciDBLoadPath = inParams[4]
    bandIndex = inParams[5]

    from scidb import iquery, Statements
    sdb = iquery()
    sdb_statements = Statements(sdb)

    # Temporary 1D load array and its binary file paths, unique per work unit.
    tempArray = "temprast_%s" % (theMetadata['version'])
    rasterBinaryFilePath = "%s/%s.sdbbin" % (theSciDBOutPath, tempArray)
    rasterBinaryLoadPath = "%s/%s.sdbbin" % (theSciDBLoadPath, tempArray)
    print("xoffset: %s, yOffSet: %s, xWindow: %s, yWindow: %s " %
          (theMetadata['xOffSet'], theMetadata['yOffSet'],
           theMetadata['xWindow'], theMetadata['yWindow']))
    raster = gdal.Open(theRasterPath, GA_ReadOnly)

    if bandIndex:
        # This code is for multibanded arrays, with z (band) dimension.
        print("**** Reading band %s" % bandIndex)
        band = raster.GetRasterBand(bandIndex)
        array = band.ReadAsArray(xoff=theMetadata['xOffSet'], yoff=theMetadata['yOffSet'],
                                 win_xsize=theMetadata['xWindow'], win_ysize=theMetadata['yWindow'])
        # Rebind the paths and load-array name to band-specific variants.
        # NOTE(review): rasterBinaryFilePath uses the pre-rebind tempArray value
        # here, while tempArray itself is renamed afterwards — order matters.
        rasterBinaryFilePath = "%s/band%s_%s.sdbbin" % (theSciDBOutPath, bandIndex, tempArray)
        rasterBinaryLoadPath = "%s/band%s_%s.sdbbin" % (theSciDBLoadPath, bandIndex, tempArray)
        tempArray = "temprast_band%s_%s" % (bandIndex, theMetadata['version'])
    else:
        array = raster.ReadAsArray(xoff=theMetadata['xOffSet'], yoff=theMetadata['yOffSet'],
                                   xsize=theMetadata['xWindow'], ysize=theMetadata['yWindow'])

    # Time the array write
    start = timeit.default_timer()
    WriteArray(array, rasterBinaryFilePath, theMetadata['array_type'],
               theMetadata['attribute'], bandIndex)
    stop = timeit.default_timer()
    writeTime = stop - start

    # Process depending on array type: suffix each attribute name with "1"
    # to match the load-array attribute naming used by CreateLoadArray.
    if theMetadata['array_type'] == 2:
        items = [
            "%s:%s" % (attribute.split(":")[0].strip() + "1", attribute.split(":")[1].strip())
            for attribute in theMetadata['attribute'].split(",")
        ]
        pseudoAttributes = ", ".join(items)
    else:
        pseudoAttributes = "%s:%s" % (
            theMetadata['attribute'].split(":")[0].strip() + "1",
            theMetadata['attribute'].split(":")[1].strip())

    # presumably so the SciDB server process can read the binary file — TODO confirm
    os.chmod(rasterBinaryFilePath, 0o755)

    # Support multiple attributes or 2D and 3D arrays
    sdb_statements.CreateLoadArray(tempArray, theMetadata['attribute'], theMetadata['array_type'])

    start = timeit.default_timer()
    if sdb_statements.LoadOneDimensionalArray(theInstance, tempArray, pseudoAttributes,
                                              theMetadata['array_type'], rasterBinaryLoadPath):
        stop = timeit.default_timer()
        loadTime = stop - start
        # Extrapolate total load time across all loops (minutes).
        dataLoadingTime = ((writeTime + loadTime) * theMetadata["loops"]) / 60
        if theMetadata['version'] == 0:
            print(
                "Estimated time for loading in minutes %s: WriteTime: %s, LoadTime: %s" %
                (dataLoadingTime, writeTime, loadTime))

        # Clean up
        gc.collect()
        RedimensionAndInsertArray(sdb, tempArray, theMetadata['scidbArray'],
                                  theMetadata['array_type'],
                                  theMetadata['xOffSet'], theMetadata['yOffSet'])
        return theMetadata['version'], writeTime, loadTime
    else:
        print("Error Loading")
        return theMetadata['version'], -999, -999
# NOTE(review): this `return parser` is the tail of a function whose
# definition begins before this chunk — left untouched.
return parser


if __name__ == '__main__':
    """
    Entry point for SciDB_analysis
    This file contains the functions used for performing spatial analyses in SciDB
    """
    config = configparser.ConfigParser()
    config.read("config.ini")

    def parse(s):
        # Read a JSON-encoded value from the [main] section of config.ini.
        return json.loads(config.get("main", s))

    args = argument_parser().parse_args()

    sdb = iquery()
    query = sdb.queryAFL("list('instances')")
    # The first line of the instance listing is a header, so it is not counted.
    SciDBInstances = len(query.splitlines()) - 1

    runs = parse("runs")
    filePath = parse("filePath")
    rasterStatsCSVBase = parse("rasterStatsCSVBase")

    # The "overlap" subcommand takes an extra positional flag; all other
    # subcommands receive only the config.
    if args.command == "overlap":
        datasets = args.func(config, 'overlap')
    else:
        datasets = args.func(config)

    timings = OrderedDict()
    for d in datasets:
        print(d)
def ParallelLoad(rasterReadingMetadata):
    """
    This function is designed to load all sizes of arrays
    We are using a couple of custom functions to break the dataset into
    smaller pieces for repetitive parallel writing / loading and then a
    single redimension store

    You can improve the performance by setting a high maxPixel threshold value.
    maxPixel = Number of pixels to read/write/load per loop. Make sure to
    consider the number of SciDB processes when setting maxPixel

    Input:
        rasterReadingMetadata = The raster data
    Output: None
    """
    from scidb import iquery, Statements
    import timeit

    numProcesses = len(rasterReadingMetadata)
    sdb = iquery()
    sdb_statements = Statements(sdb)

    try:
        # 5,000,000 pixels per read/write/load loop — the maxPixel threshold
        # described in the docstring above.
        loadLoops = ArraySplicerLogic(rasterReadingMetadata[0]['width'],
                                      rasterReadingMetadata[0]['height'], 5000000)
        # Attribute renamed "name:type" -> "name_1:type" for the 1D load array.
        loadAttribute = "%s_1:%s" % (
            rasterReadingMetadata[0]['attribute'].split(":")[0],
            rasterReadingMetadata[0]['attribute'].split(":")[1])
        nodeLoopData = AdjustMetaData(loadLoops, rasterReadingMetadata)

        start = timeit.default_timer()
        for l, nodeLoopIteration in enumerate(
                np.array_split(list(nodeLoopData.items()), loadLoops)):
            # Have to initiate the pool for each loop
            pool = mp.Pool(numProcesses)
            print("Loading %s of %s" % (l + 1, loadLoops))

            # Create the load array
            sdb_statements.CreateLoadArray(
                "LoadArray", loadAttribute,
                rasterReadingMetadata[0]['array_shape'])
            pool.imap(Read_Write_Raster, (n for n in nodeLoopIteration))
            pool.close()
            pool.join()

            # Load the one dimension array and insert redimension
            startLoad = timeit.default_timer()
            sdb_statements.LoadOneDimensionalArray(-1, "LoadArray", loadAttribute,
                                                   1, 'pdataset.scidb')

            startRedimension = timeit.default_timer()
            sdb_statements.InsertRedimension(
                "LoadArray", rasterReadingMetadata[1]["destination_array"],
                oldvalue=loadAttribute.split(":")[0], newvalue='value')

            # Drop the temporary load array and prune old versions of the
            # destination array created by the repeated inserts.
            sdb.query("remove(LoadArray)")
            RemoveArrayVersions(sdb, rasterReadingMetadata[1]["destination_array"])

            stop = timeit.default_timer()
            if l == 0:
                # Extrapolate total time from the first loop (minutes).
                print(
                    "Estimated time for loading the dataset in minutes %s: WriteTime: %s seconds, LoadTime: %s "
                    "seconds, RedimensionTime: %s seconds" %
                    ((stop - start) * loadLoops / 60, startLoad - start,
                     startRedimension - startLoad, stop - startRedimension))
    except Exception as e:
        print(e)
        print("Error")
def ParallelLoadByChunk(rasterReadingData):
    """
    This function will do parallel loading that supports fast redimensioning

    Input:
        rasterReadingData = The raster data (mapping of chunk keys to metadata)
    Output: None
    """
    from scidb import iquery, Statements
    from itertools import cycle, chain
    from collections import Counter
    import timeit

    sdb = iquery()
    sdb_statements = Statements(sdb)
    query = sdb.queryAFL("list('instances')")
    # The first line of the listing is a header, so it is not counted.
    scidbInstances = len(query.splitlines()) - 1

    # Cycle through the instances with the given data
    for r, node in zip(rasterReadingData, cycle(range(scidbInstances))):
        rasterReadingData[r]["node"] = node

    # Counter dictionary which reports back how many times node x occurred.
    # We are just interested in node 0
    numberofNodeLoops = Counter(rasterReadingData[k]["node"] for k in rasterReadingData)
    loadLoops = numberofNodeLoops[0]

    aKey = list(rasterReadingData.keys())[0]
    # Attribute renamed "name:type" -> "name_1:type" for the 1D load array.
    loadAttribute = "%s_1:%s" % (rasterReadingData[aKey]['attribute'].split(":")[0],
                                 rasterReadingData[aKey]['attribute'].split(":")[1])

    try:
        start = timeit.default_timer()
        for l, nodeLoopIteration in enumerate(
                np.array_split(list(rasterReadingData.items()), loadLoops)):
            # Create load array
            pool = mp.Pool(scidbInstances)
            print("Loading %s of %s" % (l, loadLoops - 1))

            sdb_statements.CreateLoadArray(
                "LoadArray", loadAttribute,
                int(nodeLoopIteration[0][1]['array_shape']))
            pool.imap(Read_Write_Raster, (n for n in nodeLoopIteration))
            pool.close()
            pool.join()

            # Load the one dimension array and insert redimension
            startLoad = timeit.default_timer()
            sdb_statements.LoadOneDimensionalArray(-1, "LoadArray", loadAttribute,
                                                   1, 'pdataset.scidb')

            startRedimension = timeit.default_timer()
            sdb_statements.InsertRedimension(
                "LoadArray", nodeLoopIteration[0][1]["destination_array"],
                oldvalue=loadAttribute.split(":")[0], newvalue='value')

            sdb.query("remove(LoadArray)")
            RemoveArrayVersions(sdb, nodeLoopIteration[0][1]["destination_array"])

            stop = timeit.default_timer()
            if l == 0:
                # Extrapolate total time from the first loop (minutes).
                print(
                    "Estimated time for loading the dataset in minutes %s: LoadTime: "
                    "%s seconds, RedimensionTime: "
                    "%s seconds" %
                    ((stop - start) * loadLoops / 60,
                     startRedimension - startLoad, stop - startRedimension))
    except Exception as e:
        # The bare except here previously swallowed every failure (including
        # KeyboardInterrupt) with no diagnostics; report the cause.
        print(e)
        print("Something went wrong")