def parse_arguments(self, request):
    # Parse input arguments
    self.log.debug("Parsing arguments")
    try:
        ds = request.get_dataset()[0]
    except:
        raise NexusProcessingException(reason="'ds' argument is required", code=400)

    try:
        longitude = float(request.get_decimal_arg("longitude", default=None))
    except:
        raise NexusProcessingException(reason="'longitude' argument is required", code=400)

    try:
        latitude = float(request.get_decimal_arg("latitude", default=None))
    except:
        raise NexusProcessingException(reason="'latitude' argument is required", code=400)

    search_datetime = request.get_datetime_arg('date', default=None)
    day_of_year = request.get_int_arg('day', default=None)
    if (search_datetime is not None and day_of_year is not None) \
            or (search_datetime is None and day_of_year is None):
        raise NexusProcessingException(
            reason="Either the 'day' or the 'date' argument is required, but not both.",
            code=400)

    if search_datetime is not None:
        day_of_year = search_datetime.timetuple().tm_yday

    return_all = request.get_boolean_arg("allInTile", default=True)

    return ds, longitude, latitude, day_of_year, return_all

def parse_arguments(self, request):
    # Parse input arguments
    self.log.debug("Parsing arguments")
    try:
        ds = request.get_dataset()[0]
    except:
        raise NexusProcessingException(reason="'ds' argument is required", code=400)

    try:
        bounding_polygon = box(request.get_min_lon(), request.get_min_lat(),
                               request.get_max_lon(), request.get_max_lat())
    except:
        raise NexusProcessingException(
            reason="'minLon', 'minLat', 'maxLon', and 'maxLat' arguments are required.",
            code=400)

    try:
        start_time = request.get_start_datetime()
    except:
        raise NexusProcessingException(
            reason="'startTime' argument is required. Can be int value milliseconds from epoch or string format YYYY-MM-DDTHH:mm:ssZ",
            code=400)
    try:
        end_time = request.get_end_datetime()
    except:
        raise NexusProcessingException(
            reason="'endTime' argument is required. Can be int value milliseconds from epoch or string format YYYY-MM-DDTHH:mm:ssZ",
            code=400)

    start_seconds_from_epoch = long((start_time - EPOCH).total_seconds())
    end_seconds_from_epoch = long((end_time - EPOCH).total_seconds())

    return ds, bounding_polygon, start_seconds_from_epoch, end_seconds_from_epoch

def parse_arguments(self, request):
    # Parse input arguments
    self.log.debug("Parsing arguments")

    try:
        bounding_polygon = request.get_bounding_polygon()
    except:
        try:
            minLat = request.get_min_lat()
            maxLat = request.get_max_lat()
            minLon = request.get_min_lon()
            maxLon = request.get_max_lon()
            bounding_polygon = Polygon([
                (minLon, minLat),  # (west, south)
                (maxLon, minLat),  # (east, south)
                (maxLon, maxLat),  # (east, north)
                (minLon, maxLat),  # (west, north)
                (minLon, minLat)   # (west, south)
            ])
        except:
            raise NexusProcessingException(
                reason="'b' argument or 'minLon', 'minLat', 'maxLon', and 'maxLat' arguments are required. "
                       "If 'b' is used, it must be comma-delimited float formatted as Minimum (Western) Longitude, "
                       "Minimum (Southern) Latitude, Maximum (Eastern) Longitude, Maximum (Northern) Latitude",
                code=400)

    dataset = request.get_argument('dataset', None)
    if dataset is None:
        dataset = request.get_argument('ds1', None)
    if dataset is None:
        raise NexusProcessingException(reason="'dataset' or 'ds1' argument is required", code=400)

    climatology = request.get_argument('climatology', None)
    if climatology is None:
        climatology = request.get_argument('ds2', None)
    if climatology is None:
        raise NexusProcessingException(reason="'climatology' or 'ds2' argument is required", code=400)

    try:
        start_time = request.get_start_datetime()
    except:
        raise NexusProcessingException(
            reason="'startTime' argument is required. Can be int value seconds from epoch or string format YYYY-MM-DDTHH:mm:ssZ",
            code=400)
    try:
        end_time = request.get_end_datetime()
    except:
        raise NexusProcessingException(
            reason="'endTime' argument is required. Can be int value seconds from epoch or string format YYYY-MM-DDTHH:mm:ssZ",
            code=400)

    start_seconds_from_epoch = long((start_time - EPOCH).total_seconds())
    end_seconds_from_epoch = long((end_time - EPOCH).total_seconds())

    plot = request.get_boolean_arg("plot", default=False)

    return bounding_polygon, dataset, climatology, start_time, start_seconds_from_epoch, \
           end_time, end_seconds_from_epoch, plot

def calc(self, computeOptions, **args):
    tiles = self._get_tile_service().get_tiles_bounded_by_box(computeOptions.get_min_lat(),
                                                              computeOptions.get_max_lat(),
                                                              computeOptions.get_min_lon(),
                                                              computeOptions.get_max_lon(),
                                                              computeOptions.get_dataset()[0],
                                                              computeOptions.get_start_time(),
                                                              computeOptions.get_end_time())

    if len(tiles) == 0:
        raise NexusProcessingException.NoDataException(reason="No data found for selected timeframe")

    maxprocesses = int(self.algorithm_config.get("multiprocessing", "maxprocesses"))

    results = []
    if maxprocesses == 1:
        calculator = LongitudeHofMoellerCalculator()
        for x, tile in enumerate(tiles):
            result = calculator.longitude_time_hofmoeller_stats(tile, x)
            results.append(result)
    else:
        manager = Manager()
        work_queue = manager.Queue()
        done_queue = manager.Queue()
        for x, tile in enumerate(tiles):
            work_queue.put(('longitude_time_hofmoeller_stats', tile, x))
        [work_queue.put(SENTINEL) for _ in range(0, maxprocesses)]

        # Start new processes to handle the work
        pool = Pool(maxprocesses)
        [pool.apply_async(pool_worker, (LONGITUDE, work_queue, done_queue)) for _ in range(0, maxprocesses)]
        pool.close()

        # Collect the results
        for x, tile in enumerate(tiles):
            result = done_queue.get()
            try:
                error_str = result['error']
                logger.error(error_str)
                raise NexusProcessingException(reason="Error calculating longitude_time_hofmoeller_stats.")
            except KeyError:
                pass

            results.append(result)

        pool.terminate()
        manager.shutdown()

    results = sorted(results, key=lambda entry: entry["time"])

    results = self.applyDeseasonToHofMoeller(results, pivot="lons")

    result = HoffMoellerResults(results=results, computeOptions=computeOptions, type=HoffMoellerResults.LONGITUDE)
    return result

def do_get(self, request):
    instance = self.__clazz.instance(algorithm_config=self.__algorithm_config, sc=self.__sc)
    results = instance.calc(request)

    try:
        self.set_status(results.status_code)
    except AttributeError:
        pass

    if request.get_content_type() == ContentTypes.JSON:
        self.set_header("Content-Type", "application/json")
        try:
            self.write(results.toJson())
        except AttributeError:
            traceback.print_exc(file=sys.stdout)
            self.write(json.dumps(results, indent=4))
    elif request.get_content_type() == ContentTypes.PNG:
        self.set_header("Content-Type", "image/png")
        try:
            self.write(results.toImage())
        except AttributeError:
            traceback.print_exc(file=sys.stdout)
            raise NexusProcessingException(reason="Unable to convert results to an Image.")
    elif request.get_content_type() == ContentTypes.CSV:
        self.set_header("Content-Type", "text/csv")
        self.set_header("Content-Disposition", "filename=\"%s\"" % request.get_argument('filename', "download.csv"))
        try:
            self.write(results.toCSV())
        except:
            traceback.print_exc(file=sys.stdout)
            raise NexusProcessingException(reason="Unable to convert results to CSV.")
    elif request.get_content_type() == ContentTypes.NETCDF:
        self.set_header("Content-Type", "application/x-netcdf")
        self.set_header("Content-Disposition", "filename=\"%s\"" % request.get_argument('filename', "download.nc"))
        try:
            self.write(results.toNetCDF())
        except:
            traceback.print_exc(file=sys.stdout)
            raise NexusProcessingException(reason="Unable to convert results to NetCDF.")
    elif request.get_content_type() == ContentTypes.ZIP:
        self.set_header("Content-Type", "application/zip")
        self.set_header("Content-Disposition", "filename=\"%s\"" % request.get_argument('filename', "download.zip"))
        try:
            self.write(results.toZip())
        except:
            traceback.print_exc(file=sys.stdout)
            raise NexusProcessingException(reason="Unable to convert results to Zip.")

    return results

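# --- Illustrative sketch (hypothetical class, not part of the original module) ---
# do_get above only requires the result object to expose a serializer for each
# requested content type (toJson, toImage, toCSV, toNetCDF, toZip) plus an
# optional status_code attribute. A minimal stand-in honoring that implicit
# contract might look like this (assumes json is imported, as in do_get):
class ExampleJsonOnlyResult(object):
    status_code = 200

    def __init__(self, payload):
        self.payload = payload

    def toJson(self):
        # Only JSON is supported here; other content types fall through to the
        # AttributeError handlers in do_get above.
        return json.dumps(self.payload, indent=4)
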
def parse_arguments(self, request):
    # Parse input arguments
    self.log.debug("Parsing arguments")

    source_name = request.get_argument('source', None)
    if source_name is None or source_name.strip() == '':
        raise NexusProcessingException(reason="'source' argument is required", code=400)

    parameter_s = request.get_argument('parameter', None)
    if parameter_s not in ['sst', 'sss', 'wind', None]:
        raise NexusProcessingException(
            reason="Parameter %s not supported. Must be one of 'sst', 'sss', 'wind'." % parameter_s, code=400)

    try:
        start_time = request.get_start_datetime()
        start_time = start_time.strftime("%Y-%m-%dT%H:%M:%SZ")
    except:
        raise NexusProcessingException(
            reason="'startTime' argument is required. Can be int value seconds from epoch or string format YYYY-MM-DDTHH:mm:ssZ",
            code=400)
    try:
        end_time = request.get_end_datetime()
        end_time = end_time.strftime("%Y-%m-%dT%H:%M:%SZ")
    except:
        raise NexusProcessingException(
            reason="'endTime' argument is required. Can be int value seconds from epoch or string format YYYY-MM-DDTHH:mm:ssZ",
            code=400)

    if start_time > end_time:
        raise NexusProcessingException(
            reason="The starting time must be before the ending time. Received startTime: %s, endTime: %s" % (
                request.get_start_datetime().strftime(ISO_8601), request.get_end_datetime().strftime(ISO_8601)),
            code=400)

    try:
        bounding_polygon = request.get_bounding_polygon()
    except:
        raise NexusProcessingException(
            reason="'b' argument is required. Must be comma-delimited float formatted as Minimum (Western) Longitude, "
                   "Minimum (Southern) Latitude, Maximum (Eastern) Longitude, Maximum (Northern) Latitude",
            code=400)

    depth_min = request.get_decimal_arg('depthMin', default=None)
    depth_max = request.get_decimal_arg('depthMax', default=None)

    if depth_min is not None and depth_max is not None and depth_min >= depth_max:
        raise NexusProcessingException(reason="Depth Min should be less than Depth Max", code=400)

    platforms = request.get_argument('platforms', None)
    if platforms is not None:
        try:
            p_validation = platforms.split(',')
            p_validation = [int(p) for p in p_validation]
            del p_validation
        except:
            raise NexusProcessingException(reason="platforms must be a comma-delimited list of integers", code=400)

    return source_name, parameter_s, start_time, end_time, bounding_polygon, depth_min, depth_max, platforms

def calc(self, computeOptions, **args):
    nexus_tiles_spark = [(tile.tile_id, x,
                          computeOptions.get_min_lat(), computeOptions.get_max_lat(),
                          computeOptions.get_min_lon(), computeOptions.get_max_lon())
                         for x, tile in enumerate(
                             self._tile_service.find_tiles_in_box(computeOptions.get_min_lat(),
                                                                  computeOptions.get_max_lat(),
                                                                  computeOptions.get_min_lon(),
                                                                  computeOptions.get_max_lon(),
                                                                  computeOptions.get_dataset()[0],
                                                                  computeOptions.get_start_time(),
                                                                  computeOptions.get_end_time(),
                                                                  fetch_data=False))]

    if len(nexus_tiles_spark) == 0:
        raise NexusProcessingException.NoDataException(reason="No data found for selected timeframe")

    # Parallelize list of tile ids
    rdd = self._sc.parallelize(nexus_tiles_spark, determine_parllelism(len(nexus_tiles_spark)))
    results = rdd.map(LongitudeHofMoellerCalculator.longitude_time_hofmoeller_stats).collect()
    results = filter(None, results)
    results = sorted(results, key=lambda entry: entry["time"])

    results = self.applyDeseasonToHofMoeller(results, pivot="lons")

    result = HoffMoellerResults(results=results, computeOptions=computeOptions, type=HoffMoellerResults.LONGITUDE)
    return result

def render(self, tornado_handler, result):
    tornado_handler.set_header("Content-Type", "application/x-netcdf")
    tornado_handler.set_header("Content-Disposition",
                               "filename=\"%s\"" % self._request.get_argument('filename', "download.nc"))

    try:
        tornado_handler.write(result.toNetCDF())
        tornado_handler.finish()
    except:
        traceback.print_exc(file=sys.stdout)
        raise NexusProcessingException(reason="Unable to convert results to NetCDF.")

def calc(self, request, **args):
    bounding_polygon, dataset, climatology, start_time, start_seconds_from_epoch, \
        end_time, end_seconds_from_epoch, plot = self.parse_arguments(request)

    self.log.debug("Querying for tiles in search domain")
    # Get tile ids in box
    tile_ids = [tile.tile_id for tile in
                self._get_tile_service().find_tiles_in_polygon(bounding_polygon, dataset,
                                                               start_seconds_from_epoch, end_seconds_from_epoch,
                                                               fetch_data=False, fl='id',
                                                               sort=['tile_min_time_dt asc', 'tile_min_lon asc',
                                                                     'tile_min_lat asc'],
                                                               rows=5000)]

    # Call Spark anomalies driver
    try:
        spark_result = spark_anomalies_driver(self._tile_service_factory, tile_ids,
                                              wkt.dumps(bounding_polygon),
                                              dataset, climatology,
                                              sc=self._sc)
    except Exception as e:
        self.log.exception(e)
        raise NexusProcessingException(
            reason="An unknown error occurred while computing average differences",
            code=500)

    average_and_std_by_day = spark_result

    min_lon, min_lat, max_lon, max_lat = bounding_polygon.bounds

    result = DDAResult(
        results=[[{'time': dayms, 'mean': avg_std[0], 'std': avg_std[1], 'ds': 0}]
                 for dayms, avg_std in average_and_std_by_day],
        stats={},
        meta=self.get_meta(dataset),
        computeOptions=None,
        minLat=min_lat, maxLat=max_lat,
        minLon=min_lon, maxLon=max_lon,
        ds=dataset,
        startTime=start_seconds_from_epoch,
        endTime=end_seconds_from_epoch)

    result.meta()['climatology'] = climatology
    return result

def render(self, tornado_handler, result):
    tornado_handler.set_header("Content-Type", "image/png")

    try:
        tornado_handler.write(result.toImage())
        tornado_handler.finish()
    except AttributeError:
        traceback.print_exc(file=sys.stdout)
        raise NexusProcessingException(reason="Unable to convert results to an Image.")

def parse_arguments(self, request):
    # Parse input arguments
    self.log.debug("Parsing arguments")

    try:
        ds = request.get_dataset()
        if type(ds) == list or type(ds) == tuple:
            ds = next(iter(ds))
    except:
        raise NexusProcessingException(reason="'ds' argument is required. Must be a string", code=400)

    # Do not allow time series on Climatology
    if 'CLIM' in ds:
        raise NexusProcessingException(
            reason="Cannot compute Latitude/Longitude Time Average plot on a climatology", code=400)

    west, south, east, north = request.get_bounding_box()
    bounding_polygon = shapely.geometry.Polygon(
        [(west, south), (east, south), (east, north), (west, north), (west, south)])

    start_time = request.get_start_datetime()
    end_time = request.get_end_datetime()
    if start_time > end_time:
        raise NexusProcessingException(
            reason="The starting time must be before the ending time. Received startTime: %s, endTime: %s" % (
                request.get_start_datetime().strftime(ISO_8601), request.get_end_datetime().strftime(ISO_8601)),
            code=400)

    nparts_requested = request.get_nparts()

    start_seconds_from_epoch = long((start_time - EPOCH).total_seconds())
    end_seconds_from_epoch = long((end_time - EPOCH).total_seconds())

    return ds, bounding_polygon, start_seconds_from_epoch, end_seconds_from_epoch, nparts_requested

def wrapped(*args, **kwargs1):
    try:
        with SparkHandler.SparkJobContext(self.spark_job_stack) as job_context:
            # TODO Pool and Job are forced to a 1-to-1 relationship
            calc_func.im_self._sc.setLocalProperty("spark.scheduler.pool", job_context.job_name)
            calc_func.im_self._sc.setJobGroup(job_context.job_name, "a spark job")
            return calc_func(*args, **kwargs1)
    except SparkHandler.SparkJobContext.MaxConcurrentJobsReached:
        raise NexusProcessingException(code=503, reason="Max concurrent requests reached. Please try again later.")

def getTimeSeriesStatsForBoxSingleDataSet(self, min_lat, max_lat, min_lon, max_lon, ds, start_time=0, end_time=-1,
                                          applySeasonalFilter=True, applyLowPass=True):
    daysinrange = self._tile_service.find_days_in_range_asc(min_lat, max_lat, min_lon, max_lon, ds,
                                                            start_time, end_time)

    if len(daysinrange) == 0:
        raise NoDataException(reason="No data found for selected timeframe")

    maxprocesses = int(self.algorithm_config.get("multiprocessing", "maxprocesses"))

    results = []
    if maxprocesses == 1:
        calculator = TimeSeriesCalculator()
        for dayinseconds in daysinrange:
            result = calculator.calc_average_on_day(min_lat, max_lat, min_lon, max_lon, ds, dayinseconds)
            results.append(result)
    else:
        # Create a task to calc average difference for each day
        manager = Manager()
        work_queue = manager.Queue()
        done_queue = manager.Queue()
        for dayinseconds in daysinrange:
            work_queue.put(('calc_average_on_day', min_lat, max_lat, min_lon, max_lon, ds, dayinseconds))
        [work_queue.put(SENTINEL) for _ in xrange(0, maxprocesses)]

        # Start new processes to handle the work
        pool = Pool(maxprocesses)
        [pool.apply_async(pool_worker, (work_queue, done_queue)) for _ in xrange(0, maxprocesses)]
        pool.close()

        # Collect the results as [(day (in ms), average difference for that day)]
        for i in xrange(0, len(daysinrange)):
            result = done_queue.get()
            try:
                error_str = result['error']
                self.log.error(error_str)
                raise NexusProcessingException(reason="Error calculating average by day.")
            except KeyError:
                pass

            results.append(result)

        pool.terminate()
        manager.shutdown()

    results = sorted(results, key=lambda entry: entry["time"])
    filt.applyAllFiltersOnField(results, 'mean', applySeasonal=applySeasonalFilter, applyLowPass=applyLowPass)
    filt.applyAllFiltersOnField(results, 'max', applySeasonal=applySeasonalFilter, applyLowPass=applyLowPass)
    filt.applyAllFiltersOnField(results, 'min', applySeasonal=applySeasonalFilter, applyLowPass=applyLowPass)

    return results, {}

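# --- Illustrative sketch (not the project's actual pool_worker implementation) ---
# The multiprocessing pattern above assumes a worker that drains the shared
# work_queue until it sees SENTINEL and pushes exactly one result (or an
# {'error': ...} dict) onto done_queue for every task. A minimal consumer of
# that shape, assuming SENTINEL and TimeSeriesCalculator as defined elsewhere
# in this module, could look like this:
def example_queue_worker(work_queue, done_queue):
    import traceback
    calculator = TimeSeriesCalculator()
    while True:
        task = work_queue.get()
        if task == SENTINEL:
            break  # no more work for this process
        try:
            # task is ('calc_average_on_day', min_lat, max_lat, min_lon, max_lon, ds, day)
            method_name, task_args = task[0], task[1:]
            done_queue.put(getattr(calculator, method_name)(*task_args))
        except Exception:
            # Report failures the same way the collector above expects them
            done_queue.put({'error': traceback.format_exc()})
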
def render(self, tornado_handler, result):
    tornado_handler.set_header("Content-Type", "text/csv")
    tornado_handler.set_header("Content-Disposition",
                               "filename=\"%s\"" % self._request.get_argument('filename', "download.csv"))

    try:
        tornado_handler.write(result.toCSV())
        tornado_handler.finish()
    except:
        traceback.print_exc(file=sys.stdout)
        raise NexusProcessingException(reason="Unable to convert results to CSV.")

def get_daily_difference_average_for_box(self, min_lat, max_lat, min_lon, max_lon, dataset1, dataset2,
                                         start_time, end_time):
    daysinrange = self._tile_service.find_days_in_range_asc(min_lat, max_lat, min_lon, max_lon, dataset1,
                                                            start_time, end_time)

    maxprocesses = int(self.algorithm_config.get("multiprocessing", "maxprocesses"))

    if maxprocesses == 1:
        calculator = DailyDifferenceAverageCalculator()
        averagebyday = []
        for dayinseconds in daysinrange:
            result = calculator.calc_average_diff_on_day(min_lat, max_lat, min_lon, max_lon, dataset1, dataset2,
                                                         dayinseconds)
            averagebyday.append((result[0], result[1]))
    else:
        # Create a task to calc average difference for each day
        manager = Manager()
        work_queue = manager.Queue()
        done_queue = manager.Queue()
        for dayinseconds in daysinrange:
            work_queue.put(
                ('calc_average_diff_on_day', min_lat, max_lat, min_lon, max_lon, dataset1, dataset2, dayinseconds))
        [work_queue.put(SENTINEL) for _ in xrange(0, maxprocesses)]

        # Start new processes to handle the work
        pool = Pool(maxprocesses)
        [pool.apply_async(pool_worker, (work_queue, done_queue)) for _ in xrange(0, maxprocesses)]
        pool.close()

        # Collect the results as [(day (in ms), average difference for that day)]
        averagebyday = []
        for i in xrange(0, len(daysinrange)):
            result = done_queue.get()
            if result[0] == 'error':
                print >> sys.stderr, result[1]
                raise NexusProcessingException(reason="Error calculating average by day.")
            rdata = result
            averagebyday.append((rdata[0], rdata[1]))

        pool.terminate()
        manager.shutdown()

    return averagebyday

def parse_arguments(self, request):
    # Parse input arguments
    self.log.debug("Parsing arguments")
    try:
        ds = request.get_dataset()[0]
    except:
        raise NexusProcessingException(reason="'ds' argument is required", code=400)

    parameter_s = request.get_argument('parameter', None)
    if parameter_s not in ['sst', 'sss', 'wind', None]:
        raise NexusProcessingException(
            reason="Parameter %s not supported. Must be one of 'sst', 'sss', 'wind'." % parameter_s, code=400)

    try:
        start_time = request.get_start_datetime()
        start_time = long((start_time - EPOCH).total_seconds())
    except:
        raise NexusProcessingException(
            reason="'startTime' argument is required. Can be int value seconds from epoch or string format YYYY-MM-DDTHH:mm:ssZ",
            code=400)
    try:
        end_time = request.get_end_datetime()
        end_time = long((end_time - EPOCH).total_seconds())
    except:
        raise NexusProcessingException(
            reason="'endTime' argument is required. Can be int value seconds from epoch or string format YYYY-MM-DDTHH:mm:ssZ",
            code=400)

    if start_time > end_time:
        raise NexusProcessingException(
            reason="The starting time must be before the ending time. Received startTime: %s, endTime: %s" % (
                request.get_start_datetime().strftime(ISO_8601), request.get_end_datetime().strftime(ISO_8601)),
            code=400)

    bounding_polygon = metadata_filter = None
    try:
        bounding_polygon = request.get_bounding_polygon()
    except:
        metadata_filter = request.get_metadata_filter()
        if 0 == len(metadata_filter):
            raise NexusProcessingException(
                reason="'b' or 'metadataFilter' argument is required. 'b' must be comma-delimited float formatted "
                       "as Minimum (Western) Longitude, Minimum (Southern) Latitude, Maximum (Eastern) Longitude, "
                       "Maximum (Northern) Latitude. 'metadataFilter' must be in the form key:value",
                code=400)

    return ds, parameter_s, start_time, end_time, bounding_polygon, metadata_filter

def __doQuery(endpoint, startTime, endTime, bbox, depth_min=None, depth_max=None, itemsPerPage=10, startIndex=0,
              platforms=None, pageCallback=None):
    params = {"startTime": startTime, "endTime": endTime, "bbox": bbox, "itemsPerPage": itemsPerPage,
              "startIndex": startIndex, "stats": "true"}

    if depth_min is not None:
        params['minDepth'] = depth_min
    if depth_max is not None:
        params['maxDepth'] = depth_max

    if platforms is not None:
        params["platform"] = platforms.split(",")

    resultsRaw = __fetchJson(endpoint["url"], params)
    boundsConstrainer = geo.BoundsConstrainer(north=-90, south=90, west=180, east=-180)

    if resultsRaw["totalResults"] == 0 or len(resultsRaw["results"]) == 0:  # Double-sanity check
        return [], resultsRaw["totalResults"], startIndex, itemsPerPage, boundsConstrainer

    try:
        results = []
        for resultdict in resultsRaw["results"]:
            result = __resultRawToUsable(resultdict)
            result["source"] = endpoint["name"]
            boundsConstrainer.testCoords(north=result["y"], south=result["y"], west=result["x"], east=result["x"])
            results.append(result)

        if "stats_fields" in resultsRaw and len(resultsRaw["results"]) == 0:
            stats = resultsRaw["stats_fields"]
            if "lat" in stats and "lon" in stats:
                boundsConstrainer.testCoords(north=stats['lat']['max'], south=stats['lat']['min'],
                                             west=stats['lon']['min'], east=stats['lon']['max'])

        if pageCallback is not None:
            pageCallback(results)

        '''
            If pageCallback was supplied, we assume this call to be asynchronous.
            Otherwise combine all the results data and return it.
        '''
        if pageCallback is None:
            return results, int(resultsRaw["totalResults"]), int(resultsRaw["startIndex"]), int(
                resultsRaw["itemsPerPage"]), boundsConstrainer
        else:
            return [], int(resultsRaw["totalResults"]), int(resultsRaw["startIndex"]), int(
                resultsRaw["itemsPerPage"]), boundsConstrainer
    except:
        print "Invalid or missing JSON in response."
        traceback.print_exc()
        raise NexusProcessingException(reason="Invalid or missing JSON in response.")

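# --- Illustrative sketch (assumed helper, not part of the original module) ---
# __doQuery returns one page at a time along with totalResults, startIndex and
# itemsPerPage, so a caller inside this module could page through an endpoint
# synchronously (i.e. without supplying pageCallback) along these lines:
def example_fetch_all_pages(endpoint, startTime, endTime, bbox, page_size=100):
    all_results = []
    start_index = 0
    while True:
        page, total, start_index, items_per_page, _bounds = __doQuery(
            endpoint, startTime, endTime, bbox,
            itemsPerPage=page_size, startIndex=start_index)
        all_results.extend(page)
        start_index += items_per_page
        if len(page) == 0 or start_index >= total:
            break  # all pages consumed (or an empty page was returned)
    return all_results
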
def calc(self, computeOptions, **args):
    execution_id = computeOptions.get_argument("id", None)

    try:
        execution_id = uuid.UUID(execution_id)
    except:
        raise NexusProcessingException(reason="'id' argument must be a valid uuid", code=400)

    simple_results = computeOptions.get_boolean_arg("simpleResults", default=False)

    with ResultsStorage.ResultsRetrieval() as storage:
        params, stats, data = storage.retrieveResults(execution_id, trim_data=simple_results)

    return BaseDomsHandler.DomsQueryResults(results=data, args=params, details=stats, bounds=None, count=None,
                                            computeOptions=None, executionId=execution_id)

def calc(self, computeOptions, **args):
    tiles = self._tile_service.get_tiles_bounded_by_box(computeOptions.get_min_lat(), computeOptions.get_max_lat(),
                                                        computeOptions.get_min_lon(), computeOptions.get_max_lon(),
                                                        computeOptions.get_dataset()[0],
                                                        computeOptions.get_start_time(),
                                                        computeOptions.get_end_time())

    if len(tiles) == 0:
        raise NexusProcessingException.NoDataException(reason="No data found for selected timeframe")

    maxprocesses = int(self.algorithm_config.get("multiprocessing", "maxprocesses"))

    pool = ThreadPool(processes=maxprocesses)
    results = [pool.apply_async(longitude_time_hofmoeller_stats, args=(tile, x)) for x, tile in enumerate(tiles)]
    pool.close()
    pool.join()

    results = [p.get() for p in results]
    results = sorted(results, key=lambda entry: entry["time"])

    results = self.applyDeseasonToHofMoeller(results, pivot="lons")

    result = HoffMoellerResults(results=results, compute_options=computeOptions, type=HoffMoellerResults.LONGITUDE)
    return result

def calc(self, computeOptions, **args):
    minLat = computeOptions.get_min_lat()
    maxLat = computeOptions.get_max_lat()
    minLon = computeOptions.get_min_lon()
    maxLon = computeOptions.get_max_lon()
    ds = computeOptions.get_dataset()
    startTime = computeOptions.get_start_time()
    endTime = computeOptions.get_end_time()
    resolution = computeOptions.get_decimal_arg("res", default=1.0)

    if not len(ds) == 2:
        raise Exception("Requires two datasets for comparison. Specify request parameter ds=Dataset_1,Dataset_2")

    ds1tiles = self._tile_service.find_tiles_in_polygon(box(minLon, minLat, maxLon, maxLat), ds[0],
                                                        startTime, endTime)
    ds2tiles = self._tile_service.find_tiles_in_polygon(box(minLon, minLat, maxLon, maxLat), ds[1],
                                                        startTime, endTime)

    matches = self._match_tiles(ds1tiles, ds2tiles)

    if len(matches) == 0:
        raise NexusProcessingException(reason="Could not find any data temporally co-located")

    results = [[{
        'cnt': 0,
        'slope': 0,
        'intercept': 0,
        'r': 0,
        'p': 0,
        'stderr': 0,
        'lat': float(lat),
        'lon': float(lon)
    } for lon in np.arange(minLon, maxLon, resolution)] for lat in np.arange(minLat, maxLat, resolution)]

    for stats in results:
        for stat in stats:
            values_x = []
            values_y = []
            for tile_matches in matches:

                tile_1_list = tile_matches[0]
                value_1 = get_approximate_value_for_lat_lon(tile_1_list, stat["lat"], stat["lon"])

                tile_2_list = tile_matches[1]
                value_2 = get_approximate_value_for_lat_lon(tile_2_list, stat["lat"], stat["lon"])

                if not (math.isnan(value_1) or math.isnan(value_2)):
                    values_x.append(value_1)
                    values_y.append(value_2)

            if len(values_x) > 2 and len(values_y) > 2:
                stats = linregress(values_x, values_y)

                stat["slope"] = stats[0] if not math.isnan(stats[0]) and not math.isinf(stats[0]) else str(stats[0])
                stat["intercept"] = stats[1] if not math.isnan(stats[1]) and not math.isinf(stats[1]) else str(stats[1])
                stat["r"] = stats[2] if not math.isnan(stats[2]) and not math.isinf(stats[2]) else str(stats[2])
                stat["p"] = stats[3] if not math.isnan(stats[3]) and not math.isinf(stats[3]) else str(stats[3])
                stat["stderr"] = stats[4] if not math.isnan(stats[4]) and not math.isinf(stats[4]) else str(stats[4])
                stat["cnt"] = len(values_x)

    return CorrelationResults(results)

def calc(self, computeOptions, **args):
    self._setQueryParams(computeOptions.get_dataset(),
                         (float(computeOptions.get_min_lat()),
                          float(computeOptions.get_max_lat()),
                          float(computeOptions.get_min_lon()),
                          float(computeOptions.get_max_lon())),
                         computeOptions.get_start_time(),
                         computeOptions.get_end_time())
    print 'ds = ', self._ds
    if not len(self._ds) == 2:
        raise Exception("Requires two datasets for comparison. Specify request parameter ds=Dataset_1,Dataset_2")

    self._find_native_resolution()
    print 'Using Native resolution: lat_res=%f, lon_res=%f' % (self._latRes, self._lonRes)
    self._minLatCent = self._minLat + self._latRes / 2
    self._minLonCent = self._minLon + self._lonRes / 2
    nlats = int((self._maxLat - self._minLatCent) / self._latRes) + 1
    nlons = int((self._maxLon - self._minLonCent) / self._lonRes) + 1
    self._maxLatCent = self._minLatCent + (nlats - 1) * self._latRes
    self._maxLonCent = self._minLonCent + (nlons - 1) * self._lonRes
    print 'nlats=', nlats, 'nlons=', nlons
    sys.stdout.flush()

    nexus_tiles = self._find_global_tile_set()
    # print 'tiles:'
    # for tile in nexus_tiles:
    #     print tile.granule
    #     print tile.section_spec
    #     print 'lat:', tile.latitudes
    #     print 'lon:', tile.longitudes

    if len(nexus_tiles) == 0:
        raise NexusProcessingException.NoDataException(reason="No data found for selected timeframe")

    print 'Initially found %d tiles' % len(nexus_tiles)
    sys.stdout.flush()
    self._prune_tiles(nexus_tiles)
    print 'Pruned to %d tiles' % len(nexus_tiles)
    sys.stdout.flush()

    # Create array of tuples to pass to Spark map function
    nexus_tile_specs = [[self._find_tile_bounds(t), self._startTime, self._endTime, self._ds]
                        for t in nexus_tiles]

    # Remove empty tiles (should have bounds set to None)
    bad_tile_inds = np.where([t[0] is None for t in nexus_tile_specs])[0]
    for i in np.flipud(bad_tile_inds):
        del nexus_tile_specs[i]

    # Configure Spark
    sp_conf = SparkConf()
    sp_conf.setAppName("Spark Correlation Map")
    sp_conf.set("spark.executorEnv.HOME", os.path.join(os.getenv('HOME'), 'spark_exec_home'))
    sp_conf.set("spark.executorEnv.PYTHONPATH", os.getcwd())
    sp_conf.set("spark.executor.memoryOverhead", "4g")

    # num_parts = 1
    num_parts = 16
    # num_parts = 64
    # num_parts = 128
    # num_execs = 1
    # num_execs = 8
    num_execs = 16
    # num_execs = 64
    cores_per_exec = 1
    sp_conf.setMaster("yarn-client")
    # sp_conf.setMaster("local[16]")
    # sp_conf.setMaster("local[1]")
    sp_conf.set("spark.executor.instances", num_execs)
    sp_conf.set("spark.executor.cores", cores_per_exec)
    # print sp_conf.getAll()
    sc = SparkContext(conf=sp_conf)

    # Launch Spark computations
    rdd = sc.parallelize(nexus_tile_specs, num_parts)
    corr_tiles = rdd.map(self._map).collect()

    r = np.zeros((nlats, nlons), dtype=np.float64, order='C')

    # The tiles below are NOT Nexus objects. They are tuples
    # with the correlation map subset lat-lon bounding box.
    for tile in corr_tiles:
        (tile_stats, tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon) = tile
        tile_data = np.ma.array(
            [[tile_stats[y][x]['r'] for x in range(len(tile_stats[0]))] for y in range(len(tile_stats))])
        tile_cnt = np.array(
            [[tile_stats[y][x]['cnt'] for x in range(len(tile_stats[0]))] for y in range(len(tile_stats))])
        tile_data.mask = ~(tile_cnt.astype(bool))
        y0 = self._lat2ind(tile_min_lat)
        y1 = self._lat2ind(tile_max_lat)
        x0 = self._lon2ind(tile_min_lon)
        x1 = self._lon2ind(tile_max_lon)
        if np.any(np.logical_not(tile_data.mask)):
            print 'writing tile lat %f-%f, lon %f-%f, map y %d-%d, map x %d-%d' % \
                (tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon, y0, y1, x0, x1)
            sys.stdout.flush()
            r[y0:y1 + 1, x0:x1 + 1] = tile_data
        else:
            print 'All pixels masked in tile lat %f-%f, lon %f-%f, map y %d-%d, map x %d-%d' % \
                (tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon, y0, y1, x0, x1)
            sys.stdout.flush()

    # Store global map in a NetCDF file.
    self._create_nc_file(r, 'corrmap.nc', 'r')

    return [[]], None, None

def parse_arguments(self, request):
    # Parse input arguments
    self.log.debug("Parsing arguments")

    try:
        bounding_polygon = request.get_bounding_polygon()
    except:
        raise NexusProcessingException(
            reason="'b' argument is required. Must be comma-delimited float formatted as Minimum (Western) Longitude, "
                   "Minimum (Southern) Latitude, Maximum (Eastern) Longitude, Maximum (Northern) Latitude",
            code=400)

    primary_ds_name = request.get_argument('primary', None)
    if primary_ds_name is None:
        raise NexusProcessingException(reason="'primary' argument is required", code=400)

    matchup_ds_names = request.get_argument('matchup', None)
    if matchup_ds_names is None:
        raise NexusProcessingException(reason="'matchup' argument is required", code=400)

    parameter_s = request.get_argument('parameter', 'sst')
    if parameter_s not in ['sst', 'sss', 'wind']:
        raise NexusProcessingException(
            reason="Parameter %s not supported. Must be one of 'sst', 'sss', 'wind'." % parameter_s, code=400)

    try:
        start_time = request.get_start_datetime()
    except:
        raise NexusProcessingException(
            reason="'startTime' argument is required. Can be int value seconds from epoch or string format YYYY-MM-DDTHH:mm:ssZ",
            code=400)
    try:
        end_time = request.get_end_datetime()
    except:
        raise NexusProcessingException(
            reason="'endTime' argument is required. Can be int value seconds from epoch or string format YYYY-MM-DDTHH:mm:ssZ",
            code=400)

    if start_time > end_time:
        raise NexusProcessingException(
            reason="The starting time must be before the ending time. Received startTime: %s, endTime: %s" % (
                request.get_start_datetime().strftime(ISO_8601), request.get_end_datetime().strftime(ISO_8601)),
            code=400)

    depth_min = request.get_decimal_arg('depthMin', default=None)
    depth_max = request.get_decimal_arg('depthMax', default=None)

    if depth_min is not None and depth_max is not None and depth_min >= depth_max:
        raise NexusProcessingException(reason="Depth Min should be less than Depth Max", code=400)

    time_tolerance = request.get_int_arg('tt', default=86400)
    radius_tolerance = request.get_decimal_arg('rt', default=1000.0)

    platforms = request.get_argument('platforms', None)
    if platforms is None:
        raise NexusProcessingException(reason="'platforms' argument is required", code=400)
    try:
        p_validation = platforms.split(',')
        p_validation = [int(p) for p in p_validation]
        del p_validation
    except:
        raise NexusProcessingException(reason="platforms must be a comma-delimited list of integers", code=400)

    match_once = request.get_boolean_arg("matchOnce", default=False)

    result_size_limit = request.get_int_arg("resultSizeLimit", default=500)

    start_seconds_from_epoch = long((start_time - EPOCH).total_seconds())
    end_seconds_from_epoch = long((end_time - EPOCH).total_seconds())

    return bounding_polygon, primary_ds_name, matchup_ds_names, parameter_s, \
           start_time, start_seconds_from_epoch, end_time, end_seconds_from_epoch, \
           depth_min, depth_max, time_tolerance, radius_tolerance, \
           platforms, match_once, result_size_limit

def parse_arguments(self, request):
    # Parse input arguments
    self.log.debug("Parsing arguments")

    try:
        ds = request.get_dataset()
        if type(ds) == list or type(ds) == tuple:
            ds = next(iter(ds))
    except:
        raise NexusProcessingException(reason="'ds' argument is required. Must be a string", code=400)

    # Do not allow time series on Climatology
    if 'CLIM' in ds:
        raise NexusProcessingException(
            reason="Cannot compute Latitude/Longitude Time Average plot on a climatology", code=400)

    try:
        bounding_polygon = request.get_bounding_polygon()
        request.get_min_lon = lambda: bounding_polygon.bounds[0]
        request.get_min_lat = lambda: bounding_polygon.bounds[1]
        request.get_max_lon = lambda: bounding_polygon.bounds[2]
        request.get_max_lat = lambda: bounding_polygon.bounds[3]
    except:
        try:
            west, south, east, north = request.get_min_lon(), request.get_min_lat(), \
                                       request.get_max_lon(), request.get_max_lat()
            bounding_polygon = shapely.geometry.Polygon(
                [(west, south), (east, south), (east, north), (west, north), (west, south)])
        except:
            raise NexusProcessingException(
                reason="'b' argument is required. Must be comma-delimited float formatted as "
                       "Minimum (Western) Longitude, Minimum (Southern) Latitude, "
                       "Maximum (Eastern) Longitude, Maximum (Northern) Latitude",
                code=400)

    try:
        start_time = request.get_start_datetime()
    except:
        raise NexusProcessingException(
            reason="'startTime' argument is required. Can be int value seconds from epoch or "
                   "string format YYYY-MM-DDTHH:mm:ssZ",
            code=400)
    try:
        end_time = request.get_end_datetime()
    except:
        raise NexusProcessingException(
            reason="'endTime' argument is required. Can be int value seconds from epoch or "
                   "string format YYYY-MM-DDTHH:mm:ssZ",
            code=400)

    if start_time > end_time:
        raise NexusProcessingException(
            reason="The starting time must be before the ending time. Received startTime: %s, endTime: %s" % (
                request.get_start_datetime().strftime(ISO_8601), request.get_end_datetime().strftime(ISO_8601)),
            code=400)

    start_seconds_from_epoch = int((start_time - EPOCH).total_seconds())
    end_seconds_from_epoch = int((end_time - EPOCH).total_seconds())
    normalize_dates = request.get_normalize_dates()

    return ds, bounding_polygon, start_seconds_from_epoch, end_seconds_from_epoch, normalize_dates

def calc(self, request, **args):
    start = datetime.utcnow()
    # TODO Assuming Satellite primary
    bounding_polygon, primary_ds_name, matchup_ds_names, parameter_s, \
        start_time, start_seconds_from_epoch, end_time, end_seconds_from_epoch, \
        depth_min, depth_max, time_tolerance, radius_tolerance, \
        platforms, match_once, result_size_limit = self.parse_arguments(request)

    with ResultsStorage() as resultsStorage:
        execution_id = str(resultsStorage.insertExecution(None, start, None, None))

    self.log.debug("Querying for tiles in search domain")
    # Get tile ids in box
    tile_ids = [tile.tile_id for tile in
                self._tile_service.find_tiles_in_polygon(bounding_polygon, primary_ds_name,
                                                         start_seconds_from_epoch, end_seconds_from_epoch,
                                                         fetch_data=False, fl='id',
                                                         sort=['tile_min_time_dt asc', 'tile_min_lon asc',
                                                               'tile_min_lat asc'],
                                                         rows=5000)]

    # Call spark_matchup
    self.log.debug("Calling Spark Driver")
    try:
        spark_result = spark_matchup_driver(tile_ids, wkt.dumps(bounding_polygon), primary_ds_name,
                                            matchup_ds_names, parameter_s, depth_min, depth_max, time_tolerance,
                                            radius_tolerance, platforms, match_once, sc=self._sc)
    except Exception as e:
        self.log.exception(e)
        raise NexusProcessingException(reason="An unknown error occurred while computing matches", code=500)

    end = datetime.utcnow()

    self.log.debug("Building and saving results")
    args = {
        "primary": primary_ds_name,
        "matchup": matchup_ds_names,
        "startTime": start_time,
        "endTime": end_time,
        "bbox": request.get_argument('b'),
        "timeTolerance": time_tolerance,
        "radiusTolerance": float(radius_tolerance),
        "platforms": platforms,
        "parameter": parameter_s
    }

    if depth_min is not None:
        args["depthMin"] = float(depth_min)

    if depth_max is not None:
        args["depthMax"] = float(depth_max)

    total_keys = len(spark_result.keys())
    total_values = sum(len(v) for v in spark_result.itervalues())
    details = {
        "timeToComplete": int((end - start).total_seconds()),
        "numInSituRecords": 0,
        "numInSituMatched": total_values,
        "numGriddedChecked": 0,
        "numGriddedMatched": total_keys
    }

    matches = Matchup.convert_to_matches(spark_result)

    def do_result_insert():
        with ResultsStorage() as storage:
            storage.insertResults(results=matches, params=args, stats=details,
                                  startTime=start, completeTime=end, userEmail="",
                                  execution_id=execution_id)

    threading.Thread(target=do_result_insert).start()

    if 0 < result_size_limit < len(matches):
        result = DomsQueryResults(results=None, args=args, details=details, bounds=None, count=None,
                                  computeOptions=None, executionId=execution_id, status_code=202)
    else:
        result = DomsQueryResults(results=matches, args=args, details=details, bounds=None, count=None,
                                  computeOptions=None, executionId=execution_id)

    return result

def getTimeSeriesStatsForBoxSingleDataSet(self, bounding_polygon, ds, start_seconds_from_epoch,
                                          end_seconds_from_epoch,
                                          apply_seasonal_cycle_filter=True, apply_low_pass_filter=True):

    the_time = datetime.now()
    daysinrange = self._get_tile_service().find_days_in_range_asc(bounding_polygon.bounds[1],
                                                                  bounding_polygon.bounds[3],
                                                                  bounding_polygon.bounds[0],
                                                                  bounding_polygon.bounds[2],
                                                                  ds,
                                                                  start_seconds_from_epoch,
                                                                  end_seconds_from_epoch)
    logger.info("Finding days in range took %s for dataset %s" % (str(datetime.now() - the_time), ds))

    if len(daysinrange) == 0:
        raise NoDataException(reason="No data found for selected timeframe")

    the_time = datetime.now()
    maxprocesses = int(self.algorithm_config.get("multiprocessing", "maxprocesses"))

    results = []
    if maxprocesses == 1:
        calculator = TimeSeriesCalculator()
        for dayinseconds in daysinrange:
            result = calculator.calc_average_on_day(bounding_polygon.wkt, ds, dayinseconds)
            results += [result] if result else []
    else:
        # Create a task to calc average difference for each day
        manager = Manager()
        work_queue = manager.Queue()
        done_queue = manager.Queue()
        for dayinseconds in daysinrange:
            work_queue.put(('calc_average_on_day', bounding_polygon.wkt, ds, dayinseconds))
        [work_queue.put(SENTINEL) for _ in range(0, maxprocesses)]

        # Start new processes to handle the work
        pool = Pool(maxprocesses)
        [pool.apply_async(pool_worker, (work_queue, done_queue)) for _ in range(0, maxprocesses)]
        pool.close()

        # Collect the results as [(day (in ms), average difference for that day)]
        for i in range(0, len(daysinrange)):
            result = done_queue.get()
            try:
                error_str = result['error']
                logger.error(error_str)
                raise NexusProcessingException(reason="Error calculating average by day.")
            except KeyError:
                pass

            results += [result] if result else []

        pool.terminate()
        manager.shutdown()

    results = sorted(results, key=lambda entry: entry["time"])
    logger.info("Time series calculation took %s for dataset %s" % (str(datetime.now() - the_time), ds))

    if apply_seasonal_cycle_filter:
        the_time = datetime.now()
        for result in results:
            month = datetime.utcfromtimestamp(result['time']).month
            month_mean, month_max, month_min = self.calculate_monthly_average(month, bounding_polygon.wkt, ds)
            seasonal_mean = result['mean'] - month_mean
            seasonal_min = result['min'] - month_min
            seasonal_max = result['max'] - month_max
            result['meanSeasonal'] = seasonal_mean
            result['minSeasonal'] = seasonal_min
            result['maxSeasonal'] = seasonal_max
        logger.info("Seasonal calculation took %s for dataset %s" % (str(datetime.now() - the_time), ds))

    the_time = datetime.now()
    filtering.applyAllFiltersOnField(results, 'mean', applySeasonal=False, applyLowPass=apply_low_pass_filter)
    filtering.applyAllFiltersOnField(results, 'max', applySeasonal=False, applyLowPass=apply_low_pass_filter)
    filtering.applyAllFiltersOnField(results, 'min', applySeasonal=False, applyLowPass=apply_low_pass_filter)

    if apply_seasonal_cycle_filter and apply_low_pass_filter:
        try:
            filtering.applyFiltersOnField(results, 'meanSeasonal', applySeasonal=False, applyLowPass=True,
                                          append="LowPass")
            filtering.applyFiltersOnField(results, 'minSeasonal', applySeasonal=False, applyLowPass=True,
                                          append="LowPass")
            filtering.applyFiltersOnField(results, 'maxSeasonal', applySeasonal=False, applyLowPass=True,
                                          append="LowPass")
        except Exception as e:
            # If it doesn't work log the error but ignore it
            tb = traceback.format_exc()
            logger.warn("Error calculating SeasonalLowPass filter:\n%s" % tb)

    logger.info("LowPass filter calculation took %s for dataset %s" % (str(datetime.now() - the_time), ds))

    return results, {}

def calc(self, request, **args):
    """
    :param request: StatsComputeOptions
    :param args: dict
    :return:
    """
    start_time = datetime.now()
    ds, bounding_polygon, start_seconds_from_epoch, end_seconds_from_epoch, apply_seasonal_cycle_filter, \
        apply_low_pass_filter, nparts_requested, normalize_dates = self.parse_arguments(request)
    metrics_record = self._create_metrics_record()

    resultsRaw = []

    for shortName in ds:

        the_time = datetime.now()
        daysinrange = self._get_tile_service().find_days_in_range_asc(bounding_polygon.bounds[1],
                                                                      bounding_polygon.bounds[3],
                                                                      bounding_polygon.bounds[0],
                                                                      bounding_polygon.bounds[2],
                                                                      shortName,
                                                                      start_seconds_from_epoch,
                                                                      end_seconds_from_epoch,
                                                                      metrics_callback=metrics_record.record_metrics)
        self.log.info("Finding days in range took %s for dataset %s" % (str(datetime.now() - the_time), shortName))

        ndays = len(daysinrange)
        if ndays == 0:
            raise NoDataException(reason="No data found for selected timeframe")

        self.log.debug('Found {0} days in range'.format(ndays))
        for i, d in enumerate(daysinrange):
            self.log.debug('{0}, {1}'.format(i, datetime.utcfromtimestamp(d)))
        spark_nparts = self._spark_nparts(nparts_requested)
        self.log.info('Using {} partitions'.format(spark_nparts))

        results, meta = spark_driver(daysinrange, bounding_polygon,
                                     shortName,
                                     self._tile_service_factory,
                                     metrics_record.record_metrics,
                                     normalize_dates,
                                     spark_nparts=spark_nparts,
                                     sc=self._sc)

        if apply_seasonal_cycle_filter:
            the_time = datetime.now()
            # get time series for _clim dataset
            shortName_clim = shortName + "_clim"
            daysinrange_clim = self._get_tile_service().find_days_in_range_asc(
                bounding_polygon.bounds[1],
                bounding_polygon.bounds[3],
                bounding_polygon.bounds[0],
                bounding_polygon.bounds[2],
                shortName_clim,
                0,
                SECONDS_IN_ONE_YEAR,
                metrics_callback=metrics_record.record_metrics)
            if len(daysinrange_clim) == 0:
                raise NexusProcessingException(
                    reason="There is no climatology data present for dataset " + shortName + ".")
            results_clim, _ = spark_driver(daysinrange_clim,
                                           bounding_polygon,
                                           shortName_clim,
                                           self._tile_service_factory,
                                           metrics_record.record_metrics,
                                           normalize_dates=False,
                                           spark_nparts=spark_nparts,
                                           sc=self._sc)
            clim_indexed_by_month = {datetime.utcfromtimestamp(result['time']).month: result
                                     for result in results_clim}
            if len(clim_indexed_by_month) < 12:
                raise NexusProcessingException(
                    reason="There are only " + str(len(clim_indexed_by_month)) +
                           " months of climatology data for dataset " + shortName +
                           ". A full year of climatology data is required for computing deseasoned timeseries.")

            for result in results:
                month = datetime.utcfromtimestamp(result['time']).month

                result['meanSeasonal'] = result['mean'] - clim_indexed_by_month[month]['mean']
                result['minSeasonal'] = result['min'] - clim_indexed_by_month[month]['min']
                result['maxSeasonal'] = result['max'] - clim_indexed_by_month[month]['max']
            self.log.info("Seasonal calculation took %s for dataset %s" % (str(datetime.now() - the_time),
                                                                           shortName))

        the_time = datetime.now()
        filtering.applyAllFiltersOnField(results, 'mean', applySeasonal=False,
                                         applyLowPass=apply_low_pass_filter)
        filtering.applyAllFiltersOnField(results, 'max', applySeasonal=False,
                                         applyLowPass=apply_low_pass_filter)
        filtering.applyAllFiltersOnField(results, 'min', applySeasonal=False,
                                         applyLowPass=apply_low_pass_filter)

        if apply_seasonal_cycle_filter and apply_low_pass_filter:
            try:
                filtering.applyFiltersOnField(results, 'meanSeasonal', applySeasonal=False, applyLowPass=True,
                                              append="LowPass")
                filtering.applyFiltersOnField(results, 'minSeasonal', applySeasonal=False, applyLowPass=True,
                                              append="LowPass")
                filtering.applyFiltersOnField(results, 'maxSeasonal', applySeasonal=False, applyLowPass=True,
                                              append="LowPass")
            except Exception as e:
                # If it doesn't work log the error but ignore it
                tb = traceback.format_exc()
                self.log.warn("Error calculating SeasonalLowPass filter:\n%s" % tb)

        resultsRaw.append([results, meta])
        self.log.info("LowPass filter calculation took %s for dataset %s" % (str(datetime.now() - the_time),
                                                                             shortName))

        the_time = datetime.now()
        self._create_nc_file_time1d(np.array(results), 'ts.nc', 'mean', fill=-9999.)
        self.log.info("NetCDF generation took %s for dataset %s" % (str(datetime.now() - the_time), shortName))

    the_time = datetime.now()
    results = self._mergeResults(resultsRaw)

    if len(ds) == 2:
        try:
            stats = TimeSeriesSparkHandlerImpl.calculate_comparison_stats(results)
        except Exception:
            stats = {}
            tb = traceback.format_exc()
            self.log.warn("Error when calculating comparison stats:\n%s" % tb)
    else:
        stats = {}

    meta = []
    for singleRes in resultsRaw:
        meta.append(singleRes[1])

    res = TimeSeriesResults(results=results, meta=meta, stats=stats, computeOptions=None,
                            minLat=bounding_polygon.bounds[1], maxLat=bounding_polygon.bounds[3],
                            minLon=bounding_polygon.bounds[0], maxLon=bounding_polygon.bounds[2],
                            ds=ds, startTime=start_seconds_from_epoch, endTime=end_seconds_from_epoch)

    total_duration = (datetime.now() - start_time).total_seconds()
    metrics_record.record_metrics(actual_time=total_duration)
    metrics_record.print_metrics(logger)

    self.log.info("Merging results and calculating comparisons took %s" % (str(datetime.now() - the_time)))
    return res

def calc(self, computeOptions, **args):
    spark_master, spark_nexecs, spark_nparts = computeOptions.get_spark_cfg()
    self._setQueryParams(computeOptions.get_dataset(),
                         (float(computeOptions.get_min_lat()),
                          float(computeOptions.get_max_lat()),
                          float(computeOptions.get_min_lon()),
                          float(computeOptions.get_max_lon())),
                         computeOptions.get_start_time(),
                         computeOptions.get_end_time(),
                         spark_master=spark_master,
                         spark_nexecs=spark_nexecs,
                         spark_nparts=spark_nparts)

    self.log.debug('ds = {0}'.format(self._ds))
    if not len(self._ds) == 2:
        raise NexusProcessingException(
            reason="Requires two datasets for comparison. Specify request parameter ds=Dataset_1,Dataset_2",
            code=400)
    if next(iter([clim for clim in self._ds if 'CLIM' in clim]), False):
        raise NexusProcessingException(reason="Cannot compute correlation on a climatology", code=400)

    nexus_tiles = self._find_global_tile_set()
    # print 'tiles:'
    # for tile in nexus_tiles:
    #     print tile.granule
    #     print tile.section_spec
    #     print 'lat:', tile.latitudes
    #     print 'lon:', tile.longitudes

    if len(nexus_tiles) == 0:
        raise NoDataException(reason="No data found for selected timeframe")

    self.log.debug('Found {0} tiles'.format(len(nexus_tiles)))
    self.log.debug('Using Native resolution: lat_res={0}, lon_res={1}'.format(self._latRes, self._lonRes))
    nlats = int((self._maxLat - self._minLatCent) / self._latRes) + 1
    nlons = int((self._maxLon - self._minLonCent) / self._lonRes) + 1
    self.log.debug('nlats={0}, nlons={1}'.format(nlats, nlons))

    # Create array of tuples to pass to Spark map function
    nexus_tiles_spark = [[self._find_tile_bounds(t), self._startTime, self._endTime, self._ds]
                         for t in nexus_tiles]

    # Remove empty tiles (should have bounds set to None)
    bad_tile_inds = np.where([t[0] is None for t in nexus_tiles_spark])[0]
    for i in np.flipud(bad_tile_inds):
        del nexus_tiles_spark[i]

    # Expand Spark map tuple array by duplicating each entry N times,
    # where N is the number of ways we want the time dimension carved up.
    num_time_parts = 72
    # num_time_parts = 2
    # num_time_parts = 1
    nexus_tiles_spark = np.repeat(nexus_tiles_spark, num_time_parts, axis=0)
    self.log.debug('repeated len(nexus_tiles_spark) = {0}'.format(len(nexus_tiles_spark)))

    # Set the time boundaries for each of the Spark map tuples.
    # Every Nth element in the array gets the same time bounds.
    spark_part_times = np.linspace(self._startTime, self._endTime + 1, num_time_parts + 1, dtype=np.int64)
    spark_part_time_ranges = \
        np.repeat([[[spark_part_times[i], spark_part_times[i + 1] - 1]
                    for i in range(num_time_parts)]],
                  len(nexus_tiles_spark) / num_time_parts, axis=0).reshape((len(nexus_tiles_spark), 2))
    self.log.debug('spark_part_time_ranges={0}'.format(spark_part_time_ranges))
    nexus_tiles_spark[:, 1:3] = spark_part_time_ranges
    # print 'nexus_tiles_spark final = '
    # for i in range(len(nexus_tiles_spark)):
    #     print nexus_tiles_spark[i]

    # Launch Spark computations
    # print 'nexus_tiles_spark=', nexus_tiles_spark
    rdd = self._sc.parallelize(nexus_tiles_spark, self._spark_nparts)
    sum_tiles_part = rdd.map(self._map)
    # print "sum_tiles_part = ", sum_tiles_part.collect()
    sum_tiles = \
        sum_tiles_part.combineByKey(lambda val: val,
                                    lambda x, val: (x[0] + val[0], x[1] + val[1], x[2] + val[2],
                                                    x[3] + val[3], x[4] + val[4], x[5] + val[5]),
                                    lambda x, y: (x[0] + y[0], x[1] + y[1], x[2] + y[2],
                                                  x[3] + y[3], x[4] + y[4], x[5] + y[5]))

    # Convert the N (pixel-wise count) array for each tile to be a
    # NumPy masked array. That is the last array in the tuple of
    # intermediate summation arrays. Set mask to True if count is 0.
    sum_tiles = \
        sum_tiles.map(lambda (bounds, (sum_x, sum_y, sum_xx, sum_yy, sum_xy, n)):
                      (bounds, (sum_x, sum_y, sum_xx, sum_yy, sum_xy,
                                np.ma.array(n, mask=~(n.astype(bool))))))
    # print 'sum_tiles = ', sum_tiles.collect()

    # For each pixel in each tile compute an array of Pearson
    # correlation coefficients. The map function is called once
    # per tile. The result of this map operation is a list of 3-tuples of
    # (bounds, r, n) for each tile (r=Pearson correlation coefficient
    # and n=number of input values that went into each pixel with
    # any masked values not included).
    corr_tiles = \
        sum_tiles.map(lambda (bounds, (sum_x, sum_y, sum_xx, sum_yy, sum_xy, n)):
                      (bounds,
                       np.ma.array(((sum_xy - sum_x * sum_y / n) /
                                    np.sqrt((sum_xx - sum_x * sum_x / n) * (sum_yy - sum_y * sum_y / n))),
                                   mask=~(n.astype(bool))),
                       n)).collect()

    r = np.zeros((nlats, nlons), dtype=np.float64, order='C')
    n = np.zeros((nlats, nlons), dtype=np.uint32, order='C')

    # The tiles below are NOT Nexus objects. They are tuples
    # with the following for each correlation map subset:
    # (1) lat-lon bounding box, (2) array of correlation r values,
    # and (3) array of count n values.
    for tile in corr_tiles:
        ((tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon),
         tile_data, tile_cnt) = tile
        y0 = self._lat2ind(tile_min_lat)
        y1 = self._lat2ind(tile_max_lat)
        x0 = self._lon2ind(tile_min_lon)
        x1 = self._lon2ind(tile_max_lon)
        self.log.debug('writing tile lat {0}-{1}, lon {2}-{3}, map y {4}-{5}, map x {6}-{7}'
                       .format(tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon, y0, y1, x0, x1))
        r[y0:y1 + 1, x0:x1 + 1] = tile_data
        n[y0:y1 + 1, x0:x1 + 1] = tile_cnt

    # Store global map in a NetCDF file.
    self._create_nc_file(r, 'corrmap.nc', 'r')

    # Create dict for JSON response
    results = [[{'r': r[y, x], 'cnt': int(n[y, x]),
                 'lat': self._ind2lat(y), 'lon': self._ind2lon(x)}
                for x in range(r.shape[1])] for y in range(r.shape[0])]

    return CorrelationResults(results)

def calc(self, computeOptions, **args):
    """
    :param computeOptions: StatsComputeOptions
    :param args: dict
    :return:
    """
    self._setQueryParams(computeOptions.get_dataset()[0],
                         (float(computeOptions.get_min_lat()),
                          float(computeOptions.get_max_lat()),
                          float(computeOptions.get_min_lon()),
                          float(computeOptions.get_max_lon())),
                         computeOptions.get_start_time(),
                         computeOptions.get_end_time())

    self._find_native_resolution()
    print 'Using Native resolution: lat_res=%f, lon_res=%f' % (self._latRes, self._lonRes)
    self._minLatCent = self._minLat + self._latRes / 2
    self._minLonCent = self._minLon + self._lonRes / 2
    nlats = int((self._maxLat - self._minLatCent) / self._latRes) + 1
    nlons = int((self._maxLon - self._minLonCent) / self._lonRes) + 1
    self._maxLatCent = self._minLatCent + (nlats - 1) * self._latRes
    self._maxLonCent = self._minLonCent + (nlons - 1) * self._lonRes
    print 'nlats=', nlats, 'nlons=', nlons
    print 'center lat range = %f to %f' % (self._minLatCent, self._maxLatCent)
    print 'center lon range = %f to %f' % (self._minLonCent, self._maxLonCent)
    sys.stdout.flush()

    a = np.zeros((nlats, nlons), dtype=np.float64, order='C')
    n = np.zeros((nlats, nlons), dtype=np.float64, order='C')

    nexus_tiles = self._find_global_tile_set()
    # print 'tiles:'
    # for tile in nexus_tiles:
    #     print tile.granule
    #     print tile.section_spec
    #     print 'lat:', tile.latitudes
    #     print 'lon:', tile.longitudes

    if len(nexus_tiles) == 0:
        raise NexusProcessingException.NoDataException(reason="No data found for selected timeframe")

    print 'Initially found %d tiles' % len(nexus_tiles)
    sys.stdout.flush()
    self._prune_tiles(nexus_tiles)
    print 'Pruned to %d tiles' % len(nexus_tiles)
    sys.stdout.flush()
    # for tile in nexus_tiles:
    #     print 'lats: ', tile.latitudes.compressed()
    #     print 'lons: ', tile.longitudes.compressed()

    # Create array of tuples to pass to Spark map function
    cwd = os.getcwd()
    nexus_tiles_spark = [[self._find_tile_bounds(t), self._startTime, self._endTime, self._ds, cwd]
                         for t in nexus_tiles]
    # print 'nexus_tiles_spark = ', nexus_tiles_spark

    # Remove empty tiles (should have bounds set to None)
    bad_tile_inds = np.where([t[0] is None for t in nexus_tiles_spark])[0]
    for i in np.flipud(bad_tile_inds):
        del nexus_tiles_spark[i]

    # Expand Spark map tuple array by duplicating each entry N times,
    # where N is the number of ways we want the time dimension carved up.
    num_time_parts = 18
    # nexus_tiles_spark = list(itertools.chain.from_iterable(itertools.repeat(t, num_time_parts)
    #                                                         for t in nexus_tiles_spark))
    nexus_tiles_spark = np.repeat(nexus_tiles_spark, num_time_parts, axis=0)
    print 'repeated len(nexus_tiles_spark) = ', len(nexus_tiles_spark)

    # Set the time boundaries for each of the Spark map tuples.
    # Every Nth element in the array gets the same time bounds.
    spark_part_times = np.linspace(self._startTime, self._endTime, num_time_parts + 1, dtype=np.int64)
    spark_part_time_ranges = \
        np.repeat([[[spark_part_times[i], spark_part_times[i + 1]]
                    for i in range(num_time_parts)]],
                  len(nexus_tiles_spark) / num_time_parts, axis=0).reshape((len(nexus_tiles_spark), 2))
    print 'spark_part_time_ranges=', spark_part_time_ranges
    nexus_tiles_spark[:, 1:3] = spark_part_time_ranges
    print 'nexus_tiles_spark final = '
    for i in range(len(nexus_tiles_spark)):
        print nexus_tiles_spark[i]

    # Configure Spark
    sp_conf = SparkConf()
    sp_conf.setAppName("Spark Time Avg Map")
    sp_conf.set("spark.executorEnv.HOME", os.path.join(os.getenv('HOME'), 'spark_exec_home'))
    sp_conf.set("spark.executorEnv.PYTHONPATH", cwd)
    # sp_conf.set("spark.yarn.executor.memoryOverhead", "4000")
    sp_conf.set("spark.executor.memory", "4g")

    # num_parts = 1
    num_parts = 16
    # num_parts = 64
    # num_parts = 128
    # num_execs = 1
    num_execs = 16
    # num_execs = 64
    cores_per_exec = 1
    sp_conf.setMaster("yarn-client")
    # sp_conf.setMaster("local[16]")
    # sp_conf.setMaster("local[1]")
    sp_conf.set("spark.executor.instances", num_execs)
    sp_conf.set("spark.executor.cores", cores_per_exec)
    # print sp_conf.getAll()
    sc = SparkContext(conf=sp_conf)

    # Launch Spark computations
    rdd = sc.parallelize(nexus_tiles_spark, num_parts)
    sum_count_part = rdd.map(self._map)
    sum_count = \
        sum_count_part.combineByKey(lambda val: val,
                                    lambda x, val: (x[0] + val[0], x[1] + val[1]),
                                    lambda x, y: (x[0] + y[0], x[1] + y[1]))
    avg_tiles = \
        sum_count.map(lambda (bounds, (sum_tile, cnt_tile)):
                      (bounds, [[{'avg': (sum_tile[y, x] / cnt_tile[y, x]) if (cnt_tile[y, x] > 0) else 0.,
                                  'cnt': cnt_tile[y, x]}
                                 for x in range(sum_tile.shape[1])]
                                for y in range(sum_tile.shape[0])])).collect()
    # avg_tiles = map(self._map, nexus_tiles)

    # Combine subset results to produce global map.
    #
    # The tiles below are NOT Nexus objects. They are tuples
    # with the time avg map data and lat-lon bounding box.
    for tile in avg_tiles:
        if tile is not None:
            ((tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon), tile_stats) = tile
            tile_data = np.ma.array(
                [[tile_stats[y][x]['avg'] for x in range(len(tile_stats[0]))] for y in range(len(tile_stats))])
            tile_cnt = np.array(
                [[tile_stats[y][x]['cnt'] for x in range(len(tile_stats[0]))] for y in range(len(tile_stats))])
            tile_data.mask = ~(tile_cnt.astype(bool))
            y0 = self._lat2ind(tile_min_lat)
            y1 = y0 + tile_data.shape[0] - 1
            x0 = self._lon2ind(tile_min_lon)
            x1 = x0 + tile_data.shape[1] - 1
            if np.any(np.logical_not(tile_data.mask)):
                print 'writing tile lat %f-%f, lon %f-%f, map y %d-%d, map x %d-%d' % \
                    (tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon, y0, y1, x0, x1)
                sys.stdout.flush()
                a[y0:y1 + 1, x0:x1 + 1] = tile_data
                n[y0:y1 + 1, x0:x1 + 1] = tile_cnt
            else:
                print 'All pixels masked in tile lat %f-%f, lon %f-%f, map y %d-%d, map x %d-%d' % \
                    (tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon, y0, y1, x0, x1)
                sys.stdout.flush()

    # Store global map in a NetCDF file.
    self._create_nc_file(a, 'tam.nc', 'val')

    # Create dict for JSON response
    results = [[{'avg': a[x, y], 'cnt': n[x, y]}
                for x in range(a.shape[0])] for y in range(a.shape[1])]

    return TimeAvgMapSparkResults(results=results, meta={}, computeOptions=computeOptions)

def calc(self, computeOptions, **args):
    """
    :param computeOptions: StatsComputeOptions
    :param args: dict
    :return:
    """
    spark_master, spark_nexecs, spark_nparts = computeOptions.get_spark_cfg()
    self._setQueryParams(computeOptions.get_dataset()[0],
                         (float(computeOptions.get_min_lat()),
                          float(computeOptions.get_max_lat()),
                          float(computeOptions.get_min_lon()),
                          float(computeOptions.get_max_lon())),
                         computeOptions.get_start_time(),
                         computeOptions.get_end_time(),
                         spark_master=spark_master,
                         spark_nexecs=spark_nexecs,
                         spark_nparts=spark_nparts)

    if 'CLIM' in self._ds:
        raise NexusProcessingException(
            reason="Cannot compute Latitude/Longitude Time Average plot on a climatology",
            code=400)

    nexus_tiles = self._find_global_tile_set()
    # print 'tiles:'
    # for tile in nexus_tiles:
    #     print tile.granule
    #     print tile.section_spec
    #     print 'lat:', tile.latitudes
    #     print 'lon:', tile.longitudes

    if len(nexus_tiles) == 0:
        raise NoDataException(reason="No data found for selected timeframe")

    self.log.debug('Found {0} tiles'.format(len(nexus_tiles)))
    self.log.debug('Using Native resolution: lat_res={0}, lon_res={1}'.format(self._latRes, self._lonRes))
    nlats = int((self._maxLat - self._minLatCent) / self._latRes) + 1
    nlons = int((self._maxLon - self._minLonCent) / self._lonRes) + 1
    self.log.debug('nlats={0}, nlons={1}'.format(nlats, nlons))
    self.log.debug('center lat range = {0} to {1}'.format(self._minLatCent, self._maxLatCent))
    self.log.debug('center lon range = {0} to {1}'.format(self._minLonCent, self._maxLonCent))

    # for tile in nexus_tiles:
    #     print 'lats: ', tile.latitudes.compressed()
    #     print 'lons: ', tile.longitudes.compressed()

    # Create array of tuples to pass to Spark map function
    nexus_tiles_spark = [[self._find_tile_bounds(t),
                          self._startTime,
                          self._endTime,
                          self._ds] for t in nexus_tiles]
    # print 'nexus_tiles_spark = ', nexus_tiles_spark

    # Remove empty tiles (should have bounds set to None)
    bad_tile_inds = np.where([t[0] is None for t in nexus_tiles_spark])[0]
    for i in np.flipud(bad_tile_inds):
        del nexus_tiles_spark[i]

    # Expand Spark map tuple array by duplicating each entry N times,
    # where N is the number of ways we want the time dimension carved up.
    num_time_parts = 72
    # num_time_parts = 1
    nexus_tiles_spark = np.repeat(nexus_tiles_spark, num_time_parts, axis=0)
    self.log.debug('repeated len(nexus_tiles_spark) = {0}'.format(len(nexus_tiles_spark)))

    # Set the time boundaries for each of the Spark map tuples.
    # Every Nth element in the array gets the same time bounds.
    spark_part_times = np.linspace(self._startTime, self._endTime,
                                   num_time_parts + 1, dtype=np.int64)
    spark_part_time_ranges = \
        np.repeat([[[spark_part_times[i],
                     spark_part_times[i + 1]] for i in range(num_time_parts)]],
                  len(nexus_tiles_spark) / num_time_parts,
                  axis=0).reshape((len(nexus_tiles_spark), 2))
    self.log.debug('spark_part_time_ranges={0}'.format(spark_part_time_ranges))
    nexus_tiles_spark[:, 1:3] = spark_part_time_ranges
    # print 'nexus_tiles_spark final = '
    # for i in range(len(nexus_tiles_spark)):
    #     print nexus_tiles_spark[i]

    # Launch Spark computations
    rdd = self._sc.parallelize(nexus_tiles_spark, self._spark_nparts)
    sum_count_part = rdd.map(self._map)
    sum_count = \
        sum_count_part.combineByKey(lambda val: val,
                                    lambda x, val: (x[0] + val[0], x[1] + val[1]),
                                    lambda x, y: (x[0] + y[0], x[1] + y[1]))
    fill = self._fill
    avg_tiles = \
        sum_count.map(lambda (bounds, (sum_tile, cnt_tile)):
                      (bounds, [[{'avg': (sum_tile[y, x] / cnt_tile[y, x])
                                  if (cnt_tile[y, x] > 0) else fill,
                                  'cnt': cnt_tile[y, x]}
                                 for x in range(sum_tile.shape[1])]
                                for y in range(sum_tile.shape[0])])).collect()

    # Combine subset results to produce global map.
    #
    # The tiles below are NOT Nexus objects.  They are tuples
    # with the time avg map data and lat-lon bounding box.
    a = np.zeros((nlats, nlons), dtype=np.float64, order='C')
    n = np.zeros((nlats, nlons), dtype=np.uint32, order='C')
    for tile in avg_tiles:
        if tile is not None:
            ((tile_min_lat, tile_max_lat,
              tile_min_lon, tile_max_lon), tile_stats) = tile
            tile_data = np.ma.array([[tile_stats[y][x]['avg']
                                      for x in range(len(tile_stats[0]))]
                                     for y in range(len(tile_stats))])
            tile_cnt = np.array([[tile_stats[y][x]['cnt']
                                  for x in range(len(tile_stats[0]))]
                                 for y in range(len(tile_stats))])
            tile_data.mask = ~(tile_cnt.astype(bool))
            y0 = self._lat2ind(tile_min_lat)
            y1 = y0 + tile_data.shape[0] - 1
            x0 = self._lon2ind(tile_min_lon)
            x1 = x0 + tile_data.shape[1] - 1
            if np.any(np.logical_not(tile_data.mask)):
                self.log.debug(
                    'writing tile lat {0}-{1}, lon {2}-{3}, map y {4}-{5}, map x {6}-{7}'.format(
                        tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon,
                        y0, y1, x0, x1))
                a[y0:y1 + 1, x0:x1 + 1] = tile_data
                n[y0:y1 + 1, x0:x1 + 1] = tile_cnt
            else:
                self.log.debug(
                    'All pixels masked in tile lat {0}-{1}, lon {2}-{3}, map y {4}-{5}, map x {6}-{7}'.format(
                        tile_min_lat, tile_max_lat, tile_min_lon, tile_max_lon,
                        y0, y1, x0, x1))

    # Store global map in a NetCDF file.
    self._create_nc_file(a, 'tam.nc', 'val', fill=self._fill)

    # Create dict for JSON response
    results = [[{'avg': a[y, x],
                 'cnt': int(n[y, x]),
                 'lat': self._ind2lat(y),
                 'lon': self._ind2lon(x)}
                for x in range(a.shape[1])]
               for y in range(a.shape[0])]

    return TimeAvgMapSparkResults(results=results, meta={}, computeOptions=computeOptions)
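# --- Illustrative sketch (not part of the original handler) ---
# How the np.linspace / np.repeat combination above carves the query's time
# range into num_time_parts equal slices and assigns them round-robin to the
# repeated tile entries.  build_time_ranges is a hypothetical helper used
# only to demonstrate the shape of the result.
import numpy as np


def build_time_ranges(start_time, end_time, num_time_parts, num_entries):
    # One boundary more than the number of slices; consecutive boundary pairs
    # form the [start, end] range of each slice.
    part_times = np.linspace(start_time, end_time, num_time_parts + 1, dtype=np.int64)
    ranges = [[part_times[i], part_times[i + 1]] for i in range(num_time_parts)]
    # The tile list was repeated num_time_parts times, so every consecutive
    # block of num_time_parts entries cycles once through all time slices.
    return np.repeat([ranges], num_entries // num_time_parts,
                     axis=0).reshape((num_entries, 2))


# Example: 2 tiles x 3 time slices -> 6 (start, end) pairs.
# build_time_ranges(0, 300, 3, 6) yields
#   [[0, 100], [100, 200], [200, 300], [0, 100], [100, 200], [200, 300]]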
def calc(self, computeOptions, **args):
    """
    :param computeOptions: StatsComputeOptions
    :param args: dict
    :return:
    """
    ds = computeOptions.get_dataset()
    if type(ds) != list and type(ds) != tuple:
        ds = (ds,)

    if next(iter([clim for clim in ds if 'CLIM' in clim]), False):
        raise NexusProcessingException(reason="Cannot compute time series on a climatology", code=400)

    resultsRaw = []

    spark_master, spark_nexecs, spark_nparts = computeOptions.get_spark_cfg()
    for shortName in ds:
        results, meta = self.getTimeSeriesStatsForBoxSingleDataSet(
            computeOptions.get_min_lat(),
            computeOptions.get_max_lat(),
            computeOptions.get_min_lon(),
            computeOptions.get_max_lon(),
            shortName,
            computeOptions.get_start_time(),
            computeOptions.get_end_time(),
            computeOptions.get_apply_seasonal_cycle_filter(),
            computeOptions.get_apply_low_pass_filter(),
            spark_master=spark_master,
            spark_nexecs=spark_nexecs,
            spark_nparts=spark_nparts)
        resultsRaw.append([results, meta])

    results = self._mergeResults(resultsRaw)

    if len(ds) == 2:
        try:
            stats = self.calculateComparisonStats(results, suffix="")
        except Exception:
            stats = {}
            tb = traceback.format_exc()
            self.log.warn("Error when calculating comparison stats:\n%s" % tb)

        if computeOptions.get_apply_seasonal_cycle_filter():
            try:
                s = self.calculateComparisonStats(results, suffix="Seasonal")
                stats = self._mergeDicts(stats, s)
            except Exception:
                tb = traceback.format_exc()
                self.log.warn("Error when calculating Seasonal comparison stats:\n%s" % tb)

        if computeOptions.get_apply_low_pass_filter():
            try:
                s = self.calculateComparisonStats(results, suffix="LowPass")
                stats = self._mergeDicts(stats, s)
            except Exception:
                tb = traceback.format_exc()
                self.log.warn("Error when calculating LowPass comparison stats:\n%s" % tb)

        if computeOptions.get_apply_seasonal_cycle_filter() and computeOptions.get_apply_low_pass_filter():
            try:
                s = self.calculateComparisonStats(results, suffix="SeasonalLowPass")
                stats = self._mergeDicts(stats, s)
            except Exception:
                tb = traceback.format_exc()
                self.log.warn("Error when calculating SeasonalLowPass comparison stats:\n%s" % tb)
    else:
        stats = {}

    meta = []
    for singleRes in resultsRaw:
        meta.append(singleRes[1])

    res = TimeSeriesResults(results=results, meta=meta, stats=stats, computeOptions=computeOptions)
    return res
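# --- Illustrative sketch (not part of the original handler) ---
# The comparison-stat blocks above all follow the same pattern: attempt one
# flavor of stats, merge it into the running dict, and log (rather than
# raise) on failure so a single bad variant cannot abort the whole response.
# A condensed, hypothetical version of that pattern, using dict.update in
# place of the handler's _mergeDicts helper:
import logging
import traceback

log = logging.getLogger(__name__)


def merge_comparison_stats(results, calculate, suffixes):
    """Try each stats variant in turn, merging whatever succeeds.

    ``calculate`` stands in for calculateComparisonStats, and ``suffixes`` for
    the ("", "Seasonal", "LowPass", "SeasonalLowPass") variants selected by
    the request's filter flags.
    """
    stats = {}
    for suffix in suffixes:
        try:
            stats.update(calculate(results, suffix=suffix))
        except Exception:
            # Record the failure but keep whatever stats were already merged.
            log.warn("Error when calculating %s comparison stats:\n%s"
                     % (suffix or "plain", traceback.format_exc()))
    return stats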