def getTimeSeriesStatsForBoxSingleDataSet(self, min_lat, max_lat, min_lon, max_lon, ds,
                                          start_time=0, end_time=-1,
                                          applySeasonalFilter=True, applyLowPass=True,
                                          fill=-9999., spark_master="local[1]",
                                          spark_nexecs=1, spark_nparts=1):
    """
    Compute a daily time series (mean/max/min per day) for one dataset over a
    lat/lon box using Spark, apply the requested filters, and dump the mean
    series to a NetCDF file.

    :param min_lat: southern edge of the bounding box (degrees)
    :param max_lat: northern edge of the bounding box (degrees)
    :param min_lon: western edge of the bounding box (degrees)
    :param max_lon: eastern edge of the bounding box (degrees)
    :param ds: dataset short name
    :param start_time: start of the time range, seconds since epoch (0 = open)
    :param end_time: end of the time range, seconds since epoch (-1 = open)
    :param applySeasonalFilter: also compute seasonally-filtered fields
    :param applyLowPass: also compute low-pass-filtered fields
    :param fill: fill value for missing data (threaded into the per-day
        computation and the NetCDF output)
    :param spark_master: unused here; Spark context is taken from self._sc
    :param spark_nexecs: unused here; kept for interface compatibility
    :param spark_nparts: requested number of Spark partitions (capped at the
        number of days found)
    :return: (results, meta) where results is a list of per-day stat dicts
        sorted by time and meta is an empty dict
    :raises NoDataException: when no days fall inside the requested range
    """
    daysinrange = self._tile_service.find_days_in_range_asc(min_lat, max_lat,
                                                            min_lon, max_lon,
                                                            ds, start_time,
                                                            end_time)
    ndays = len(daysinrange)
    if ndays == 0:
        raise NoDataException(reason="No data found for selected timeframe")
    self.log.debug('Found {0} days in range'.format(ndays))
    for i, d in enumerate(daysinrange):
        self.log.debug('{0}, {1}'.format(i, datetime.utcfromtimestamp(d)))

    # Never create more partitions than there are days of work.
    spark_nparts_needed = min(spark_nparts, ndays)
    nexus_tiles_spark = [(min_lat, max_lat, min_lon, max_lon, ds,
                          list(daysinrange_part), fill)
                         for daysinrange_part in np.array_split(
                             daysinrange, spark_nparts_needed)]

    # Launch Spark computations: one task per partition of days.
    rdd = self._sc.parallelize(nexus_tiles_spark, spark_nparts_needed)
    results = rdd.map(TimeSeriesCalculator.calc_average_on_day).collect()
    results = sorted(results, key=lambda entry: entry["time"])

    filt.applyAllFiltersOnField(results, 'mean',
                                applySeasonal=applySeasonalFilter,
                                applyLowPass=applyLowPass)
    filt.applyAllFiltersOnField(results, 'max',
                                applySeasonal=applySeasonalFilter,
                                applyLowPass=applyLowPass)
    filt.applyAllFiltersOnField(results, 'min',
                                applySeasonal=applySeasonalFilter,
                                applyLowPass=applyLowPass)

    # Fix: honor the caller-supplied fill value instead of re-hard-coding -9999.
    self._create_nc_file_time1d(np.array(results), 'ts.nc', 'mean', fill=fill)

    return results, {}
def getTimeSeriesStatsForBoxSingleDataSet(self, min_lat, max_lat, min_lon, max_lon, ds,
                                          start_time=0, end_time=-1,
                                          applySeasonalFilter=True, applyLowPass=True):
    """
    Compute a daily time series (mean/max/min per day) for one dataset over a
    lat/lon box, optionally fanning the per-day work out to a multiprocessing
    pool, then apply the requested filters.

    :param min_lat: southern edge of the bounding box (degrees)
    :param max_lat: northern edge of the bounding box (degrees)
    :param min_lon: western edge of the bounding box (degrees)
    :param max_lon: eastern edge of the bounding box (degrees)
    :param ds: dataset short name
    :param start_time: start of the time range, seconds since epoch (0 = open)
    :param end_time: end of the time range, seconds since epoch (-1 = open)
    :param applySeasonalFilter: also compute seasonally-filtered fields
    :param applyLowPass: also compute low-pass-filtered fields
    :return: (results, meta) where results is a list of per-day stat dicts
        sorted by time and meta is an empty dict
    :raises NoDataException: when no days fall inside the requested range
    :raises NexusProcessingException: when a worker reports an error
    """
    daysinrange = self._tile_service.find_days_in_range_asc(min_lat, max_lat,
                                                            min_lon, max_lon,
                                                            ds, start_time,
                                                            end_time)
    if len(daysinrange) == 0:
        raise NoDataException(reason="No data found for selected timeframe")

    maxprocesses = int(self.algorithm_config.get("multiprocessing", "maxprocesses"))

    results = []
    if maxprocesses == 1:
        # Serial path: compute each day in-process.
        calculator = TimeSeriesCalculator()
        for dayinseconds in daysinrange:
            result = calculator.calc_average_on_day(min_lat, max_lat, min_lon,
                                                    max_lon, ds, dayinseconds)
            results.append(result)
    else:
        # Create a task to calc average difference for each day
        manager = Manager()
        work_queue = manager.Queue()
        done_queue = manager.Queue()
        for dayinseconds in daysinrange:
            work_queue.put(
                ('calc_average_on_day', min_lat, max_lat, min_lon, max_lon, ds,
                 dayinseconds))
        # One sentinel per worker so every worker sees a stop marker.
        # Fix: use range (py2/py3 compatible, consistent with the sibling
        # implementation) instead of py2-only xrange, and plain loops instead
        # of side-effect list comprehensions.
        for _ in range(maxprocesses):
            work_queue.put(SENTINEL)

        # Start new processes to handle the work
        pool = Pool(maxprocesses)
        for _ in range(maxprocesses):
            pool.apply_async(pool_worker, (work_queue, done_queue))
        pool.close()

        # Collect the results as [(day (in ms), average difference for that day)]
        for i in range(len(daysinrange)):
            result = done_queue.get()
            try:
                # EAFP: a worker failure is signaled by an 'error' key.
                error_str = result['error']
                self.log.error(error_str)
                raise NexusProcessingException(reason="Error calculating average by day.")
            except KeyError:
                pass
            results.append(result)

        pool.terminate()
        manager.shutdown()

    results = sorted(results, key=lambda entry: entry["time"])

    filt.applyAllFiltersOnField(results, 'mean',
                                applySeasonal=applySeasonalFilter,
                                applyLowPass=applyLowPass)
    filt.applyAllFiltersOnField(results, 'max',
                                applySeasonal=applySeasonalFilter,
                                applyLowPass=applyLowPass)
    filt.applyAllFiltersOnField(results, 'min',
                                applySeasonal=applySeasonalFilter,
                                applyLowPass=applyLowPass)

    return results, {}
def getTimeSeriesStatsForBoxSingleDataSet(self, bounding_polygon, ds, start_seconds_from_epoch,
                                          end_seconds_from_epoch,
                                          apply_seasonal_cycle_filter=True,
                                          apply_low_pass_filter=True):
    """
    Compute a daily time series (mean/max/min per day) for one dataset inside a
    bounding polygon, optionally using a multiprocessing pool, then apply
    seasonal-cycle and/or low-pass filters to the resulting series.

    :param bounding_polygon: shapely-like polygon; .bounds supplies the box and
        .wkt is passed to the per-day calculator
    :param ds: dataset short name
    :param start_seconds_from_epoch: start of the time range (seconds)
    :param end_seconds_from_epoch: end of the time range (seconds)
    :param apply_seasonal_cycle_filter: subtract per-month climatological
        averages to produce *Seasonal fields
    :param apply_low_pass_filter: apply a low-pass filter to each stat field
    :return: (results, meta) where results is a list of per-day stat dicts
        sorted by time and meta is an empty dict
    :raises NoDataException: when no days fall inside the requested range
    :raises NexusProcessingException: when a worker reports an error
    """
    the_time = datetime.now()
    daysinrange = self._get_tile_service().find_days_in_range_asc(bounding_polygon.bounds[1],
                                                                  bounding_polygon.bounds[3],
                                                                  bounding_polygon.bounds[0],
                                                                  bounding_polygon.bounds[2],
                                                                  ds,
                                                                  start_seconds_from_epoch,
                                                                  end_seconds_from_epoch)
    logger.info("Finding days in range took %s for dataset %s" % (str(datetime.now() - the_time), ds))
    if len(daysinrange) == 0:
        raise NoDataException(reason="No data found for selected timeframe")
    the_time = datetime.now()
    # Worker count comes from the handler's algorithm configuration.
    maxprocesses = int(self.algorithm_config.get("multiprocessing", "maxprocesses"))
    results = []
    if maxprocesses == 1:
        # Serial path: compute each day in-process. Falsy results (e.g. days
        # with no data) are dropped.
        calculator = TimeSeriesCalculator()
        for dayinseconds in daysinrange:
            result = calculator.calc_average_on_day(bounding_polygon.wkt, ds, dayinseconds)
            results += [result] if result else []
    else:
        # Create a task to calc average difference for each day
        manager = Manager()
        work_queue = manager.Queue()
        done_queue = manager.Queue()
        for dayinseconds in daysinrange:
            work_queue.put(
                ('calc_average_on_day', bounding_polygon.wkt, ds, dayinseconds))
        # One sentinel per worker so every worker sees a stop marker.
        [work_queue.put(SENTINEL) for _ in range(0, maxprocesses)]

        # Start new processes to handle the work
        pool = Pool(maxprocesses)
        [pool.apply_async(pool_worker, (work_queue, done_queue)) for _ in range(0, maxprocesses)]
        pool.close()

        # Collect the results as [(day (in ms), average difference for that day)]
        for i in range(0, len(daysinrange)):
            result = done_queue.get()
            try:
                # EAFP: a worker failure is signaled by an 'error' key.
                error_str = result['error']
                logger.error(error_str)
                raise NexusProcessingException(reason="Error calculating average by day.")
            except KeyError:
                pass
            results += [result] if result else []

        pool.terminate()
        manager.shutdown()

    results = sorted(results, key=lambda entry: entry["time"])
    logger.info("Time series calculation took %s for dataset %s" % (str(datetime.now() - the_time), ds))

    if apply_seasonal_cycle_filter:
        the_time = datetime.now()
        # Deseason each day's stats by subtracting that calendar month's
        # climatological average for the same polygon/dataset.
        for result in results:
            month = datetime.utcfromtimestamp(result['time']).month
            month_mean, month_max, month_min = self.calculate_monthly_average(month, bounding_polygon.wkt, ds)
            seasonal_mean = result['mean'] - month_mean
            seasonal_min = result['min'] - month_min
            seasonal_max = result['max'] - month_max
            result['meanSeasonal'] = seasonal_mean
            result['minSeasonal'] = seasonal_min
            result['maxSeasonal'] = seasonal_max
        logger.info(
            "Seasonal calculation took %s for dataset %s" % (str(datetime.now() - the_time), ds))

    the_time = datetime.now()
    # Low-pass (and not seasonal — already handled above) filtering of the raw
    # stat fields.
    filtering.applyAllFiltersOnField(results, 'mean', applySeasonal=False,
                                     applyLowPass=apply_low_pass_filter)
    filtering.applyAllFiltersOnField(results, 'max', applySeasonal=False,
                                     applyLowPass=apply_low_pass_filter)
    filtering.applyAllFiltersOnField(results, 'min', applySeasonal=False,
                                     applyLowPass=apply_low_pass_filter)

    if apply_seasonal_cycle_filter and apply_low_pass_filter:
        try:
            # Additionally low-pass the deseasoned fields, stored under
            # *SeasonalLowPass via the append suffix.
            filtering.applyFiltersOnField(results, 'meanSeasonal', applySeasonal=False,
                                          applyLowPass=True, append="LowPass")
            filtering.applyFiltersOnField(results, 'minSeasonal', applySeasonal=False,
                                          applyLowPass=True, append="LowPass")
            filtering.applyFiltersOnField(results, 'maxSeasonal', applySeasonal=False,
                                          applyLowPass=True, append="LowPass")
        except Exception as e:
            # If it doesn't work log the error but ignore it
            tb = traceback.format_exc()
            logger.warn("Error calculating SeasonalLowPass filter:\n%s" % tb)

    logger.info(
        "LowPass filter calculation took %s for dataset %s" % (str(datetime.now() - the_time), ds))

    return results, {}
def calc(self, request, **args):
    """
    Compute time series statistics for each requested dataset via Spark,
    optionally deseason them against a companion "_clim" climatology dataset,
    apply low-pass filtering, write a NetCDF file per dataset, and merge the
    per-dataset results into a single TimeSeriesResults.

    :param request: StatsComputeOptions
    :param args: dict
    :return: TimeSeriesResults
    :raises NoDataException: when a dataset has no days in the requested range
    :raises NexusProcessingException: when climatology data is missing or
        incomplete for a dataset
    """
    start_time = datetime.now()
    ds, bounding_polygon, start_seconds_from_epoch, end_seconds_from_epoch, \
        apply_seasonal_cycle_filter, apply_low_pass_filter, nparts_requested, \
        normalize_dates = self.parse_arguments(request)
    metrics_record = self._create_metrics_record()

    resultsRaw = []

    for shortName in ds:
        the_time = datetime.now()
        daysinrange = self._get_tile_service().find_days_in_range_asc(
            bounding_polygon.bounds[1],
            bounding_polygon.bounds[3],
            bounding_polygon.bounds[0],
            bounding_polygon.bounds[2],
            shortName,
            start_seconds_from_epoch,
            end_seconds_from_epoch,
            metrics_callback=metrics_record.record_metrics)
        self.log.info("Finding days in range took %s for dataset %s" %
                      (str(datetime.now() - the_time), shortName))

        ndays = len(daysinrange)
        if ndays == 0:
            raise NoDataException(
                reason="No data found for selected timeframe")

        self.log.debug('Found {0} days in range'.format(ndays))
        for i, d in enumerate(daysinrange):
            self.log.debug('{0}, {1}'.format(i, datetime.utcfromtimestamp(d)))
        spark_nparts = self._spark_nparts(nparts_requested)
        self.log.info('Using {} partitions'.format(spark_nparts))

        results, meta = spark_driver(daysinrange,
                                     bounding_polygon,
                                     shortName,
                                     self._tile_service_factory,
                                     metrics_record.record_metrics,
                                     normalize_dates,
                                     spark_nparts=spark_nparts,
                                     sc=self._sc)

        if apply_seasonal_cycle_filter:
            the_time = datetime.now()
            # Deseason against the companion climatology dataset
            # (<shortName>_clim), which must cover a full year.
            shortName_clim = shortName + "_clim"
            daysinrange_clim = self._get_tile_service().find_days_in_range_asc(
                bounding_polygon.bounds[1],
                bounding_polygon.bounds[3],
                bounding_polygon.bounds[0],
                bounding_polygon.bounds[2],
                shortName_clim,
                0,
                SECONDS_IN_ONE_YEAR,
                metrics_callback=metrics_record.record_metrics)
            if len(daysinrange_clim) == 0:
                raise NexusProcessingException(
                    reason="There is no climatology data present for dataset "
                           + shortName + ".")
            results_clim, _ = spark_driver(daysinrange_clim,
                                           bounding_polygon,
                                           shortName_clim,
                                           self._tile_service_factory,
                                           metrics_record.record_metrics,
                                           normalize_dates=False,
                                           spark_nparts=spark_nparts,
                                           sc=self._sc)
            clim_indexed_by_month = {
                datetime.utcfromtimestamp(result['time']).month: result
                for result in results_clim
            }
            if len(clim_indexed_by_month) < 12:
                # Fix: wrap the int in str() — "str" + int raised TypeError
                # instead of the intended NexusProcessingException.
                raise NexusProcessingException(
                    reason="There are only " + str(len(clim_indexed_by_month)) +
                           " months of climatology data for dataset " +
                           shortName +
                           ". A full year of climatology data is required for computing deseasoned timeseries.")

            for result in results:
                month = datetime.utcfromtimestamp(result['time']).month
                result['meanSeasonal'] = result['mean'] - clim_indexed_by_month[month]['mean']
                result['minSeasonal'] = result['min'] - clim_indexed_by_month[month]['min']
                result['maxSeasonal'] = result['max'] - clim_indexed_by_month[month]['max']
            self.log.info("Seasonal calculation took %s for dataset %s" %
                          (str(datetime.now() - the_time), shortName))

        the_time = datetime.now()
        # Low-pass filtering of the raw stat fields (seasonal handled above).
        filtering.applyAllFiltersOnField(results, 'mean', applySeasonal=False,
                                         applyLowPass=apply_low_pass_filter)
        filtering.applyAllFiltersOnField(results, 'max', applySeasonal=False,
                                         applyLowPass=apply_low_pass_filter)
        filtering.applyAllFiltersOnField(results, 'min', applySeasonal=False,
                                         applyLowPass=apply_low_pass_filter)

        if apply_seasonal_cycle_filter and apply_low_pass_filter:
            try:
                # Additionally low-pass the deseasoned fields, stored under
                # *SeasonalLowPass via the append suffix.
                filtering.applyFiltersOnField(results, 'meanSeasonal',
                                              applySeasonal=False,
                                              applyLowPass=True,
                                              append="LowPass")
                filtering.applyFiltersOnField(results, 'minSeasonal',
                                              applySeasonal=False,
                                              applyLowPass=True,
                                              append="LowPass")
                filtering.applyFiltersOnField(results, 'maxSeasonal',
                                              applySeasonal=False,
                                              applyLowPass=True,
                                              append="LowPass")
            except Exception as e:
                # If it doesn't work log the error but ignore it
                tb = traceback.format_exc()
                self.log.warn("Error calculating SeasonalLowPass filter:\n%s" % tb)

        resultsRaw.append([results, meta])
        self.log.info("LowPass filter calculation took %s for dataset %s" %
                      (str(datetime.now() - the_time), shortName))

        the_time = datetime.now()
        self._create_nc_file_time1d(np.array(results), 'ts.nc', 'mean',
                                    fill=-9999.)
        self.log.info("NetCDF generation took %s for dataset %s" %
                      (str(datetime.now() - the_time), shortName))

    the_time = datetime.now()
    results = self._mergeResults(resultsRaw)

    if len(ds) == 2:
        # Exactly two datasets: attempt cross-dataset comparison stats, but a
        # failure here is non-fatal.
        try:
            stats = TimeSeriesSparkHandlerImpl.calculate_comparison_stats(results)
        except Exception:
            stats = {}
            tb = traceback.format_exc()
            self.log.warn("Error when calculating comparison stats:\n%s" % tb)
    else:
        stats = {}

    meta = []
    for singleRes in resultsRaw:
        meta.append(singleRes[1])

    res = TimeSeriesResults(results=results,
                            meta=meta,
                            stats=stats,
                            computeOptions=None,
                            minLat=bounding_polygon.bounds[1],
                            maxLat=bounding_polygon.bounds[3],
                            minLon=bounding_polygon.bounds[0],
                            maxLon=bounding_polygon.bounds[2],
                            ds=ds,
                            startTime=start_seconds_from_epoch,
                            endTime=end_seconds_from_epoch)

    total_duration = (datetime.now() - start_time).total_seconds()
    metrics_record.record_metrics(actual_time=total_duration)
    metrics_record.print_metrics(logger)

    self.log.info("Merging results and calculating comparisons took %s" %
                  (str(datetime.now() - the_time)))
    return res
def calc(self, request, **args):
    """
    Compute time series statistics for each requested dataset via Spark, apply
    seasonal-cycle and low-pass filtering, write a NetCDF file per dataset,
    and merge the per-dataset results into a single TimeSeriesResults.

    :param request: StatsComputeOptions
    :param args: dict
    :return: TimeSeriesResults
    :raises NoDataException: when a dataset has no days in the requested range
    """
    ds, bounding_polygon, start_seconds_from_epoch, end_seconds_from_epoch, \
        apply_seasonal_cycle_filter, apply_low_pass_filter, nparts_requested = self.parse_arguments(request)

    resultsRaw = []

    for shortName in ds:
        the_time = datetime.now()
        daysinrange = self._tile_service.find_days_in_range_asc(
            bounding_polygon.bounds[1],
            bounding_polygon.bounds[3],
            bounding_polygon.bounds[0],
            bounding_polygon.bounds[2],
            shortName,
            start_seconds_from_epoch,
            end_seconds_from_epoch)
        self.log.info("Finding days in range took %s for dataset %s" %
                      (str(datetime.now() - the_time), shortName))

        ndays = len(daysinrange)
        if ndays == 0:
            raise NoDataException(
                reason="No data found for selected timeframe")

        self.log.debug('Found {0} days in range'.format(ndays))
        for i, d in enumerate(daysinrange):
            self.log.debug('{0}, {1}'.format(i, datetime.utcfromtimestamp(d)))
        spark_nparts = self._spark_nparts(nparts_requested)
        self.log.info('Using {} partitions'.format(spark_nparts))

        the_time = datetime.now()
        results, meta = spark_driver(daysinrange,
                                     bounding_polygon,
                                     shortName,
                                     spark_nparts=spark_nparts,
                                     sc=self._sc)
        self.log.info("Time series calculation took %s for dataset %s" %
                      (str(datetime.now() - the_time), shortName))

        if apply_seasonal_cycle_filter:
            the_time = datetime.now()
            # Deseason each day's stats by subtracting that calendar month's
            # climatological average for the same polygon/dataset.
            for result in results:
                month = datetime.utcfromtimestamp(result['time']).month
                month_mean, month_max, month_min = self.calculate_monthly_average(
                    month, bounding_polygon.wkt, shortName)
                seasonal_mean = result['mean'] - month_mean
                seasonal_min = result['min'] - month_min
                seasonal_max = result['max'] - month_max
                result['meanSeasonal'] = seasonal_mean
                result['minSeasonal'] = seasonal_min
                result['maxSeasonal'] = seasonal_max
            self.log.info("Seasonal calculation took %s for dataset %s" %
                          (str(datetime.now() - the_time), shortName))

        the_time = datetime.now()
        # Low-pass filtering of the raw stat fields (seasonal handled above).
        filtering.applyAllFiltersOnField(
            results, 'mean', applySeasonal=False,
            applyLowPass=apply_low_pass_filter)
        filtering.applyAllFiltersOnField(
            results, 'max', applySeasonal=False,
            applyLowPass=apply_low_pass_filter)
        filtering.applyAllFiltersOnField(
            results, 'min', applySeasonal=False,
            applyLowPass=apply_low_pass_filter)

        if apply_seasonal_cycle_filter and apply_low_pass_filter:
            try:
                # Additionally low-pass the deseasoned fields, stored under
                # *SeasonalLowPass via the append suffix.
                filtering.applyFiltersOnField(results, 'meanSeasonal',
                                              applySeasonal=False,
                                              applyLowPass=True,
                                              append="LowPass")
                filtering.applyFiltersOnField(results, 'minSeasonal',
                                              applySeasonal=False,
                                              applyLowPass=True,
                                              append="LowPass")
                filtering.applyFiltersOnField(results, 'maxSeasonal',
                                              applySeasonal=False,
                                              applyLowPass=True,
                                              append="LowPass")
            except Exception as e:
                # If it doesn't work log the error but ignore it
                tb = traceback.format_exc()
                self.log.warn(
                    "Error calculating SeasonalLowPass filter:\n%s" % tb)

        resultsRaw.append([results, meta])
        self.log.info("LowPass filter calculation took %s for dataset %s" %
                      (str(datetime.now() - the_time), shortName))

        the_time = datetime.now()
        self._create_nc_file_time1d(np.array(results), 'ts.nc', 'mean',
                                    fill=-9999.)
        self.log.info("NetCDF generation took %s for dataset %s" %
                      (str(datetime.now() - the_time), shortName))

    the_time = datetime.now()
    results = self._mergeResults(resultsRaw)

    if len(ds) == 2:
        # Exactly two datasets: attempt cross-dataset comparison stats, but a
        # failure here is non-fatal.
        try:
            stats = TimeSeriesHandlerImpl.calculate_comparison_stats(results)
        except Exception:
            stats = {}
            tb = traceback.format_exc()
            self.log.warn("Error when calculating comparison stats:\n%s" % tb)
    else:
        stats = {}

    meta = []
    for singleRes in resultsRaw:
        meta.append(singleRes[1])

    res = TimeSeriesResults(results=results,
                            meta=meta,
                            stats=stats,
                            computeOptions=None,
                            minLat=bounding_polygon.bounds[1],
                            maxLat=bounding_polygon.bounds[3],
                            minLon=bounding_polygon.bounds[0],
                            maxLon=bounding_polygon.bounds[2],
                            ds=ds,
                            startTime=start_seconds_from_epoch,
                            endTime=end_seconds_from_epoch)

    self.log.info("Merging results and calculating comparisons took %s" %
                  (str(datetime.now() - the_time)))
    return res