def test_nested_observe_progress(self):
    """Progress from a nested scope must be folded into the parent scope."""
    observer = MyProgressObserver()
    # toggling the observer off and back on must leave it active
    observer.deactivate()
    observer.activate()
    with observe_progress('computing', 4) as outer:
        # two single-unit steps
        outer.worked(1)
        outer.worked(1)
        # announce the remaining 2 units, then perform them in a nested scope
        outer.will_work(2)
        with observe_progress('loading', 4) as inner:
            inner.worked(3)
            inner.worked(1)
    self.assertEqual([('begin', [('computing', 0.0, False)]),
                      ('update', [('computing', 0.25, False)]),
                      ('update', [('computing', 0.5, False)]),
                      ('begin', [('computing', 0.5, False),
                                 ('loading', 0.0, False)]),
                      ('update', [('computing', 0.875, False),
                                  ('loading', 0.75, False)]),
                      ('update', [('computing', 1.0, False),
                                  ('loading', 1.0, False)]),
                      ('end', [('computing', 1.0, False),
                               ('loading', 1.0, True)]),
                      ('end', [('computing', 1.0, True)])],
                     observer.calls)
def test_nested_observe_progress_with_new_progress_observers(self):
    """new_progress_observers() must route nested progress to its own
    observer only, leaving the outer observer unaware of the nested scope."""
    outer_observer = MyProgressObserver()
    outer_observer.activate()
    inner_observer = MyProgressObserver()
    with observe_progress('computing', 4) as outer:
        outer.worked(1)
        outer.worked(1)
        # the nested scope reports exclusively to inner_observer
        with new_progress_observers(inner_observer):
            with observe_progress('loading', 4) as inner:
                inner.worked(3)
                inner.worked(1)
        outer.worked(2)
    self.assertEqual([('begin', [('computing', 0.0, False)]),
                      ('update', [('computing', 0.25, False)]),
                      ('update', [('computing', 0.5, False)]),
                      ('update', [('computing', 1.0, False)]),
                      ('end', [('computing', 1.0, True)])],
                     outer_observer.calls)
    self.assertEqual([('begin', [('loading', 0.0, False)]),
                      ('update', [('loading', 0.75, False)]),
                      ('update', [('loading', 1.0, False)]),
                      ('end', [('loading', 1.0, True)])],
                     inner_observer.calls)
def test_nested_observe_progress_with_exception(self):
    """An exception raised inside a nested scope must be recorded in the
    'end' events of both the nested and the outer scope as a serialized
    (type-name, message, traceback-lines) tuple."""
    observer = MyProgressObserver(record_errors=True)
    observer.activate()
    try:
        with observe_progress('computing', 10) as reporter:
            # do something that takes 1 unit
            reporter.worked(1)
            # do something that takes 1 unit
            reporter.worked(1)
            # do something that will take 2 units
            reporter.will_work(8)
            with observe_progress('loading', 100) as nested_reported:
                # do something that takes 3 units
                nested_reported.worked(15)
                # now - BANG!
                raise ValueError('Failed to load')
    except ValueError:
        pass
    # begin/update/update/begin/update plus two 'end' events
    self.assertEqual(7, len(observer.calls))
    # the first five events carry no error (error slot is None)
    self.assertEqual([
        ('begin', [('computing', 0.0, False, None)]),
        ('update', [('computing', 0.1, False, None)]),
        ('update', [('computing', 0.2, False, None)]),
        ('begin', [('computing', 0.2, False, None),
                   ('loading', 0.0, False, None)]),
        ('update', [('computing', 0.32, False, None),
                    ('loading', 0.15, False, None)]),
    ], observer.calls[0:-2])
    # second-to-last call: 'end' of the nested 'loading' scope
    self.assertEqual(2, len(observer.calls[-2]))
    event, states = observer.calls[-2]
    self.assertEqual('end', event)
    self.assertEqual(2, len(states))
    self.assertEqual(4, len(states[0]))
    self.assertEqual(4, len(states[1]))
    self.assertEqual(('computing', 0.32, False), states[0][0:-1])
    self.assertEqual(('loading', 0.15, True), states[1][0:-1])
    # outer state carries no error yet; nested state carries the ValueError
    error = states[0][-1]
    self.assertIsNone(error)
    error = states[1][-1]
    self.assertIsInstance(error, tuple)
    exc_type, exc_value, exc_traceback = error
    self.assertEqual('ValueError', exc_type)
    self.assertEqual('Failed to load', exc_value)
    self.assertIsInstance(exc_traceback, list)
    # last call: 'end' of the outer 'computing' scope, also with the error
    self.assertEqual(2, len(observer.calls[-1]))
    event, states = observer.calls[-1]
    self.assertEqual('end', event)
    self.assertEqual(1, len(states))
    self.assertEqual(4, len(states[0]))
    self.assertEqual(('computing', 0.32, True), states[0][0:-1])
    error = states[0][-1]
    self.assertIsInstance(error, tuple)
    exc_type, exc_value, exc_traceback = error
    self.assertEqual('ValueError', exc_type)
    self.assertEqual('Failed to load', exc_value)
    self.assertIsInstance(exc_traceback, list)
def _generate_cube(self, request: CubeGeneratorRequestLike) \
        -> CubeGeneratorResult:
    """
    Submit a cube generation request to the remote service and poll
    until a result is available, relaying remote progress locally.

    :param request: The cube generator request (any normalizable form).
    :return: The cube generator result received from the service.
    """
    request = CubeGeneratorRequest.normalize(request).for_service()
    response = self._submit_gen_request(request)
    cubegen_id, result, _ = \
        self._get_cube_generator_result(response)
    if result is not None:
        # Service answered synchronously; no polling needed.
        return result
    last_worked = 0
    with observe_progress('Generating cube', 100) as cm:
        while True:
            time.sleep(self._progress_period)
            response = requests.get(
                self.endpoint_op(f'cubegens/{cubegen_id}'),
                headers=self.auth_headers)
            _, result, remote_progress = \
                self._get_cube_generator_result(response)
            if result is not None:
                return result
            if remote_progress is not None and len(remote_progress) > 0:
                progress_state = remote_progress[0].state
                total_work = progress_state.total_work
                # Guard: the service may not have reported a total yet;
                # avoid division by zero / None below.
                if not total_work:
                    continue
                fraction_done = progress_state.progress or 0
                worked = fraction_done * total_work
                # Scale the remote increment to our local 100-unit budget.
                work = 100 * ((worked - last_worked) / total_work)
                if work > 0:
                    cm.worked(work)
                    last_worked = worked
def describe_datasets(self) -> Sequence[DatasetDescriptor]:
    """Describe every configured input dataset, reporting one unit of
    progress per input.

    :return: One descriptor per input configuration, in input order.
    """
    results = []
    num_inputs = len(self._input_configs)
    with observe_progress('Fetching dataset information',
                          num_inputs) as reporter:
        for config in self._input_configs:
            results.append(self._describe_dataset(config))
            reporter.worked(1)
    return results
def open_cube(self, input_config: InputConfig) -> TransformedCube:
    """
    Open the dataset described by *input_config* and decode it into a
    (cube, grid_mapping, cube_config) triple, reporting 3 units of progress.

    Opening is done either through a configured data store (resolving the
    opener id if necessary) or through a stand-alone data opener.
    Cube-config parameters accepted by the opener's schema are passed as
    open parameters and then dropped from the returned cube config.

    :param input_config: Input configuration (store id / opener id,
        data id, store and open parameters).
    :return: The transformed cube triple.
    :raises CubeGeneratorError: if the store/opener fails (status 400)
        or if the opened dataset is not a cube.
    """
    cube_config = self._cube_config
    cube_params = cube_config.to_dict()
    opener_id = input_config.opener_id
    store_params = input_config.store_params or {}
    open_params = input_config.open_params or {}
    with observe_progress('reading cube', 3) as observer:
        try:
            if input_config.store_id:
                # Open through a (possibly pooled) data store.
                store_instance = get_data_store_instance(
                    input_config.store_id,
                    store_params=store_params,
                    store_pool=self._store_pool)
                store = store_instance.store
                if opener_id is None:
                    opener_id = self._get_opener_id(input_config, store)
                opener = store
                # Copy before mutating: open_params may alias the config.
                open_params = dict(open_params)
                open_params['opener_id'] = opener_id
            else:
                # No store configured: use a stand-alone opener.
                opener = new_data_opener(opener_id)
                open_params = dict(open_params)
                open_params.update(store_params)
            open_params_schema = opener.get_open_data_params_schema(
                input_config.data_id)
            # Only pass cube-config parameters the opener understands.
            dataset_open_params = {
                k: v
                for k, v in cube_params.items()
                if k in open_params_schema.properties
            }
            observer.worked(1)
            dataset = opener.open_data(input_config.data_id,
                                       **open_params,
                                       **dataset_open_params)
            observer.worked(1)
        except DataStoreError as dse:
            raise CubeGeneratorError(f'{dse}',
                                     status_code=400) from dse
        # Turn dataset into cube and grid_mapping
        try:
            cube, gm, _ = decode_cube(dataset, normalize=True)
        except DatasetIsNotACubeError as e:
            raise CubeGeneratorError(f'{e}') from e
        observer.worked(1)
    if dataset_open_params:
        # Parameters consumed by the opener no longer apply downstream,
        # except for the "steady" ones.
        drop_names = [
            k for k in dataset_open_params.keys()
            if k not in _STEADY_CUBE_CONFIG_NAMES
        ]
        cube_config = cube_config.drop_props(drop_names)
    return cube, gm, cube_config
def combine_cubes(self, t_cubes: Sequence[TransformedCube]) \
        -> TransformedCube:
    """Merge the given transformed cubes into a single cube.

    Grid mapping and cube config are taken from the first input; a
    single input is returned as-is without a merge step.

    :param t_cubes: Non-empty sequence of transformed cubes.
    :return: A single transformed cube.
    """
    first_cube, grid_mapping, _ = t_cubes[0]
    if len(t_cubes) == 1:
        # Nothing to merge.
        return first_cube, grid_mapping, self._cube_config
    datasets = [tc[0] for tc in t_cubes]
    with observe_progress('merging cubes', 1) as reporter:
        merged = xr.merge(datasets)
        reporter.worked(1)
    return merged, grid_mapping, self._cube_config
def resample_and_merge_cubes(cubes: List[xr.Dataset],
                             cube_config: CubeConfig) -> xr.Dataset:
    """Resample each cube onto the common grid given by *cube_config*,
    then merge the results into a single dataset.

    :param cubes: The cubes to resample; must contain at least one.
    :param cube_config: Target cube configuration.
    :return: The merged (or single) resampled dataset.
    """
    # One progress step per cube plus one for the final merge.
    total_steps = len(cubes) + 1
    with observe_progress('Resampling cube(s)', total_steps) as reporter:
        resampled = []
        for dataset in cubes:
            resampled.append(resample_cube(dataset, cube_config))
            reporter.worked(1)
        if len(resampled) > 1:
            result = xr.merge(resampled)
        else:
            result = resampled[0]
        reporter.worked(1)
        return result
def main(gen_config_path: str,
         store_configs_path: str = None,
         verbose: bool = False):
    """
    Generator tool for data cubes.

    Creates cube views from one or more cube stores, resamples them to
    a common grid, optionally performs some cube transformation, and
    writes the resulting cube to some target cube store.

    *gen_config_path* is the cube generator configuration. It may be
    provided as a JSON or YAML file (file extensions ".json" or ".yaml").
    If the *gen_config_path* argument is omitted, it is expected that
    the cube generator configuration is piped as a JSON string.

    *store_configs_path* is a path to a JSON file with data store
    configurations. It is a mapping of names to configured stores.
    Entries are dictionaries that have a mandatory "store_id" property
    which is a name of a registered xcube data store. The optional
    "store_params" property may define data store specific parameters.

    :param gen_config_path: Cube generation configuration. It may be
        provided as a JSON or YAML file (file extensions ".json" or
        ".yaml"). If the REQUEST file argument is omitted, it is
        expected that the cube generator configuration is piped as a
        JSON string.
    :param store_configs_path: A JSON file that maps store names to
        parameterized stores.
    :param verbose: Whether to output progress information to stdout.
    """
    store_pool = DataStorePool.from_file(
        store_configs_path) if store_configs_path else DataStorePool()
    gen_config = GenConfig.from_file(gen_config_path, verbose=verbose)
    # Progress observers: remote callback (if configured) and/or console.
    if gen_config.callback_config:
        ApiProgressCallbackObserver(gen_config.callback_config).activate()
    if verbose:
        ConsoleProgressObserver().activate()
    # Budget 100 units: 10 open, 10 resample/merge, 80 write.
    with observe_progress('Generating cube', 100) as cm:
        cm.will_work(10)
        cubes = open_cubes(gen_config.input_configs,
                           cube_config=gen_config.cube_config,
                           store_pool=store_pool)
        cm.will_work(10)
        cube = resample_and_merge_cubes(cubes,
                                        cube_config=gen_config.cube_config)
        cm.will_work(80)
        data_id = write_cube(cube,
                             output_config=gen_config.output_config,
                             store_pool=store_pool)
    if verbose:
        print('Cube "{}" generated within {:.2f} seconds'.format(
            str(data_id), cm.state.total_time))
def open_cubes(input_configs: Sequence[InputConfig],
               cube_config: CubeConfig,
               store_pool: DataStorePool = None):
    """
    Open one cube per input configuration, reporting one unit of
    progress per opened cube.

    Each input is opened either through a configured data store
    (resolving a cube-capable opener id if none is given) or through a
    stand-alone data opener. Cube-config parameters accepted by the
    opener's schema are forwarded as open parameters.

    :param input_configs: The input configurations.
    :param cube_config: Common cube configuration.
    :param store_pool: Optional pool of configured data stores.
    :return: List of opened cubes, in input order.
    :raises DataStoreError: if a configured store offers no cube opener.
    """
    cubes = []
    all_cube_params = cube_config.to_dict()
    with observe_progress('Opening input(s)', len(input_configs)) as progress:
        for input_config in input_configs:
            open_params = {}
            opener_id = input_config.opener_id
            if input_config.store_id:
                # Open through a (possibly pooled) data store.
                store_instance = get_data_store_instance(
                    input_config.store_id,
                    store_params=input_config.store_params,
                    store_pool=store_pool)
                store = store_instance.store
                if opener_id is None:
                    # Pick the first opener that can produce cubes.
                    opener_ids = store.get_data_opener_ids(
                        data_id=input_config.data_id,
                        type_specifier=TYPE_SPECIFIER_CUBE)
                    if not opener_ids:
                        raise DataStoreError(
                            f'Data store "{input_config.store_id}" does not support data cubes'
                        )
                    opener_id = opener_ids[0]
                opener = store
                open_params.update(opener_id=opener_id,
                                   **input_config.open_params)
            else:
                # No store configured: use a stand-alone opener.
                opener = new_data_opener(opener_id)
                open_params.update(**input_config.store_params,
                                   **input_config.open_params)
            open_params_schema = opener.get_open_data_params_schema(
                input_config.data_id)
            # Only pass cube-config parameters the opener understands.
            cube_params = {
                k: v
                for k, v in all_cube_params.items()
                if k in open_params_schema.properties
            }
            cube = opener.open_data(input_config.data_id,
                                    **open_params,
                                    **cube_params)
            cubes.append(cube)
            progress.worked(1)
    return cubes
def transform_cube(t_cube: TransformedCube,
                   transformer: CubeTransformer,
                   label: str = '') -> TransformedCube:
    """Apply *transformer* to *t_cube* within its own progress scope.

    The step is skipped — while still reporting one unit of progress —
    when the transformer is a ``CubeIdentity`` or the cube is empty; the
    progress label is annotated accordingly.

    :param t_cube: The (cube, grid_mapping, cube_config) triple.
    :param transformer: The transformer to apply.
    :param label: Progress label; defaults to the transformer's class name.
    :return: The (possibly unchanged) transformed cube.
    """
    is_empty = is_empty_cube(t_cube[0])
    is_identity = isinstance(transformer, CubeIdentity)
    if not label:
        label = f'{type(transformer).__name__}'
    if is_identity:
        label += ' (step not applicable)'
    elif is_empty:
        label += ' (step not applicable, empty cube)'
    skip_step = is_identity or is_empty
    with observe_progress(label, 1) as reporter:
        if not skip_step:
            cube, gm, cc = transformer.transform_cube(*t_cube)
            # Remove empty slices introduced by the transformation.
            t_cube = strip_cube(cube), gm, cc
        reporter.worked(1)
    return t_cube
def test_observe_progress(self):
    """A flat progress scope must emit begin, one update per worked()
    call, and a final end event."""
    observer = MyProgressObserver()
    observer.activate()
    with observe_progress('computing', 4) as progress:
        # 1 + 1 + 2 units of work
        progress.worked(1)
        progress.worked(1)
        progress.worked(2)
    # the reporter retains its (timed) state after the scope has ended
    self.assertIsInstance(progress.state, ProgressState)
    self.assertIsInstance(progress.state.total_time, float)
    self.assertTrue(progress.state.total_time >= 0.0)
    self.assertEqual([('begin', [('computing', 0.0, False)]),
                      ('update', [('computing', 0.25, False)]),
                      ('update', [('computing', 0.5, False)]),
                      ('update', [('computing', 1.0, False)]),
                      ('end', [('computing', 1.0, True)])],
                     observer.calls)
def write_cube(cube: xr.Dataset,
               output_config: OutputConfig,
               store_pool: DataStorePool = None) -> str:
    """Write *cube* to the target described by *output_config*.

    Writing goes either through a configured data store (possibly taken
    from *store_pool*) or through a stand-alone data writer.

    :param cube: The dataset to write.
    :param output_config: Target store/writer configuration.
    :param store_pool: Optional pool of configured data stores.
    :return: The data identifier of the written cube.
    """
    with observe_progress('Writing output', 1) as reporter:
        params = {}
        if output_config.store_id:
            instance = get_data_store_instance(
                output_config.store_id,
                store_params=output_config.store_params,
                store_pool=store_pool)
            writer = instance.store
            params.update(writer_id=output_config.writer_id,
                          **output_config.write_params)
        else:
            writer = new_data_writer(output_config.writer_id)
            params.update(**output_config.store_params,
                          **output_config.write_params)
        # TODO: develop an adapter from Dask callback to ProgressObserver
        #  and use it here.
        data_id = writer.write_data(cube,
                                    data_id=output_config.data_id,
                                    replace=output_config.replace or False,
                                    **params)
        reporter.worked(1)
    return data_id
def __generate_cube(self, request: CubeGeneratorRequest) \
        -> CubeGeneratorResult:
    """
    Run the full cube generation pipeline for *request*:
    open -> subset -> resample (t, then xy) per input, then combine,
    rechunk, execute optional user code, post-rechunk, adjust metadata,
    and finally write the cube. Progress is reported against an
    estimated per-step workload.

    :param request: The normalized cube generator request.
    :return: A result with status 'ok' (201) when a cube was written,
        or 'warning' (422) when the generated cube was empty and
        nothing was written.
    """
    cube_config = request.cube_config \
        if request.cube_config is not None else CubeConfig()
    # Pipeline steps:
    opener = CubeOpener(cube_config, store_pool=self._store_pool)
    subsetter = CubeSubsetter()
    resampler_xy = CubeResamplerXY()
    resampler_t = CubeResamplerT()
    combiner = CubesCombiner(cube_config)
    rechunker = CubeRechunker()
    code_config = request.code_config
    if code_config is not None:
        # User code may change chunking, hence the extra rechunk step.
        code_executor = CubeUserCodeExecutor(code_config)
        post_rechunker = CubeRechunker()
    else:
        # No user code: both steps become no-ops.
        code_executor = CubeIdentity()
        post_rechunker = CubeIdentity()
    md_adjuster = CubeMetadataAdjuster()
    cube_writer = CubeWriter(request.output_config,
                             store_pool=self._store_pool)
    num_inputs = len(request.input_configs)
    # Estimated workload:
    opener_work = 10
    resampler_t_work = 1
    resampler_xy_work = 20
    subsetter_work = 1
    combiner_work = num_inputs
    rechunker_work = 1
    executor_work = 1
    post_rechunker_work = 1
    metadata_adjuster_work = 1
    writer_work = 100  # this is where dask processing takes place
    total_work = (opener_work
                  + subsetter_work
                  + resampler_t_work
                  + resampler_xy_work) * num_inputs \
        + combiner_work \
        + rechunker_work \
        + executor_work \
        + post_rechunker_work \
        + metadata_adjuster_work \
        + writer_work
    t_cubes = []
    with observe_progress('Generating cube', total_work) as progress:
        # Per-input phase: open, subset, resample.
        for input_config in request.input_configs:
            progress.will_work(opener_work)
            t_cube = opener.open_cube(input_config)
            progress.will_work(subsetter_work)
            t_cube = transform_cube(t_cube, subsetter, 'subsetting')
            progress.will_work(resampler_t_work)
            t_cube = transform_cube(t_cube, resampler_t,
                                    'resampling in time')
            progress.will_work(resampler_xy_work)
            t_cube = transform_cube(t_cube, resampler_xy,
                                    'resampling in space')
            t_cubes.append(t_cube)
        # Combined phase: merge and post-process.
        progress.will_work(combiner_work)
        t_cube = combiner.combine_cubes(t_cubes)
        progress.will_work(rechunker_work)
        t_cube = transform_cube(t_cube, rechunker, 'rechunking')
        progress.will_work(executor_work)
        t_cube = transform_cube(t_cube, code_executor,
                                'executing user code')
        progress.will_work(post_rechunker_work)
        t_cube = transform_cube(t_cube, post_rechunker,
                                'post-rechunking')
        progress.will_work(metadata_adjuster_work)
        t_cube = transform_cube(t_cube, md_adjuster,
                                'adjusting metadata')
        progress.will_work(writer_work)
        cube, gm, _ = t_cube
        if not is_empty_cube(cube):
            data_id, cube = cube_writer.write_cube(cube, gm)
            self._generated_data_id = data_id
            self._generated_cube = cube
            self._generated_gm = gm
        else:
            # Empty cube: nothing was written.
            self._generated_data_id = None
            self._generated_cube = None
            self._generated_gm = None
        total_time = progress.state.total_time
    if self._generated_data_id is not None:
        return CubeGeneratorResult(status='ok',
                                   status_code=201,
                                   result=CubeReference(data_id=data_id),
                                   message=f'Cube generated successfully'
                                           f' after {total_time:.2f} seconds')
    else:
        return CubeGeneratorResult(
            status='warning',
            status_code=422,
            message=f'An empty cube has been generated'
                    f' after {total_time:.2f} seconds.'
                    f' No data has been written at all.')