def test_coverage_types(self):
    # Make a simple dataset and start ingestion, pretty standard stuff.
    ctd_stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()

    cov = DatasetManagementService._get_coverage(dataset_id=dataset_id)
    self.assertIsInstance(cov, ViewCoverage)

    cov = DatasetManagementService._get_simplex_coverage(dataset_id=dataset_id)
    self.assertIsInstance(cov, SimplexCoverage)
def setUp(self):
    mock_clients = self._create_service_mock('dataset_management')
    self.dataset_management = DatasetManagementService()
    self.dataset_management.clients = mock_clients

    self.mock_rr_create = self.dataset_management.clients.resource_registry.create
    self.mock_rr_read = self.dataset_management.clients.resource_registry.read
    self.mock_rr_update = self.dataset_management.clients.resource_registry.update
    self.mock_rr_delete = self.dataset_management.clients.resource_registry.delete
def fill_temporal_gap(self, dataset_id, gap_coverage_path=None, gap_coverage_id=None):
    if gap_coverage_path is None and gap_coverage_id is None:
        raise ValueError('Must specify either \'gap_coverage_path\' or \'gap_coverage_id\'')
    if gap_coverage_path is None:
        gap_coverage_path = self.get_coverage_path(gap_coverage_id)

    from coverage_model import AbstractCoverage
    gap_cov = AbstractCoverage.load(gap_coverage_path)

    self.pause_ingestion(self.get_stream_id(dataset_id))
    DatasetManagementService._splice_coverage(dataset_id, gap_cov)
def add_granule(self, stream_id, rdt):
    '''
    Appends the granule's data to the coverage and persists it.
    '''
    if stream_id in self._bad_coverages:
        log.info('Message attempting to be inserted into bad coverage: %s',
                 DatasetManagementService._get_coverage_path(self.get_dataset(stream_id)))

    #--------------------------------------------------------------------------------
    # Coverage determination and appending
    #--------------------------------------------------------------------------------
    dataset_id = self.get_dataset(stream_id)
    if not dataset_id:
        log.error('No dataset could be determined on this stream: %s', stream_id)
        return

    try:
        coverage = self.get_coverage(stream_id)
    except IOError as e:
        log.error("Couldn't open coverage: %s",
                  DatasetManagementService._get_coverage_path(self.get_dataset(stream_id)))
        raise CorruptionError(e.message)

    if not coverage:
        log.error('Could not persist coverage from granule, coverage is None')
        return

    #--------------------------------------------------------------------------------
    # Actual persistence
    #--------------------------------------------------------------------------------
    if rdt[rdt.temporal_parameter] is None:
        log.warning("Empty granule received")
        return

    # Parse the RDT and set the values in the coverage
    self.insert_values(coverage, rdt, stream_id)

    # Force the data to be flushed
    DatasetManagementService._save_coverage(coverage)

    self.update_metadata(dataset_id, rdt)

    try:
        window = rdt[rdt.temporal_parameter][[0, -1]]
        window = window.tolist()
    except (ValueError, IndexError):
        window = None
    self.dataset_changed(dataset_id, window)
def test_coverage_ingest(self):
    stream_id = self.pubsub_management.create_stream()
    dataset_id = self.create_dataset()
    # I freaking hate this bug
    self.get_datastore(dataset_id)
    ingestion_config_id = self.get_ingestion_config()
    self.ingestion_management.persist_data_stream(
        stream_id=stream_id,
        ingestion_configuration_id=ingestion_config_id,
        dataset_id=dataset_id)

    black_box = CoverageCraft()
    black_box.rdt["time"] = np.arange(20)
    black_box.rdt["temp"] = np.random.random(20) * 10
    black_box.sync_with_granule()
    granule = black_box.to_granule()

    publisher = SimpleStreamPublisher.new_publisher(self.container, self.exchange_point_name, stream_id)
    publisher.publish(granule)

    self.wait_until_we_have_enough_granules(dataset_id, 1)

    coverage = DatasetManagementService._get_coverage(dataset_id)
    black_box = CoverageCraft(coverage)
    black_box.sync_rdt_with_coverage()
    comp = black_box.rdt["time"] == np.arange(20)
    self.assertTrue(comp.all())

    black_box = CoverageCraft()
    black_box.rdt["time"] = np.arange(20) + 20
    black_box.rdt["temp"] = np.random.random(20) * 10
    black_box.sync_with_granule()
    granule = black_box.to_granule()

    publisher.publish(granule)

    self.wait_until_we_have_enough_granules(dataset_id, 2)

    coverage = DatasetManagementService._get_coverage(dataset_id)
    black_box = CoverageCraft(coverage)
    black_box.sync_rdt_with_coverage()
    comp = black_box.rdt["time"] == np.arange(40)
    self.assertTrue(comp.all())

    granule = self.data_retriever.retrieve(dataset_id)
    black_box = CoverageCraft()
    black_box.sync_rdt_with_granule(granule)
    comp = black_box.rdt["time"] == np.arange(40)
    self.assertTrue(comp.all())
def _get_param_dict_by_name(self, name):
    dict_obj = self.RR2.find_resources_by_name(RT.ParameterDictionary, name)[0]
    parameter_contexts = \
        self.RR2.find_parameter_contexts_of_parameter_dictionary_using_has_parameter_context(dict_obj._id)
    return DatasetManagementService.build_parameter_dictionary(dict_obj, parameter_contexts)
def execute_retrieve(self):
    """
    execute_retrieve Executes a retrieval and returns the result
    as a value in lieu of publishing it on a stream
    """
    coverage = None
    try:
        coverage = DatasetManagementService._get_coverage(self.dataset_id, mode="r")
        if coverage.num_timesteps == 0:
            log.info("Reading from an empty coverage")
            rdt = RecordDictionaryTool(param_dictionary=coverage.parameter_dictionary)
        else:
            rdt = self._coverage_to_granule(coverage=coverage,
                                            start_time=self.start_time,
                                            end_time=self.end_time,
                                            stride_time=self.stride_time,
                                            parameters=self.parameters,
                                            tdoa=self.tdoa)
    except:
        log.exception("Problems reading from the coverage")
        raise BadRequest("Problems reading from the coverage")
    finally:
        # Guard against the case where _get_coverage itself raised and
        # coverage was never assigned
        if coverage is not None:
            coverage.close(timeout=5)
    return rdt.to_granule()
def test_retrieve_cache(self):
    DataRetrieverService._refresh_interval = 1
    datasets = [self.make_simple_dataset() for i in xrange(10)]
    for stream_id, route, stream_def_id, dataset_id in datasets:
        coverage = DatasetManagementService._get_simplex_coverage(dataset_id, mode='a')
        coverage.insert_timesteps(10)
        coverage.set_parameter_values('time', np.arange(10))
        coverage.set_parameter_values('temp', np.arange(10))

    # Verify cache hit and refresh
    dataset_ids = [i[3] for i in datasets]
    self.assertTrue(dataset_ids[0] not in DataRetrieverService._retrieve_cache)
    DataRetrieverService._get_coverage(dataset_ids[0])  # Hit the cache
    cov, age = DataRetrieverService._retrieve_cache[dataset_ids[0]]
    # Verify that it was hit and it's now in there
    self.assertTrue(dataset_ids[0] in DataRetrieverService._retrieve_cache)

    gevent.sleep(DataRetrieverService._refresh_interval + 0.2)

    DataRetrieverService._get_coverage(dataset_ids[0])  # Hit the cache
    cov, age2 = DataRetrieverService._retrieve_cache[dataset_ids[0]]
    self.assertTrue(age2 != age)

    for dataset_id in dataset_ids:
        DataRetrieverService._get_coverage(dataset_id)

    self.assertTrue(dataset_ids[0] not in DataRetrieverService._retrieve_cache)

    stream_id, route, stream_def, dataset_id = datasets[0]
    self.start_ingestion(stream_id, dataset_id)
    DataRetrieverService._get_coverage(dataset_id)
    self.assertTrue(dataset_id in DataRetrieverService._retrieve_cache)
def find_function(self, name):
    res_obj, _ = Container.instance.resource_registry.find_resources(
        name=name, restype=RT.ParameterFunction, id_only=False)
    if res_obj:
        return res_obj[0]._id, DatasetManagementService.get_coverage_function(res_obj[0])
    else:
        raise KeyError('%s was never loaded' % name)
def get_editable_coverage(self, dataset_id):
    sid = self.get_stream_id(dataset_id)

    # Check if we already have the coverage
    if sid in self._paused_streams:
        cov = self._w_covs[sid]
        # If it's not closed, return it
        if not cov.closed:
            return cov
        # Otherwise, remove it from self._w_covs and carry on
        del self._w_covs[sid]

    self.pause_ingestion(sid)
    if not self._context_managed:
        warn_user('Warning: Coverages will remain open until they are closed or go out of scope - '
                  'be sure to close coverage instances when you are finished working with them '
                  'or call self.clean_up(w_covs=True)')
    try:
        self._w_covs[sid] = DatasetManagementService._get_simplex_coverage(dataset_id, mode='w')
        return self._w_covs[sid]
    except:
        self.resume_ingestion(sid)
        raise
def persist_or_timeout(self, stream_id, rdt):
    """ Retry writing the coverage multiple times and eventually time out """
    done = False
    timeout = 2
    start = time.time()
    while not done:
        try:
            self.add_granule(stream_id, rdt)
            done = True
        except:
            log.exception('An issue with coverage, retrying after a bit')
            if (time.time() - start) > MAX_RETRY_TIME:  # After an hour just give up
                dataset_id = self.get_dataset(stream_id)
                log.error("We're giving up, the coverage needs to be inspected %s",
                          DatasetManagementService._get_coverage_path(dataset_id))
                raise

            if stream_id in self._coverages:
                log.info('Popping coverage for stream %s', stream_id)
                self._coverages.pop(stream_id)

            gevent.sleep(timeout)
            if timeout > (60 * 5):
                timeout = 60 * 5
            else:
                timeout *= 2
def delete_dataset(self, agent_instance_id, resource_id):
    res_obj = self.rr.read(resource_id)
    dpms = DataProductManagementServiceProcessClient(process=self)

    # Find data products from device id
    count_ds = 0
    dp_objs, _ = self.rr.find_objects(resource_id, PRED.hasOutputProduct, RT.DataProduct, id_only=False)
    for dp_obj in dp_objs:
        if dpms.is_persisted(dp_obj._id):
            raise BadRequest("DataProduct %s '%s' is currently persisted", dp_obj._id, dp_obj.name)

        ds_objs, _ = self.rr.find_objects(dp_obj._id, PRED.hasDataset, RT.Dataset, id_only=False)
        for ds_obj in ds_objs:
            # Delete coverage
            cov_path = DatasetManagementService._get_coverage_path(ds_obj._id)
            if os.path.exists(cov_path):
                log.info("Removing coverage tree at %s", cov_path)
                shutil.rmtree(cov_path)
            else:
                raise OSError("Coverage path does not exist %s" % cov_path)

            # Delete Dataset and associations
            self.rr.delete(ds_obj._id)
            count_ds += 1

    log.info("Datasets and coverages deleted for device %s '%s': %s", resource_id, res_obj.name, count_ds)
def test_coverage_recovery(self):
    # Create the coverage
    dp_id, stream_id, route, stream_def_id, dataset_id = self.load_data_product()
    self.populate_dataset(dataset_id, 36)
    dset = self.dataset_management.read_dataset(dataset_id)
    dprod = self.dpsc_cli.read_data_product(dp_id)
    cov = DatasetManagementService._get_simplex_coverage(dataset_id)
    cov_pth = cov.persistence_dir
    cov.close()

    # Analyze the valid coverage
    dr = CoverageDoctor(cov_pth, dprod, dset)
    dr_result = dr.analyze()

    # Get original values (mock)
    orig_cov = AbstractCoverage.load(cov_pth)
    time_vals_orig = orig_cov.get_time_values()

    # TODO: Destroy the metadata files
    # TODO: Re-analyze coverage
    # TODO: Should be corrupt, take action to repair if so

    # Repair the metadata files
    dr.repair_metadata()

    # TODO: Re-analyze fixed coverage

    fixed_cov = AbstractCoverage.load(cov_pth)
    self.assertIsInstance(fixed_cov, AbstractCoverage)

    time_vals_fixed = fixed_cov.get_time_values()
    self.assertTrue(np.array_equiv(time_vals_orig, time_vals_fixed))
def test_coverage_recovery(self):
    # Create the coverage
    dp_id, stream_id, route, stream_def_id, dataset_id = self.load_data_product()
    self.populate_dataset(dataset_id, 36)
    dset = self.dataset_management.read_dataset(dataset_id)
    dprod = self.dpsc_cli.read_data_product(dp_id)
    cov = DatasetManagementService._get_simplex_coverage(dataset_id)
    cov_pth = cov.persistence_dir
    cov.close()

    # Analyze the valid coverage
    dr = CoverageDoctor(cov_pth, dprod, dset)
    dr_result = dr.analyze()

    # TODO: Turn these into meaningful Asserts
    self.assertEqual(len(dr_result.get_brick_corruptions()), 0)
    self.assertEqual(len(dr_result.get_brick_size_ratios()), 8)
    self.assertEqual(len(dr_result.get_corruptions()), 0)
    self.assertEqual(len(dr_result.get_master_corruption()), 0)
    self.assertEqual(len(dr_result.get_param_corruptions()), 0)
    self.assertEqual(len(dr_result.get_param_size_ratios()), 64)
    self.assertEqual(len(dr_result.get_master_size_ratio()), 1)
    self.assertEqual(len(dr_result.get_size_ratios()), 73)
    self.assertEqual(dr_result.master_status[1], 'NORMAL')

    self.assertFalse(dr_result.is_corrupt)
    self.assertEqual(dr_result.param_file_count, 64)
    self.assertEqual(dr_result.brick_file_count, 8)
    self.assertEqual(dr_result.total_file_count, 73)

    # Get original values (mock)
    orig_cov = AbstractCoverage.load(cov_pth)
    time_vals_orig = orig_cov.get_time_values()
    orig_cov.close()

    # Corrupt the Master File
    fo = open(cov._persistence_layer.master_manager.file_path, "wb")
    fo.write('Junk')
    fo.close()

    # Corrupt the lon Parameter file
    fo = open(cov._persistence_layer.parameter_metadata['lon'].file_path, "wb")
    fo.write('Junk')
    fo.close()

    corrupt_res = dr.analyze(reanalyze=True)
    self.assertTrue(corrupt_res.is_corrupt)

    # Repair the metadata files
    dr.repair(reanalyze=True)

    fixed_res = dr.analyze(reanalyze=True)
    self.assertFalse(fixed_res.is_corrupt)

    fixed_cov = AbstractCoverage.load(cov_pth)
    self.assertIsInstance(fixed_cov, AbstractCoverage)

    time_vals_fixed = fixed_cov.get_time_values()
    fixed_cov.close()
    self.assertTrue(np.array_equiv(time_vals_orig, time_vals_fixed))
def test_get_dataset_to_xml(self):
    dataset_id = self._make_dataset()
    coverage_path = DatasetManagementService()._get_coverage_path(dataset_id)
    cov = SimplexCoverage.load(coverage_path)

    xml_str = self.rp.get_dataset_xml(coverage_path)
    dom = parseString(xml_str)
    node = dom.getElementsByTagName('addAttributes')

    metadata = node[0]
    for n in metadata.childNodes:
        if n.nodeType != 3:
            if n.attributes["name"].value == "title":
                self.assertEquals(cov.name, n.childNodes[0].nodeValue)
            if n.attributes["name"].value == "institution":
                self.assertEquals('OOI', n.childNodes[0].nodeValue)
            if n.attributes["name"].value == "infoUrl":
                self.assertEquals(self.rp.pydap_url + cov.name, n.childNodes[0].nodeValue)

    parameters = []
    node = dom.getElementsByTagName('sourceName')
    for n in node:
        if n.nodeType != 3:
            parameters.append(str(n.childNodes[0].nodeValue))

    cov_params = [key for key in cov.list_parameters()]
    self.assertEquals(parameters, cov_params)
    cov.close()
def get_pfunc(self, pfid):
    # Preload Case
    if not pfid:
        raise TypeError('No parameter function id specified')
    if pfid.startswith('PFID'):
        if pfid not in self.resource_objs:
            raise KeyError('Function %s was not loaded' % pfid)
        pf = self.resource_objs[pfid]
        func = DatasetManagementService.get_coverage_function(pf)
        return func
    # System Case
    else:
        pf = Container.instance.resource_registry.read(pfid)
        func = DatasetManagementService.get_coverage_function(pf)
        return func
def get_last_granule(cls, container, dataset_id):
    dsm_cli = DatasetManagementServiceClient()
    dataset = dsm_cli.read_dataset(dataset_id)
    cc = container
    datastore_name = dataset.datastore_name
    view_name = dataset.view_name

    datastore = cc.datastore_manager.get_datastore(datastore_name, DataStore.DS_PROFILE.SCIDATA)

    opts = dict(
        start_key=[dataset_id, {}],
        end_key=[dataset_id, 0],
        descending=True,
        limit=1,
        include_docs=True
    )

    results = datastore.query_view(view_name, opts=opts)
    if not results:
        raise NotFound('A granule could not be located.')
    if results[0] is None:
        raise NotFound('A granule could not be located.')
    doc = results[0].get('doc')
    if doc is None:
        return None

    ts = float(doc.get('ts_create', 0))

    coverage = DatasetManagementService._get_coverage(dataset_id)
    rdt = cls._coverage_to_granule(coverage, tdoa=slice(cls.get_relative_time(coverage, ts), None))
    coverage.close(timeout=5)

    return rdt.to_granule()
def test_stream_def_crud(self):
    # Test Creation
    pdict = DatasetManagementService.get_parameter_dictionary_by_name('ctd_parsed_param_dict')
    stream_definition_id = self.pubsub_management.create_stream_definition('ctd parsed', parameter_dictionary_id=pdict.identifier)
    self.addCleanup(self.pubsub_management.delete_stream_definition, stream_definition_id)

    # Make sure there is an assoc
    self.assertTrue(self.resource_registry.find_associations(subject=stream_definition_id, predicate=PRED.hasParameterDictionary, object=pdict.identifier, id_only=True))

    # Test Reading
    stream_definition = self.pubsub_management.read_stream_definition(stream_definition_id)
    self.assertTrue(PubsubManagementService._compare_pdicts(pdict.dump(), stream_definition.parameter_dictionary))

    # Test comparisons
    in_stream_definition_id = self.pubsub_management.create_stream_definition('L0 products', parameter_dictionary_id=pdict.identifier, available_fields=['time', 'temp', 'conductivity', 'pressure'])
    self.addCleanup(self.pubsub_management.delete_stream_definition, in_stream_definition_id)

    out_stream_definition_id = in_stream_definition_id
    self.assertTrue(self.pubsub_management.compare_stream_definition(in_stream_definition_id, out_stream_definition_id))
    self.assertTrue(self.pubsub_management.compatible_stream_definitions(in_stream_definition_id, out_stream_definition_id))

    out_stream_definition_id = self.pubsub_management.create_stream_definition('L2 Products', parameter_dictionary_id=pdict.identifier, available_fields=['time', 'salinity', 'density'])
    self.addCleanup(self.pubsub_management.delete_stream_definition, out_stream_definition_id)
    self.assertFalse(self.pubsub_management.compare_stream_definition(in_stream_definition_id, out_stream_definition_id))
    self.assertTrue(self.pubsub_management.compatible_stream_definitions(in_stream_definition_id, out_stream_definition_id))
def test_ingestion_failover(self):
    stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
    self.start_ingestion(stream_id, dataset_id)

    event = Event()

    def cb(*args, **kwargs):
        event.set()

    sub = EventSubscriber(event_type="ExceptionEvent", callback=cb, origin="stream_exception")
    sub.start()

    self.publish_fake_data(stream_id, route)
    self.wait_until_we_have_enough_granules(dataset_id, 40)

    file_path = DatasetManagementService._get_coverage_path(dataset_id)
    master_file = os.path.join(file_path, '%s_master.hdf5' % dataset_id)

    with open(master_file, 'w') as f:
        f.write('this will crash HDF')

    self.publish_hifi(stream_id, route, 5)

    self.assertTrue(event.wait(10))

    sub.stop()
def get_coverage_path(self, dataset_id):
    pth = DatasetManagementService._get_coverage_path(dataset_id)
    if not os.path.exists(pth):
        raise ValueError('Coverage with id \'{0}\' does not exist!'.format(dataset_id))
    return pth
def get_coverage(cls, data_product_id):
    '''
    Memoization (LRU) of _get_coverage
    '''
    if not data_product_id:
        return
    try:
        result, ts = cls._coverages.pop(data_product_id)
        if (time.time() - ts) > cls.CACHE_EXPIRATION:
            result.close()
            raise KeyError(data_product_id)
    except KeyError:
        if data_product_id is None:
            return None
        resource_registry = Container.instance.resource_registry
        dataset_ids, _ = resource_registry.find_objects(data_product_id, PRED.hasDataset, id_only=True)
        if not dataset_ids:
            return None
        dataset_id = dataset_ids[0]
        result = DatasetManagementService._get_coverage(dataset_id, mode='r')
        if result is None:
            return None
        result.value_caching = False
        ts = time.time()
        if len(cls._coverages) >= cls.CACHE_LIMIT:
            key, value = cls._coverages.popitem(0)
            coverage, ts = value
            coverage.close(timeout=5)
    cls._coverages[dataset_id] = result, ts
    return result
def get_last_granule(cls, container, dataset_id):
    dsm_cli = DatasetManagementServiceClient()
    dataset = dsm_cli.read_dataset(dataset_id)
    cc = container
    datastore_name = dataset.datastore_name
    view_name = dataset.view_name

    datastore = cc.datastore_manager.get_datastore(datastore_name, DataStore.DS_PROFILE.SCIDATA)

    opts = dict(
        start_key=[dataset_id, {}],
        end_key=[dataset_id, 0],
        descending=True,
        limit=1,
        include_docs=True
    )

    results = datastore.query_view(view_name, opts=opts)
    if not results:
        raise NotFound('A granule could not be located.')
    if results[0] is None:
        raise NotFound('A granule could not be located.')
    doc = results[0].get('doc')
    if doc is None:
        return None

    ts = float(doc.get('ts_create', 0))

    coverage = DatasetManagementService._get_coverage(dataset_id)

    black_box = CoverageCraft(coverage)
    black_box.sync_rdt_with_coverage(start_time=ts, end_time=None)
    granule = black_box.to_granule()

    return granule
def get_last_values(cls, dataset_id):
    coverage = DatasetManagementService._get_coverage(dataset_id)

    black_box = CoverageCraft(coverage)
    black_box.sync_rdt_with_coverage(tdoa=slice(-1, None))
    granule = black_box.to_granule()

    return granule
def delete_dataset(self, agent_instance_id, resource_id):
    """Deletes dataset and coverage files for all of a device's data products"""
    res_obj = self.rr.read(resource_id)
    dpms = DataProductManagementServiceProcessClient(process=self)

    # Find data products from device id
    count_ds = 0
    dp_objs, _ = self.rr.find_objects(resource_id, PRED.hasOutputProduct, RT.DataProduct, id_only=False)
    for dp_obj in dp_objs:
        if dpms.is_persisted(dp_obj._id, headers=self._get_system_actor_headers()):
            if self.force:
                log.warn("DataProduct %s '%s' is currently persisted - continuing", dp_obj._id, dp_obj.name)
            else:
                raise BadRequest("DataProduct %s '%s' is currently persisted. Use force=True to ignore", dp_obj._id, dp_obj.name)

        ds_objs, _ = self.rr.find_objects(dp_obj._id, PRED.hasDataset, RT.Dataset, id_only=False)
        for ds_obj in ds_objs:
            # Delete coverage
            cov_path = DatasetManagementService._get_coverage_path(ds_obj._id)
            if os.path.exists(cov_path):
                log.info("Removing coverage tree at %s", cov_path)
                shutil.rmtree(cov_path)
            else:
                log.warn("Coverage path does not exist %s" % cov_path)

            # Delete Dataset and associations
            self.rr.delete(ds_obj._id)
            count_ds += 1

    log.info("Datasets and coverages deleted for device %s '%s': %s", resource_id, res_obj.name, count_ds)
def persist_or_timeout(self, stream_id, rdt):
    '''
    A loop that tries to parse and store a granule for up to five minutes,
    and waits an increasing amount of time each iteration.
    '''
    done = False
    timeout = 2
    start = time.time()
    while not done:
        if self.parse_granule(stream_id, rdt, start, done):
            return  # We're all done, everything worked

        if (time.time() - start) > MAX_RETRY_TIME:  # After a while, give up
            dataset_id = self.get_dataset(stream_id)
            log.error("We're giving up, the coverage needs to be inspected %s",
                      DatasetManagementService._get_coverage_path(dataset_id))
            raise

        if stream_id in self._coverages:
            log.info('Popping coverage for stream %s', stream_id)
            self._coverages.pop(stream_id)

        gevent.sleep(timeout)

        timeout = min(60 * 5, timeout * 2)
def dead_man_timeout(self, stream_id, callback, *args, **kwargs):
    done = False
    timeout = 2
    start = time.time()
    while not done:
        try:
            callback(*args, **kwargs)
            done = True
        except:
            log.exception("An issue with coverage, retrying after a bit")
            if (time.time() - start) > 3600:  # After an hour just give up
                dataset_id = self.get_dataset(stream_id)
                log.error("We're giving up, the coverage needs to be inspected %s",
                          DatasetManagementService._get_coverage_path(dataset_id))
                raise

            if stream_id in self._coverages:
                log.info("Popping coverage for stream %s", stream_id)
                self._coverages.pop(stream_id)

            gevent.sleep(timeout)
            if timeout > (60 * 5):
                timeout = 60 * 5
            else:
                timeout *= 2
def apply_to_dataset(self, dataset, calibration_update):
    cov = DatasetManagementService._get_coverage(dataset, mode='r+')
    try:
        self.set_sparse_values(cov, calibration_update)
        self.publish_calibration_event(dataset, calibration_update.keys())
    finally:
        cov.close()
def get_editable_coverage(self, dataset_id):
    sid = self.get_stream_id(dataset_id)

    if sid in self._paused_streams:
        return self._w_covs[sid]

    self.pause_ingestion(sid)
    self._w_covs[sid] = DatasetManagementService._get_simplex_coverage(dataset_id, mode='w')
    return self._w_covs[sid]
def register_dap_dataset(self, dataset_id, data_product_name=''):
    coverage_path = DatasetManagementService._get_coverage_path(dataset_id)
    try:
        self.add_dataset_to_xml(coverage_path=coverage_path, product_name=data_product_name)
        self.create_symlink(coverage_path, self.pydap_data_path)
    except:  # We don't re-raise to prevent clients from bombing out...
        log.exception('Problem registering dataset')
        log.error('Failed to register dataset for coverage path %s' % coverage_path)
def test_granule(self):
    pdict_id = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict', id_only=True)
    stream_def_id = self.pubsub_management.create_stream_definition('ctd', parameter_dictionary_id=pdict_id, stream_configuration={'reference_designator': "GA03FLMA-RI001-13-CTDMOG999"})
    pdict = DatasetManagementService.get_parameter_dictionary_by_name('ctd_parsed_param_dict')
    self.addCleanup(self.pubsub_management.delete_stream_definition, stream_def_id)

    stream_id, route = self.pubsub_management.create_stream('ctd_stream', 'xp1', stream_definition_id=stream_def_id)
    self.addCleanup(self.pubsub_management.delete_stream, stream_id)

    publisher = StandaloneStreamPublisher(stream_id, route)

    subscriber = StandaloneStreamSubscriber('sub', self.verify_incoming)
    subscriber.start()
    self.addCleanup(subscriber.stop)

    subscription_id = self.pubsub_management.create_subscription('sub', stream_ids=[stream_id])
    self.pubsub_management.activate_subscription(subscription_id)

    rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
    rdt['time'] = np.arange(10)
    rdt['temp'] = np.random.randn(10) * 10 + 30
    rdt['pressure'] = [20] * 10

    self.assertEquals(set(pdict.keys()), set(rdt.fields))
    self.assertEquals(pdict.temporal_parameter_name, rdt.temporal_parameter)
    self.assertEquals(rdt._stream_config['reference_designator'], "GA03FLMA-RI001-13-CTDMOG999")

    self.rdt = rdt
    self.data_producer_id = 'data_producer'
    self.provider_metadata_update = {1: 1}

    publisher.publish(rdt.to_granule(data_producer_id='data_producer', provider_metadata_update={1: 1}))

    self.assertTrue(self.event.wait(10))

    self.pubsub_management.deactivate_subscription(subscription_id)
    self.pubsub_management.delete_subscription(subscription_id)

    rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
    rdt['time'] = np.array([None, None, None])
    self.assertTrue(rdt['time'] is None)

    rdt['time'] = np.array([None, 1, 2])
    self.assertEquals(rdt['time'][0], rdt.fill_value('time'))

    stream_def_obj = self.pubsub_management.read_stream_definition(stream_def_id)
    rdt = RecordDictionaryTool(stream_definition=stream_def_obj)
    rdt['time'] = np.arange(20)
    rdt['temp'] = np.arange(20)

    granule = rdt.to_granule()
    rdt = RecordDictionaryTool.load_from_granule(granule)
    np.testing.assert_array_equal(rdt['time'], np.arange(20))
    np.testing.assert_array_equal(rdt['temp'], np.arange(20))
def _generate_stream_config(self):
    dsm = self.clients.dataset_management
    psm = self.clients.pubsub_management

    agent_obj = self._get_agent()
    device_obj = self._get_device()

    streams_dict = {}
    for stream_cfg in agent_obj.stream_configurations:
        # Create a stream def for each param dict to match against the existing data products
        param_dict_id = dsm.read_parameter_dictionary_by_name(stream_cfg.parameter_dictionary_name, id_only=True)
        stream_def_id = psm.create_stream_definition(parameter_dictionary_id=param_dict_id)
        streams_dict[stream_cfg.stream_name] = {'param_dict_name': stream_cfg.parameter_dictionary_name,
                                                'stream_def_id': stream_def_id,
                                                'records_per_granule': stream_cfg.records_per_granule,
                                                'granule_publish_rate': stream_cfg.granule_publish_rate,
                                                'alarms': stream_cfg.alarms}

    # Retrieve the output products
    device_id = device_obj._id
    data_product_ids = self.RR2.find_data_product_ids_of_instrument_device_using_has_output_product(device_id)

    out_streams = []
    for product_id in data_product_ids:
        stream_id = self.RR2.find_stream_id_of_data_product(product_id)
        out_streams.append(stream_id)

    stream_config = {}

    log.debug("Creating a stream config for each stream (data product) associated with this agent/device")
    for product_stream_id in out_streams:
        # Get the stream route object from pubsub by passing the stream_id
        stream_def_id = self.RR2.find_stream_definition_id_of_stream(product_stream_id)

        # Match the stream defs/param dicts for this model with the data products attached
        # to this device to know which tag to use
        for model_stream_name, stream_info_dict in streams_dict.items():
            if self.clients.pubsub_management.compare_stream_definition(stream_info_dict.get('stream_def_id'), stream_def_id):
                model_param_dict = DatasetManagementService.get_parameter_dictionary_by_name(stream_info_dict.get('param_dict_name'))
                stream_route = self.clients.pubsub_management.read_stream_route(stream_id=product_stream_id)

                stream_config[model_stream_name] = {'routing_key': stream_route.routing_key,
                                                    'stream_id': product_stream_id,
                                                    'stream_definition_ref': stream_def_id,
                                                    'exchange_point': stream_route.exchange_point,
                                                    'parameter_dictionary': model_param_dict.dump(),
                                                    'records_per_granule': stream_info_dict.get('records_per_granule'),
                                                    'granule_publish_rate': stream_info_dict.get('granule_publish_rate'),
                                                    'alarms': stream_info_dict.get('alarms')}

    log.debug("Stream config generated")
    log.trace("generate_stream_config: %s", str(stream_config))
    return stream_config
def test_empty_coverage_time(self):
    stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
    coverage = DatasetManagementService._get_coverage(dataset_id)
    temporal_bounds = self.dataset_management.dataset_temporal_bounds(dataset_id)

    self.assertEquals([coverage.get_parameter_context('time').fill_value] * 2, temporal_bounds)
def test_thorough_gap_analysis(self):
    dataset_id = self.test_ingestion_gap_analysis()
    vcov = DatasetManagementService._get_coverage(dataset_id)

    self.assertIsInstance(vcov, ViewCoverage)
    ccov = vcov.reference_coverage

    self.assertIsInstance(ccov, ComplexCoverage)
    self.assertEquals(len(ccov._reference_covs), 3)
def check_rsn_instrument_data_product(self):
    passing = True
    # For RS03AXBS-MJ03A-06-PRESTA301 (PREST-A) there are a few listed data products
    # Parsed, Engineering
    # SFLPRES-0 SFLPRES-1
    # Check for the two data products and make sure they have the proper parameters
    # SFLPRES-0 should
    data_products, _ = self.RR.find_resources_ext(alt_id_ns='PRE', alt_id='RS03AXBS-MJ03A-06-PRESTA301_SFLPRES_L0_DPID', id_only=True)
    passing &= self.assertTrue(len(data_products) == 1)
    if not data_products:
        return passing

    data_product_id = data_products[0]

    stream_defs, _ = self.RR.find_objects(data_product_id, PRED.hasStreamDefinition, id_only=False)
    passing &= self.assertTrue(len(stream_defs) == 1)
    if not stream_defs:
        return passing

    # Assert that the stream definition has the correct reference designator
    stream_def = stream_defs[0]
    passing &= self.assertEquals(stream_def.stream_configuration['reference_designator'], 'RS03AXBS-MJ03A-06-PRESTA301')

    # Get the pdict and make sure that the parameters corresponding to the available fields
    # begin with the appropriate data product identifier
    pdict_ids, _ = self.RR.find_objects(stream_def, PRED.hasParameterDictionary, id_only=True)
    passing &= self.assertEquals(len(pdict_ids), 1)
    if not pdict_ids:
        return passing

    pdict_id = pdict_ids[0]

    pdict = DatasetManagementService.get_parameter_dictionary(pdict_id)
    available_params = [pdict.get_context(i) for i in pdict.keys() if i in stream_def.available_fields]
    for p in available_params:
        if p.name == 'time':  # Ignore the domain parameter
            continue
        passing &= self.assertTrue(p.ooi_short_name.startswith('SFLPRES'))

    passing &= self.check_presta_instrument_data_products('RS01SLBS-MJ01A-06-PRESTA101')
    passing &= self.check_vel3d_instrument_data_products('RS01SLBS-MJ01A-12-VEL3DB101')
    passing &= self.check_presta_instrument_data_products('RS03AXBS-MJ03A-06-PRESTA301')
    passing &= self.check_vel3d_instrument_data_products('RS03AXBS-MJ03A-12-VEL3DB301')
    passing &= self.check_tempsf_instrument_data_product('RS03ASHS-MJ03B-07-TMPSFA301')
    passing &= self.check_vel3d_instrument_data_products('RS03INT2-MJ03D-12-VEL3DB304')
    passing &= self.check_trhph_instrument_data_products('RS03INT1-MJ03C-10-TRHPHA301')

    self.data_product_management.activate_data_product_persistence(data_product_id)
    dataset_id = self.RR2.find_dataset_id_of_data_product_using_has_dataset(data_product_id)
    granule = self.data_retriever.retrieve(dataset_id)
    rdt = RecordDictionaryTool.load_from_granule(granule)
    self.assert_array_almost_equal(rdt['seafloor_pressure'], [10.2504], 4)
    self.assert_array_almost_equal(rdt['absolute_pressure'], [14.8670], 4)
    self.data_product_management.suspend_data_product_persistence(data_product_id)  # Should do nothing and not raise anything

    return passing
def create_simple_qc_pdict(self):
    types_manager = TypesManager(self.dataset_management, None, None)
    contexts = self.create_simple_qc()
    context_ids = [i[1] for i in contexts.itervalues()]

    context_ids.extend(contexts['temp'][0].qc_contexts)
    for qc_context in contexts['temp'][0].qc_contexts:
        context_ids.extend(types_manager.get_lookup_value_ids(DatasetManagementService.get_parameter_context(qc_context)))
    context_ids.extend(contexts['pressure'][0].qc_contexts)
    for qc_context in contexts['pressure'][0].qc_contexts:
        context_ids.extend(types_manager.get_lookup_value_ids(DatasetManagementService.get_parameter_context(qc_context)))

    context_names = [self.dataset_management.read_parameter_context(i).name for i in context_ids]
    qc_names = [i for i in context_names if i.endswith('_qc')]

    ctxt_id, pc = types_manager.make_propagate_qc(qc_names)
    context_ids.append(ctxt_id)

    pdict_id = self.dataset_management.create_parameter_dictionary('simple_qc', parameter_context_ids=context_ids, temporal_context='time')
    self.addCleanup(self.dataset_management.delete_parameter_dictionary, pdict_id)
    return pdict_id
class DatasetManagementTest(PyonTestCase):
    def setUp(self):
        mock_clients = self._create_service_mock('dataset_management')
        self.dataset_management = DatasetManagementService()
        self.dataset_management.clients = mock_clients

        self.mock_rr_create = self.dataset_management.clients.resource_registry.create
        self.mock_rr_read = self.dataset_management.clients.resource_registry.read
        self.mock_rr_update = self.dataset_management.clients.resource_registry.update
        self.mock_rr_delete = self.dataset_management.clients.resource_registry.delete

    def test_create_dataset(self):
        # mocks
        self.mock_rr_create.return_value = ('dataset_id', 'rev')

        # execution
        dataset_id = self.dataset_management.create_dataset(name='123', stream_id='123', datastore_name='fake_datastore')

        # assertions
        self.assertEquals(dataset_id, 'dataset_id')
        self.assertTrue(self.mock_rr_create.called)

    def test_update_dataset(self):
        # mocks
        mock_dataset = DotDict({'_id': 'dataset_id'})

        # execution
        self.dataset_management.update_dataset(mock_dataset)

        # assertions
        self.mock_rr_update.assert_called_with(mock_dataset)

    def test_delete_dataset(self):
        # execution
        self.dataset_management.delete_dataset('123')

        # assertions
        self.mock_rr_delete.assert_called_with('123')
def test_pfunc_crud(self):
    contexts, funcs = self.create_pfuncs()
    context_ids = [context_id for ctxt, context_id in contexts.itervalues()]

    pdict_id = self.dataset_management.create_parameter_dictionary(name='functional_pdict', parameter_context_ids=context_ids, temporal_context='time')
    self.addCleanup(self.dataset_management.delete_parameter_dictionary, pdict_id)

    expr, expr_id = funcs['CONDWAT_L1']
    func_class = DatasetManagementService.get_parameter_function(expr_id)
    self.assertIsInstance(func_class, NumexprFunction)
def test_get_data_from_FDW(self):
    # Generate a data product and check that the FDW can get data
    ph = ParameterHelper(self.dataset_management, self.addCleanup)
    pdict_id = ph.create_extended_parsed()

    stream_def_id = self.pubsub_management.create_stream_definition('example', parameter_dictionary_id=pdict_id)
    self.addCleanup(self.pubsub_management.delete_stream_definition, stream_def_id)

    dp = DataProduct(name='example')

    data_product_id = self.data_product_management.create_data_product(dp, stream_def_id)
    self.addCleanup(self.data_product_management.delete_data_product, data_product_id)

    self.data_product_management.activate_data_product_persistence(data_product_id)
    self.addCleanup(self.data_product_management.suspend_data_product_persistence, data_product_id)

    dataset_id = self.resource_registry.find_objects(data_product_id, PRED.hasDataset, id_only=True)[0][0]
    monitor = DatasetMonitor(dataset_id)
    self.addCleanup(monitor.stop)

    rdt = ph.get_rdt(stream_def_id)
    ph.fill_rdt(rdt, 100)
    ph.publish_rdt_to_data_product(data_product_id, rdt)
    self.assertTrue(monitor.event.wait(10))

    gevent.sleep(1)  # Yield to other greenlets, had an issue with connectivity

    print "--------------------------------"
    print dataset_id
    coverage_path = DatasetManagementService()._get_coverage_path(dataset_id)
    print coverage_path
    print "--------------------------------"

    # Verify table exists in the DB (similar to above)
    # ....code...

    # Check that the geoserver layer exists as above
    # ... code ....

    # Make a WMS/WFS request...something like this (or both)
    url = self.gs_host + '/geoserver/geonode/ows?service=WFS&version=1.0.0&request=GetFeature&typeName=geonode:ooi_' + dataset_id + '_ooi&maxFeatures=1&outputFormat=csv'
    r = requests.get(url)
    self.assertTrue(r.status_code == 200)
def test_create_dataset_verify_geoserver_layer(self):
    # Generate layer and check that the service created it in geoserver
    ph = ParameterHelper(self.dataset_management, self.addCleanup)
    pdict_id = ph.create_extended_parsed()

    stream_def_id = self.pubsub_management.create_stream_definition('example', parameter_dictionary_id=pdict_id)
    self.addCleanup(self.pubsub_management.delete_stream_definition, stream_def_id)

    dp = DataProduct(name='example')

    data_product_id = self.data_product_management.create_data_product(dp, stream_def_id)
    self.addCleanup(self.data_product_management.delete_data_product, data_product_id)

    self.data_product_management.activate_data_product_persistence(data_product_id)
    self.addCleanup(self.data_product_management.suspend_data_product_persistence, data_product_id)

    dataset_id = self.resource_registry.find_objects(data_product_id, PRED.hasDataset, id_only=True)[0][0]
    monitor = DatasetMonitor(dataset_id)
    self.addCleanup(monitor.stop)

    rdt = ph.get_rdt(stream_def_id)
    ph.fill_rdt(rdt, 100)
    ph.publish_rdt_to_data_product(data_product_id, rdt)
    self.assertTrue(monitor.event.wait(10))

    gevent.sleep(1)  # Yield to other greenlets, had an issue with connectivity

    log.debug("--------------------------------")
    log.debug(dataset_id)
    coverage_path = DatasetManagementService()._get_coverage_path(dataset_id)
    log.debug(coverage_path)
    log.debug("--------------------------------")

    # Verify that the layer exists in geoserver
    try:
        r = requests.get(self.gs_rest_url + '/layers/ooi_' + dataset_id + '_ooi.xml', auth=(self.username, self.PASSWORD))
        self.assertTrue(r.status_code == 200)
    except Exception as e:
        log.error("check service and layer exist...%s", e)
        self.assertTrue(False)
def test_context_crud(self):
    context_ids = self.create_contexts()
    context_id = context_ids.pop()

    ctxt = self.dataset_management.read_parameter_context(context_id)
    context = DatasetManagementService.get_coverage_parameter(ctxt)
    self.assertIsInstance(context, CoverageParameterContext)

    self.dataset_management.delete_parameter_context(context_id)

    with self.assertRaises(NotFound):
        self.dataset_management.read_parameter_context(context_id)
def test_context_crud(self):
    context_ids = self.create_contexts()
    context_id = context_ids.pop()

    context = DatasetManagementService.get_parameter_context(context_id)
    self.assertIsInstance(context, ParameterContext)
    self.assertEquals(context.identifier, context_id)

    self.dataset_management.delete_parameter_context(context_id)

    with self.assertRaises(NotFound):
        self.dataset_management.read_parameter_context(context_id)
def get_last_values(cls, dataset_id, number_of_points):
    coverage = DatasetManagementService._get_coverage(dataset_id, mode='r')
    if coverage.num_timesteps < number_of_points:
        if coverage.num_timesteps == 0:
            rdt = RecordDictionaryTool(param_dictionary=coverage.parameter_dictionary)
            return rdt.to_granule()
        number_of_points = coverage.num_timesteps
    rdt = cls._coverage_to_granule(coverage, tdoa=slice(-number_of_points, None))
    coverage.close(timeout=5)

    return rdt.to_granule()
def gap_coverage(self, stream_id):
    try:
        old_cov = self._coverages.pop(stream_id)
        dataset_id = self.get_dataset(stream_id)
        sdom, tdom = time_series_domain()
        new_cov = DatasetManagementService._create_simplex_coverage(dataset_id, old_cov.parameter_dictionary, sdom, tdom, old_cov._persistence_layer.inline_data_writes)
        old_cov.close()
        result = new_cov
    except KeyError:
        result = self.get_coverage(stream_id)
    self._coverages[stream_id] = result
    return result
def _replay(self):
    coverage = DatasetManagementService._get_coverage(self.dataset_id, mode='r')
    rdt = self._cov2granule(coverage=coverage, start_time=self.start_time, end_time=self.end_time,
                            stride_time=self.stride_time, parameters=self.parameters,
                            stream_def_id=self.stream_def_id)
    elements = len(rdt)

    for i in xrange(elements / self.publish_limit):
        outgoing = RecordDictionaryTool(stream_definition_id=self.stream_def_id)
        fields = self.parameters or outgoing.fields
        for field in fields:
            v = rdt[field]
            if v is not None:
                outgoing[field] = v[(i * self.publish_limit):((i + 1) * self.publish_limit)]
        yield outgoing
    coverage.close(timeout=5)
    return
def test_pfunc_crud(self):
    contexts, funcs = self.create_pfuncs()
    context_ids = [context_id for context_id in contexts.itervalues()]

    pdict_id = self.dataset_management.create_parameter_dictionary(name='functional_pdict', parameter_context_ids=context_ids, temporal_context='time')
    self.addCleanup(self.dataset_management.delete_parameter_dictionary, pdict_id)

    expr_id = funcs['CONDWAT_L1']
    expr = self.dataset_management.read_parameter_function(expr_id)
    func_class = DatasetManagementService.get_coverage_function(expr)
    self.assertIsInstance(func_class, NumexprFunction)
def test_retrieve_cache(self):
    DataRetrieverService._refresh_interval = 1
    datasets = [self.make_simple_dataset() for i in xrange(10)]
    for stream_id, route, stream_def_id, dataset_id in datasets:
        coverage = DatasetManagementService._get_simplex_coverage(dataset_id)
        coverage.insert_timesteps(10)
        coverage.set_parameter_values('time', np.arange(10))
        coverage.set_parameter_values('temp', np.arange(10))

    # Verify cache hit and refresh
    dataset_ids = [i[3] for i in datasets]
    self.assertTrue(dataset_ids[0] not in DataRetrieverService._retrieve_cache)
    DataRetrieverService._get_coverage(dataset_ids[0])  # Hit the cache
    cov, age = DataRetrieverService._retrieve_cache[dataset_ids[0]]
    # Verify that it was hit and it's now in there
    self.assertTrue(dataset_ids[0] in DataRetrieverService._retrieve_cache)

    gevent.sleep(DataRetrieverService._refresh_interval + 0.2)

    DataRetrieverService._get_coverage(dataset_ids[0])  # Hit the cache
    cov, age2 = DataRetrieverService._retrieve_cache[dataset_ids[0]]
    self.assertTrue(age2 != age)

    for dataset_id in dataset_ids:
        DataRetrieverService._get_coverage(dataset_id)

    self.assertTrue(dataset_ids[0] not in DataRetrieverService._retrieve_cache)

    stream_id, route, stream_def, dataset_id = datasets[0]
    self.start_ingestion(stream_id, dataset_id)
    DataRetrieverService._get_coverage(dataset_id)
    self.assertTrue(dataset_id in DataRetrieverService._retrieve_cache)

    DataRetrieverService._refresh_interval = 100
    self.publish_hifi(stream_id, route, 1)
    self.wait_until_we_have_enough_granules(dataset_id, data_size=20)

    event = gevent.event.Event()
    with gevent.Timeout(20):
        while not event.wait(0.1):
            if dataset_id not in DataRetrieverService._retrieve_cache:
                event.set()

    self.assertTrue(event.is_set())