def test_qc_attachment(self): instrument_device = InstrumentDevice(name='whatever') instrument_device_id,_ = self.rrclient.create(instrument_device) self.addCleanup(self.rrclient.delete, instrument_device_id) self.client.register_instrument(instrument_device_id) self.addCleanup(self.client.unregister_instrument, instrument_device_id) dp = DataProduct(name='instrument output') dp_id,_ = self.rrclient.create(dp) self.addCleanup(self.rrclient.delete, dp_id) parser_id = self.make_grt_parser() attachment = Attachment(name='qc ref', attachment_type=AttachmentType.REFERENCE,content=global_range_test_document, context=ReferenceAttachmentContext(parser_id=parser_id)) att_id = self.rrclient.create_attachment(dp_id, attachment) self.addCleanup(self.rrclient.delete_attachment, att_id) attachment2 = Attachment(name='qc ref2', attachment_type=AttachmentType.REFERENCE, content=global_range_test_document2, context=ReferenceAttachmentContext(parser_id=parser_id)) att2_id = self.rrclient.create_attachment(dp_id, attachment2) self.addCleanup(self.rrclient.delete_attachment, att2_id) self.client.assign_data_product(instrument_device_id, dp_id) self.addCleanup(self.client.unassign_data_product, instrument_device_id, dp_id) svm = StoredValueManager(self.container) doc = svm.read_value('grt_CE01ISSM-MF005-01-CTDBPC999_TEMPWAT') np.testing.assert_array_almost_equal(doc['grt_min_value'], -2.)
def fetch_lookup_values(self): doc_keys = [] for lv in self._lookup_values(): context = self.context(lv) if context.document_key: document_key = context.document_key if '$designator' in context.document_key and 'reference_designator' in self._stream_config: document_key = document_key.replace( '$designator', self._stream_config['reference_designator']) doc_keys.append(document_key) lookup_docs = {} if doc_keys: svm = StoredValueManager(Container.instance) doc_list = svm.read_value_mult(doc_keys) lookup_docs = dict(zip(doc_keys, doc_list)) for lv in self._lookup_values(): context = self.context(lv) if context.document_key: document_key = context.document_key if '$designator' in context.document_key and 'reference_designator' in self._stream_config: document_key = document_key.replace( '$designator', self._stream_config['reference_designator']) doc = lookup_docs[document_key] if doc is None: log.debug('Reference Document for %s not found', document_key) continue if context.lookup_value in doc: self[lv] = [doc[context.lookup_value] ] * self._shp[0] if self._shp else doc[ context.lookup_value]
def fetch_lookup_values(self): doc_keys = [] for lv in self._lookup_values(): context = self.context(lv) if context.document_key: document_key = context.document_key if "$designator" in context.document_key and "reference_designator" in self._stream_config: document_key = document_key.replace("$designator", self._stream_config["reference_designator"]) doc_keys.append(document_key) lookup_docs = {} if doc_keys: svm = StoredValueManager(Container.instance) doc_list = svm.read_value_mult(doc_keys) lookup_docs = dict(zip(doc_keys, doc_list)) for lv in self._lookup_values(): context = self.context(lv) if context.document_key: document_key = context.document_key if "$designator" in context.document_key and "reference_designator" in self._stream_config: document_key = document_key.replace("$designator", self._stream_config["reference_designator"]) doc = lookup_docs[document_key] if doc is None: log.debug("Reference Document for %s not found", document_key) continue if context.lookup_value in doc: self[lv] = [doc[context.lookup_value]] * self._shp[0] if self._shp else doc[context.lookup_value]
class StoredValueTransform(TransformStreamListener): ''' Receives granules from a stream and persists the latest value in the object store Background: Platforms publish geospatial information on a separate stream from the instruments. Various data products require geospatial information about the instrument to calculate the variables. This component persists the latest value in a simple data storage container where complex data containers can access it. ''' def on_start(self): TransformStreamListener.on_start(self) self.document_key = self.CFG.get_safe('process.document_key') self.stored_value_manager = StoredValueManager(self.container) def recv_packet(self, msg, route, stream_id): rdt = RecordDictionaryTool.load_from_granule(msg) document = {} for k,v in rdt.iteritems(): value_array = np.atleast_1d(v[:]) if 'f' in value_array.dtype.str: document[k] = float(value_array[-1]) elif 'i' in value_array.dtype.str: document[k] = int(value_array[-1]) self.stored_value_manager.stored_value_cas(self.document_key, document)
class StoredValueTransform(TransformStreamListener): ''' Receives granules from a stream and persists the latest value in the object store Background: Platforms publish geospatial information on a separate stream from the instruments. Various data products require geospatial information about the instrument to calculate the variables. This component persists the latest value in a simple data storage container where complex data containers can access it. ''' def on_start(self): TransformStreamListener.on_start(self) self.document_key = self.CFG.get_safe('process.document_key') self.stored_value_manager = StoredValueManager(self.container) def recv_packet(self, msg, route, stream_id): rdt = RecordDictionaryTool.load_from_granule(msg) document = {} for k, v in rdt.iteritems(): value_array = np.atleast_1d(v[:]) if 'f' in value_array.dtype.str: document[k] = float(value_array[-1]) elif 'i' in value_array.dtype.str: document[k] = int(value_array[-1]) self.stored_value_manager.stored_value_cas(self.document_key, document)
def parse_document(cls, container, parser, document_path): svm = StoredValueManager(container) document = '' with open(document_path,'r') as f: document = f.read() for k,v in parser(document): svm.stored_value_cas(k,v) return
def parse_document(cls, container, parser, document_path): svm = StoredValueManager(container) document = '' with open(document_path, 'r') as f: document = f.read() for k, v in parser(document): svm.stored_value_cas(k, v) return
class TestParsers(IonIntegrationTestCase): def setUp(self): self._start_container() self.svm = StoredValueManager(self.container) @classmethod def parse_document(cls, container, parser, document_path): svm = StoredValueManager(container) document = '' with open(document_path,'r') as f: document = f.read() for k,v in parser(document): svm.stored_value_cas(k,v) return def test_grt_parser(self): self.parse_document(self.container, grt_parser, qc_paths['grt']) ret_doc = self.svm.read_value('grt_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'CONDWAT')) np.testing.assert_almost_equal(ret_doc['grt_min_value'], 0.) np.testing.assert_almost_equal(ret_doc['grt_max_value'], 66000.) def test_spike_parser(self): self.parse_document(self.container, spike_parser, qc_paths['spike']) ret_doc = self.svm.read_value('spike_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'PRACSAL')) np.testing.assert_almost_equal(ret_doc['acc'], 0.005) np.testing.assert_almost_equal(ret_doc['spike_n'], 11.) np.testing.assert_almost_equal(ret_doc['spike_l'], 15.) self.assertRaises(NotFound, self.svm.read_value,'spike_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'CLOWN')) def test_stuck_value_parser(self): self.parse_document(self.container, stuck_value_test_parser, qc_paths['stuck']) ret_doc = self.svm.read_value('svt_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'PRACSAL')) np.testing.assert_almost_equal(ret_doc['svt_resolution'], 1e-9) np.testing.assert_almost_equal(ret_doc['svt_n'], 1000000) def test_trend_value_parser(self): self.parse_document(self.container, trend_parser, qc_paths['trend']) ret_doc = self.svm.read_value('trend_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'PRACSAL')) np.testing.assert_almost_equal(ret_doc['time_interval'], 365.0) np.testing.assert_almost_equal(ret_doc['polynomial_order'], 1.0) np.testing.assert_almost_equal(ret_doc['standard_deviation'], 0.0) def test_lrt_parser(self): self.parse_document(self.container, lrt_parser, qc_paths['lrt']) ret_doc = self.svm.read_value('lrt_%s_%s' %('GP02HYPM-SP001-04-CTDPF0999', 'PRACSAL')) np.testing.assert_array_equal(ret_doc['datlim'][0], np.array([32.289, 32.927])) self.assertEquals(ret_doc['dims'], ['pressure', 'month'])
def test_global_range_test_parser(self): svm = StoredValueManager(self.container) for key,doc in grt_parser(grt_sample_doc): svm.stored_value_cas(key,doc) self.addCleanup(svm.delete_stored_value,key) doc = svm.read_value('grt_sbe37-abc123_TEMPWAT_UHHH') self.assertEquals(doc['grt_min_value'], 0.) self.assertEquals(doc['array'],'array 1') doc = svm.read_value('grt_sbe37-abc123_PRESWAT_flagged') self.assertEquals(doc['grt_max_value'], 689.47)
def fetch_lookup_values(self): for lv in self._lookup_values(): context = self.context(lv) if context.document_key: svm = StoredValueManager(Container.instance) try: doc = svm.read_value(context.document_key) except NotFound: continue if context.lookup_value in doc: self[lv] = doc[context.lookup_value]
def test_stuck_value_test(self): svm = StoredValueManager(self.container) for key, doc in trend_parser(trend_value_test_sample_doc): svm.stored_value_cas(key, doc) self.addCleanup(svm.delete_stored_value, key) doc = svm.read_value("trend_ssxbt-ssn719_PRESSURE") self.assertEquals(doc["time_interval"], 1.0) self.assertEquals(doc["standard_deviation"], 0.0) doc = svm.read_value("trend_ssxbt-ssn719_COND") self.assertEquals(doc["time_interval"], 3.0) self.assertEquals(doc["polynomial_order"], "third")
def test_gradient_test(self): svm = StoredValueManager(self.container) for key,doc in gradient_test_parser(gradient_test_sample_doc): svm.stored_value_cas(key,doc) self.addCleanup(svm.delete_stored_value,key) doc = svm.read_value('grad_CHEL-C-32-12_PRESSURE_DENSITY') self.assertEquals(doc['units_dat'], 'dbar') self.assertEquals(doc['d_dat_dx'], 2.781) doc = svm.read_value('grad_CHEW-BA-C-A12_PRESSURE_DENSITY') self.assertEquals(doc['units_x'], 'kg m-3') self.assertEquals(doc['tol_dat'], 12.)
def test_stuck_value_test(self): svm = StoredValueManager(self.container) for key,doc in stuck_value_test_parser(stuck_value_test_sample_doc): svm.stored_value_cas(key,doc) self.addCleanup(svm.delete_stored_value,key) doc = svm.read_value('svt_ssxbt-ssn719_PRESSURE') self.assertEquals(doc['svt_resolution'], 1.2) self.assertEquals(doc['svt_n'], 10.) doc = svm.read_value('svt_ssxbt-ssn719_COND') self.assertEquals(doc['svt_resolution'], 0.2) self.assertEquals(doc['units'], 'S m-1')
def fetch_lookup_values(self): for lv in self._lookup_values(): context = self.context(lv) if context.document_key: document_key = context.document_key if '$designator' in context.document_key and 'reference_designator' in self._stream_config: document_key = document_key.replace('$designator',self._stream_config['reference_designator']) svm = StoredValueManager(Container.instance) try: doc = svm.read_value(document_key) except NotFound: log.debug('Reference Document for %s not found', document_key) continue if context.lookup_value in doc: self[lv] = [doc[context.lookup_value]] * self._shp[0] if self._shp else doc[context.lookup_value]
class TestParsers(IonIntegrationTestCase): def setUp(self): self._start_container() self.svm = StoredValueManager(self.container) @classmethod def parse_document(cls, container, parser, document_path): svm = StoredValueManager(container) document = '' with open(document_path,'r') as f: document = f.read() for k,v in parser(document): svm.stored_value_cas(k,v) return def test_grt_parser(self): self.parse_document(self.container, grt_parser, qc_paths['grt']) ret_doc = self.svm.read_value('grt_%s_%s' % ('GA01SUMO-RI003-03-CTDMOQ999', 'CONDWAT')) np.testing.assert_almost_equal(ret_doc['grt_min_value'], 0.) np.testing.assert_almost_equal(ret_doc['grt_max_value'], 66000.) def test_spike_parser(self): self.parse_document(self.container, spike_parser, qc_paths['spike']) ret_doc = self.svm.read_value('spike_%s_%s' % ('GA01SUMO-RI003-03-CTDMOQ999', 'PRACSAL')) np.testing.assert_almost_equal(ret_doc['acc'], 0.005) np.testing.assert_almost_equal(ret_doc['spike_n'], 11.) np.testing.assert_almost_equal(ret_doc['spike_l'], 15.) self.assertRaises(NotFound, self.svm.read_value,'spike_%s_%s' % ('GA01SUMO-RI003-03-CTDMOQ999', 'DENSITY')) def test_stuck_value_parser(self): self.parse_document(self.container, stuck_value_test_parser, qc_paths['stuck']) ret_doc = self.svm.read_value('svt_%s_%s' % ('GA01SUMO-RI003-03-CTDMOQ999', 'PRACSAL')) np.testing.assert_almost_equal(ret_doc['svt_resolution'], 0.0001) np.testing.assert_almost_equal(ret_doc['svt_n'], 20.) def test_trend_value_parser(self): self.parse_document(self.container, trend_parser, qc_paths['trend']) ret_doc = self.svm.read_value('trend_%s_%s' % ('RS01SBVM-LJ01A-05-HPIESA101', 'IESPRES')) np.testing.assert_almost_equal(ret_doc['time_interval'], 90.) np.testing.assert_almost_equal(ret_doc['polynomial_order'], 1.0) np.testing.assert_almost_equal(ret_doc['standard_deviation'], 5.0)
def on_start(self): #pragma no cover #-------------------------------------------------------------------------------- # Explicit on_start #-------------------------------------------------------------------------------- # Skip TransformStreamListener and go to StreamProcess to avoid the subscriber being created # We want explicit management of the thread and subscriber object for ingestion TransformStreamProcess.on_start(self) self.queue_name = self.CFG.get_safe('process.queue_name',self.id) self.subscriber = StreamSubscriber(process=self, exchange_name=self.queue_name, callback=self.receive_callback) self.thread_lock = RLock() #-------------------------------------------------------------------------------- # Normal on_start after this point #-------------------------------------------------------------------------------- BaseIngestionWorker.on_start(self) self._rpc_server = self.container.proc_manager._create_listening_endpoint(from_name=self.id, process=self) self.add_endpoint(self._rpc_server) self.event_publisher = EventPublisher(OT.DatasetModified) self.stored_value_manager = StoredValueManager(self.container) self.lookup_docs = self.CFG.get_safe('process.lookup_docs',[]) self.input_product = self.CFG.get_safe('process.input_product','') self.new_lookups = Queue() self.lookup_monitor = EventSubscriber(event_type=OT.ExternalReferencesUpdatedEvent, callback=self._add_lookups, auto_delete=True) self.add_endpoint(self.lookup_monitor) self.connection_id = '' self.connection_index = None self.start_listener()
def on_start(self): TransformDataProcess.on_start(self) self.pubsub_management = PubsubManagementServiceProcessClient( process=self) self.stored_values = StoredValueManager(self.container) self.input_data_product_ids = self.CFG.get_safe( 'process.input_products', []) self.output_data_product_ids = self.CFG.get_safe( 'process.output_products', []) self.lookup_docs = self.CFG.get_safe('process.lookup_docs', []) self.new_lookups = Queue() self.lookup_monitor = EventSubscriber( event_type=OT.ExternalReferencesUpdatedEvent, callback=self._add_lookups, auto_delete=True) self.lookup_monitor.start()
def test_rdt_lookup(self): rdt = self.create_lookup_rdt() self.assertTrue('offset_a' in rdt.lookup_values()) self.assertFalse('offset_b' in rdt.lookup_values()) rdt['time'] = [0] rdt['temp'] = [10.0] rdt['offset_a'] = [2.0] self.assertEquals(rdt['offset_b'], None) self.assertEquals(rdt.lookup_values(), ['offset_a']) np.testing.assert_array_almost_equal(rdt['calibrated'], np.array([12.0])) svm = StoredValueManager(self.container) svm.stored_value_cas('coefficient_document', {'offset_b':2.0}) rdt.fetch_lookup_values() np.testing.assert_array_equal(rdt['offset_b'], np.array([2.0])) np.testing.assert_array_equal(rdt['calibrated_b'], np.array([14.0]))
def setUp(self): DMTestCase.setUp(self) self.ph = ParameterHelper(self.dataset_management, self.addCleanup) pdict_id = self.ph.create_simple_qc_pdict() self.stream_def_id = self.pubsub_management.create_stream_definition('global range', parameter_dictionary_id=pdict_id, stream_configuration={'reference_designator':'QCTEST'}) self.addCleanup(self.pubsub_management.delete_stream_definition, self.stream_def_id) self.rdt = RecordDictionaryTool(stream_definition_id=self.stream_def_id) self.svm = StoredValueManager(self.container)
def on_start(self): TransformDataProcess.on_start(self) self.pubsub_management = PubsubManagementServiceProcessClient(process=self) self.stored_values = StoredValueManager(self.container) self.input_data_product_ids = self.CFG.get_safe('process.input_products', []) self.output_data_product_ids = self.CFG.get_safe('process.output_products', []) self.lookup_docs = self.CFG.get_safe('process.lookup_docs',[]) self.new_lookups = Queue() self.lookup_monitor = EventSubscriber(event_type=OT.ExternalReferencesUpdatedEvent,callback=self._add_lookups, auto_delete=True) self.lookup_monitor.start()
def fetch_lookup_values(self): for lv in self._lookup_values(): context = self.context(lv) if context.document_key: document_key = context.document_key if '$designator' in context.document_key and 'reference_designator' in self._stream_config: document_key = document_key.replace( '$designator', self._stream_config['reference_designator']) svm = StoredValueManager(Container.instance) try: doc = svm.read_value(document_key) except NotFound: log.debug('Reference Document for %s not found', document_key) continue if context.lookup_value in doc: self[lv] = [doc[context.lookup_value] ] * self._shp[0] if self._shp else doc[ context.lookup_value]
def test_global_range_lookup(self): reference_designator = "CE01ISSM-MF005-01-CTDBPC999" ph = ParameterHelper(self.dataset_management, self.addCleanup) pdict_id = ph.create_simple_qc_pdict() svm = StoredValueManager(self.container) doc_key = 'grt_%s_TEMPWAT' % reference_designator svm.stored_value_cas(doc_key, {'grt_min_value':-2, 'grt_max_value':40}) stream_def_id = self.pubsub_management.create_stream_definition('qc parsed', parameter_dictionary_id=pdict_id, stream_configuration={'reference_designator':reference_designator}) self.addCleanup(self.pubsub_management.delete_stream_definition,stream_def_id) rdt = RecordDictionaryTool(stream_definition_id=stream_def_id) rdt['time'] = [0] rdt['temp'] = [20] rdt.fetch_lookup_values() min_field = [i for i in rdt.fields if 'grt_min_value' in i][0] max_field = [i for i in rdt.fields if 'grt_max_value' in i][0] np.testing.assert_array_almost_equal(rdt[min_field], [-2.]) np.testing.assert_array_almost_equal(rdt[max_field], [40.]) np.testing.assert_array_almost_equal(rdt['tempwat_glblrng_qc'],[1])
def populate_qc_tables(self): svm = StoredValueManager(self.container) svm.stored_value_cas('grt_QCTEST_TEMPWAT', { 'grt_min_value': -2., 'grt_max_value': 40. }) svm.stored_value_cas('svt_QCTEST_TEMPWAT', { 'svt_resolution': 0.001, 'svt_n': 4 }) svm.stored_value_cas('spike_QCTEST_TEMPWAT', { 'acc': 0.1, 'spike_n': 5, 'spike_l': 5 })
def on_start(self): #pragma no cover super(ScienceGranuleIngestionWorker,self).on_start() self.event_publisher = EventPublisher(OT.DatasetModified) self.stored_value_manager = StoredValueManager(self.container) self.lookup_docs = self.CFG.get_safe('process.lookup_docs',[]) self.input_product = self.CFG.get_safe('process.input_product','') self.qc_enabled = self.CFG.get_safe('process.qc_enabled', True) self.new_lookups = Queue() self.lookup_monitor = EventSubscriber(event_type=OT.ExternalReferencesUpdatedEvent, callback=self._add_lookups, auto_delete=True) self.lookup_monitor.start() self.qc_publisher = EventPublisher(event_type=OT.ParameterQCEvent) self.connection_id = '' self.connection_index = None
def test_basic_stored_docs(self): doc = {'key':'value'} doc_key = 'doc_id' svm = StoredValueManager(self.container) doc_id, rev = svm.stored_value_cas(doc_key, doc) self.addCleanup(svm.delete_stored_value, doc_key) doc = svm.read_value(doc_key) self.assertTrue('key' in doc and doc['key']=='value') svm.stored_value_cas(doc_key,{'key2':'value2'}) doc = svm.read_value(doc_key) self.assertTrue('key' in doc and doc['key']=='value') self.assertTrue('key2' in doc and doc['key2']=='value2')
def on_start(self): #pragma no cover #-------------------------------------------------------------------------------- # Explicit on_start #-------------------------------------------------------------------------------- # Skip TransformStreamListener and go to StreamProcess to avoid the subscriber being created # We want explicit management of the thread and subscriber object for ingestion TransformStreamProcess.on_start(self) self.queue_name = self.CFG.get_safe('process.queue_name',self.id) self.subscriber = StreamSubscriber(process=self, exchange_name=self.queue_name, callback=self.receive_callback) self.thread_lock = RLock() #-------------------------------------------------------------------------------- # Normal on_start after this point #-------------------------------------------------------------------------------- BaseIngestionWorker.on_start(self) self._rpc_server = self.container.proc_manager._create_listening_endpoint(from_name=self.id, process=self) self.add_endpoint(self._rpc_server) self.event_publisher = EventPublisher(OT.DatasetModified) self.stored_value_manager = StoredValueManager(self.container) self.lookup_docs = self.CFG.get_safe('process.lookup_docs',[]) self.input_product = self.CFG.get_safe('process.input_product','') self.qc_enabled = self.CFG.get_safe('process.qc_enabled', True) self.ignore_gaps = self.CFG.get_safe('service.ingestion.ignore_gaps', True) if not self.ignore_gaps: log.warning("Gap handling is not supported in release 2") self.ignore_gaps = True self.new_lookups = Queue() self.lookup_monitor = EventSubscriber(event_type=OT.ExternalReferencesUpdatedEvent, callback=self._add_lookups, auto_delete=True) self.add_endpoint(self.lookup_monitor) self.qc_publisher = EventPublisher(event_type=OT.ParameterQCEvent) self.connection_id = '' self.connection_index = None self.start_listener()
def test_rdt_lookup(self): rdt = self.create_lookup_rdt() self.assertTrue('offset_a' in rdt.lookup_values()) self.assertFalse('offset_b' in rdt.lookup_values()) rdt['time'] = [0] rdt['temp'] = [10.0] rdt['offset_a'] = [2.0] self.assertEquals(rdt['offset_b'], None) self.assertEquals(rdt.lookup_values(), ['offset_a']) np.testing.assert_array_almost_equal(rdt['calibrated'], np.array([12.0])) svm = StoredValueManager(self.container) svm.stored_value_cas('coefficient_document', {'offset_b':2.0}) svm.stored_value_cas("GA03FLMA-RI001-13-CTDMOG999_OFFSETC", {'offset_c':3.0}) rdt.fetch_lookup_values() np.testing.assert_array_equal(rdt['offset_b'], np.array([2.0])) np.testing.assert_array_equal(rdt['calibrated_b'], np.array([14.0])) np.testing.assert_array_equal(rdt['offset_c'], np.array([3.0]))
class TestQCFunctions(DMTestCase): def setUp(self): DMTestCase.setUp(self) self.ph = ParameterHelper(self.dataset_management, self.addCleanup) pdict_id = self.ph.create_simple_qc_pdict() self.stream_def_id = self.pubsub_management.create_stream_definition('global range', parameter_dictionary_id=pdict_id, stream_configuration={'reference_designator':'QCTEST'}) self.addCleanup(self.pubsub_management.delete_stream_definition, self.stream_def_id) self.rdt = RecordDictionaryTool(stream_definition_id=self.stream_def_id) self.svm = StoredValueManager(self.container) def test_global_range_test(self): self.svm.stored_value_cas('grt_QCTEST_TEMPWAT', {'grt_min_value':10., 'grt_max_value':20.}) self.rdt['time'] = np.arange(8) self.rdt['temp'] = [9, 10, 16, 17, 18, 19, 20, 25] self.rdt.fetch_lookup_values() np.testing.assert_array_almost_equal(self.rdt['tempwat_glblrng_qc'], [0, 1, 1, 1, 1, 1, 1, 0]) def test_spike_test(self): # I know how redundant this sounds self.svm.stored_value_cas('spike_QCTEST_TEMPWAT', {'acc':0.1, 'spike_n':5., 'spike_l':5.}) self.rdt['time'] = np.arange(8) self.rdt['temp'] = [-1, 3, 40, -1, 1, -6, -6, 1] self.rdt.fetch_lookup_values() np.testing.assert_array_almost_equal(self.rdt['tempwat_spketst_qc'], [1, 1, 0, 1, 1, 1, 1, 1]) def test_stuck_value_test(self): self.svm.stored_value_cas('svt_QCTEST_TEMPWAT', {'svt_resolution':0.001, 'svt_n': 4.}) self.rdt['time'] = np.arange(10) self.rdt['temp'] = [4.83, 1.40, 3.33, 3.33, 3.33, 3.33, 4.09, 2.97, 2.85, 3.67] self.rdt.fetch_lookup_values() np.testing.assert_array_almost_equal(self.rdt['tempwat_stuckvl_qc'], [1, 1, 0, 0, 0, 0, 1, 1, 1, 1])
def setUp(self): self._start_container() self.svm = StoredValueManager(self.container)
class TestParsers(IonIntegrationTestCase): def setUp(self): self._start_container() self.svm = StoredValueManager(self.container) @classmethod def parse_document(cls, container, parser, document_path): svm = StoredValueManager(container) document = '' with open(document_path, 'r') as f: document = f.read() for k, v in parser(document): svm.stored_value_cas(k, v) return def test_grt_parser(self): self.parse_document(self.container, grt_parser, qc_paths['grt']) ret_doc = self.svm.read_value( 'grt_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'CONDWAT')) np.testing.assert_almost_equal(ret_doc['grt_min_value'], 0.) np.testing.assert_almost_equal(ret_doc['grt_max_value'], 66000.) def test_spike_parser(self): self.parse_document(self.container, spike_parser, qc_paths['spike']) ret_doc = self.svm.read_value( 'spike_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'PRACSAL')) np.testing.assert_almost_equal(ret_doc['acc'], 0.005) np.testing.assert_almost_equal(ret_doc['spike_n'], 11.) np.testing.assert_almost_equal(ret_doc['spike_l'], 15.) self.assertRaises( NotFound, self.svm.read_value, 'spike_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'CLOWN')) def test_stuck_value_parser(self): self.parse_document(self.container, stuck_value_test_parser, qc_paths['stuck']) ret_doc = self.svm.read_value( 'svt_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'PRACSAL')) np.testing.assert_almost_equal(ret_doc['svt_resolution'], 1e-9) np.testing.assert_almost_equal(ret_doc['svt_n'], 1000000) def test_trend_value_parser(self): self.parse_document(self.container, trend_parser, qc_paths['trend']) ret_doc = self.svm.read_value( 'trend_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'PRACSAL')) np.testing.assert_almost_equal(ret_doc['time_interval'], 365.0) np.testing.assert_almost_equal(ret_doc['polynomial_order'], 1.0) np.testing.assert_almost_equal(ret_doc['standard_deviation'], 0.0) def test_lrt_parser(self): self.parse_document(self.container, lrt_parser, qc_paths['lrt']) ret_doc = self.svm.read_value( 'lrt_%s_%s' % ('GP02HYPM-SP001-04-CTDPF0999', 'PRACSAL')) np.testing.assert_array_equal(ret_doc['datlim'][0], np.array([32.289, 32.927])) self.assertEquals(ret_doc['dims'], ['pressure', 'month'])
def test_lookup_values(self): ph = ParameterHelper(self.dataset_management, self.addCleanup) pdict_id = ph.create_lookups() stream_def_id = self.pubsubcli.create_stream_definition( 'lookup', parameter_dictionary_id=pdict_id) self.addCleanup(self.pubsubcli.delete_stream_definition, stream_def_id) data_product = DataProduct(name='lookup data product') tdom, sdom = time_series_domain() data_product.temporal_domain = tdom.dump() data_product.spatial_domain = sdom.dump() data_product_id = self.dpsc_cli.create_data_product( data_product, stream_definition_id=stream_def_id) self.addCleanup(self.dpsc_cli.delete_data_product, data_product_id) data_producer = DataProducer(name='producer') data_producer.producer_context = DataProcessProducerContext() data_producer.producer_context.configuration['qc_keys'] = [ 'offset_document' ] data_producer_id, _ = self.rrclient.create(data_producer) self.addCleanup(self.rrclient.delete, data_producer_id) assoc, _ = self.rrclient.create_association( subject=data_product_id, object=data_producer_id, predicate=PRED.hasDataProducer) self.addCleanup(self.rrclient.delete_association, assoc) document_keys = self.damsclient.list_qc_references(data_product_id) self.assertEquals(document_keys, ['offset_document']) svm = StoredValueManager(self.container) svm.stored_value_cas('offset_document', {'offset_a': 2.0}) self.dpsc_cli.activate_data_product_persistence(data_product_id) dataset_ids, _ = self.rrclient.find_objects(subject=data_product_id, predicate=PRED.hasDataset, id_only=True) dataset_id = dataset_ids[0] dataset_monitor = DatasetMonitor(dataset_id) self.addCleanup(dataset_monitor.stop) rdt = RecordDictionaryTool(stream_definition_id=stream_def_id) rdt['time'] = [0] rdt['temp'] = [20.] granule = rdt.to_granule() stream_ids, _ = self.rrclient.find_objects(subject=data_product_id, predicate=PRED.hasStream, id_only=True) stream_id = stream_ids[0] route = self.pubsubcli.read_stream_route(stream_id=stream_id) publisher = StandaloneStreamPublisher(stream_id, route) publisher.publish(granule) self.assertTrue(dataset_monitor.event.wait(10)) granule = self.data_retriever.retrieve(dataset_id) rdt2 = RecordDictionaryTool.load_from_granule(granule) np.testing.assert_array_equal(rdt['temp'], rdt2['temp']) np.testing.assert_array_almost_equal(rdt2['calibrated'], np.array([22.0])) svm.stored_value_cas('updated_document', {'offset_a': 3.0}) dataset_monitor = DatasetMonitor(dataset_id) self.addCleanup(dataset_monitor.stop) ep = EventPublisher(event_type=OT.ExternalReferencesUpdatedEvent) ep.publish_event(origin=data_product_id, reference_keys=['updated_document']) rdt = RecordDictionaryTool(stream_definition_id=stream_def_id) rdt['time'] = [1] rdt['temp'] = [20.] granule = rdt.to_granule() gevent.sleep(2) # Yield so that the event goes through publisher.publish(granule) self.assertTrue(dataset_monitor.event.wait(10)) granule = self.data_retriever.retrieve(dataset_id) rdt2 = RecordDictionaryTool.load_from_granule(granule) np.testing.assert_array_equal(rdt2['temp'], np.array([20., 20.])) np.testing.assert_array_almost_equal(rdt2['calibrated'], np.array([22.0, 23.0]))
def setUp(self): DMTestCase.setUp(self) self.ph = ParameterHelper(self.dataset_management, self.addCleanup) self.pdict_id = self.ph.create_simple_qc_pdict() self.svm = StoredValueManager(self.container)
class ScienceGranuleIngestionWorker(TransformStreamListener, BaseIngestionWorker): CACHE_LIMIT = CFG.get_safe('container.ingestion_cache', 5) def __init__(self, *args, **kwargs): TransformStreamListener.__init__(self, *args, **kwargs) BaseIngestionWorker.__init__(self, *args, **kwargs) #-------------------------------------------------------------------------------- # Ingestion Cache # - Datasets # - Coverage instances #-------------------------------------------------------------------------------- self._datasets = collections.OrderedDict() self._coverages = collections.OrderedDict() self._bad_coverages = {} self.time_stats = Accumulator(format='%3f') # unique ID to identify this worker in log msgs self._id = uuid.uuid1() def on_start(self): #pragma no cover #-------------------------------------------------------------------------------- # Explicit on_start #-------------------------------------------------------------------------------- # Skip TransformStreamListener and go to StreamProcess to avoid the subscriber being created # We want explicit management of the thread and subscriber object for ingestion TransformStreamProcess.on_start(self) self.queue_name = self.CFG.get_safe('process.queue_name', self.id) self.subscriber = StreamSubscriber(process=self, exchange_name=self.queue_name, callback=self.receive_callback) self.thread_lock = RLock() #-------------------------------------------------------------------------------- # Normal on_start after this point #-------------------------------------------------------------------------------- BaseIngestionWorker.on_start(self) self._rpc_server = self.container.proc_manager._create_listening_endpoint( from_name=self.id, process=self) self.add_endpoint(self._rpc_server) self.event_publisher = EventPublisher(OT.DatasetModified) self.stored_value_manager = StoredValueManager(self.container) self.lookup_docs = self.CFG.get_safe('process.lookup_docs', []) self.input_product = self.CFG.get_safe('process.input_product', '') self.qc_enabled = self.CFG.get_safe('process.qc_enabled', True) self.ignore_gaps = self.CFG.get_safe('service.ingestion.ignore_gaps', False) self.new_lookups = Queue() self.lookup_monitor = EventSubscriber( event_type=OT.ExternalReferencesUpdatedEvent, callback=self._add_lookups, auto_delete=True) self.add_endpoint(self.lookup_monitor) self.qc_publisher = EventPublisher(event_type=OT.ParameterQCEvent) self.connection_id = '' self.connection_index = None self.start_listener() def on_quit(self): #pragma no cover self.event_publisher.close() self.qc_publisher.close() if self.subscriber_thread: self.stop_listener() for stream, coverage in self._coverages.iteritems(): try: coverage.close(timeout=5) except: log.exception('Problems closing the coverage') self._coverages.clear() TransformStreamListener.on_quit(self) BaseIngestionWorker.on_quit(self) def start_listener(self): # We use a lock here to prevent possible race conditions from starting multiple listeners and coverage clobbering with self.thread_lock: self.subscriber_thread = self._process.thread_manager.spawn( self.subscriber.listen, thread_name='%s-subscriber' % self.id) def stop_listener(self): # Avoid race conditions with coverage operations (Don't start a listener at the same time as closing one) with self.thread_lock: self.subscriber.close() self.subscriber_thread.join(timeout=10) for stream, coverage in self._coverages.iteritems(): try: coverage.close(timeout=5) except: log.exception('Problems closing the coverage') self._coverages.clear() self.subscriber_thread = None 
def pause(self): if self.subscriber_thread is not None: self.stop_listener() def resume(self): if self.subscriber_thread is None: self.start_listener() def _add_lookups(self, event, *args, **kwargs): if event.origin == self.input_product: if isinstance(event.reference_keys, list): self.new_lookups.put(event.reference_keys) def _new_dataset(self, stream_id): ''' Adds a new dataset to the internal cache of the ingestion worker ''' rr_client = ResourceRegistryServiceClient() datasets, _ = rr_client.find_subjects(subject_type=RT.Dataset, predicate=PRED.hasStream, object=stream_id, id_only=True) if datasets: return datasets[0] return None def get_dataset(self, stream_id): ''' Memoization (LRU) of _new_dataset ''' try: result = self._datasets.pop(stream_id) except KeyError: result = self._new_dataset(stream_id) if result is None: return None if len(self._datasets) >= self.CACHE_LIMIT: self._datasets.popitem(0) self._datasets[stream_id] = result return result def get_coverage(self, stream_id): ''' Memoization (LRU) of _get_coverage ''' try: result = self._coverages.pop(stream_id) except KeyError: dataset_id = self.get_dataset(stream_id) if dataset_id is None: return None result = DatasetManagementService._get_simplex_coverage(dataset_id, mode='a') if result is None: return None if len(self._coverages) >= self.CACHE_LIMIT: k, coverage = self._coverages.popitem(0) coverage.close(timeout=5) self._coverages[stream_id] = result return result def gap_coverage(self, stream_id): try: old_cov = self._coverages.pop(stream_id) dataset_id = self.get_dataset(stream_id) sdom, tdom = time_series_domain() new_cov = DatasetManagementService._create_simplex_coverage( dataset_id, old_cov.parameter_dictionary, sdom, tdom, old_cov._persistence_layer.inline_data_writes) old_cov.close() result = new_cov except KeyError: result = self.get_coverage(stream_id) self._coverages[stream_id] = result return result def dataset_changed(self, dataset_id, extents, window): self.event_publisher.publish_event(origin=dataset_id, author=self.id, extents=extents, window=window) def evaluate_qc(self, rdt, dataset_id): if self.qc_enabled: for field in rdt.fields: if not (field.endswith('glblrng_qc') or field.endswith('loclrng_qc')): continue try: values = rdt[field] if values is not None: if not all(values): topology = np.where(values == 0) timestamps = rdt[rdt.temporal_parameter][ topology[0]] self.flag_qc_parameter(dataset_id, field, timestamps.tolist(), {}) except: continue def flag_qc_parameter(self, dataset_id, parameter, temporal_values, configuration): data_product_ids, _ = self.container.resource_registry.find_subjects( object=dataset_id, predicate=PRED.hasDataset, subject_type=RT.DataProduct, id_only=True) for data_product_id in data_product_ids: description = 'Automated Quality Control Alerted on %s' % parameter self.qc_publisher.publish_event(origin=data_product_id, qc_parameter=parameter, temporal_values=temporal_values, configuration=configuration, description=description) def update_connection_index(self, connection_id, connection_index): self.connection_id = connection_id try: connection_index = int(connection_index) self.connection_index = connection_index except ValueError: pass def has_gap(self, connection_id, connection_index): if connection_id: if not self.connection_id: self.update_connection_index(connection_id, connection_index) return False else: if connection_id != self.connection_id: return True if connection_index: if self.connection_index is None: self.update_connection_index(connection_id, connection_index) 
return False try: connection_index = int(connection_index) if connection_index != self.connection_index + 1: return True except ValueError: pass return False def splice_coverage(self, dataset_id, coverage): log.info('Splicing new coverage') DatasetManagementService._splice_coverage(dataset_id, coverage) @handle_stream_exception() def recv_packet(self, msg, stream_route, stream_id): ''' receive packet for ingestion ''' log.trace('received granule for stream %s', stream_id) if msg == {}: log.error('Received empty message from stream: %s', stream_id) return # Message validation if not isinstance(msg, Granule): log.error('Ingestion received a message that is not a granule: %s', msg) return rdt = RecordDictionaryTool.load_from_granule(msg) if rdt is None: log.error('Invalid granule (no RDT) for stream %s', stream_id) return if not len(rdt): log.debug('Empty granule for stream %s', stream_id) return self.persist_or_timeout(stream_id, rdt) def persist_or_timeout(self, stream_id, rdt): """ retry writing coverage multiple times and eventually time out """ done = False timeout = 2 start = time.time() while not done: try: self.add_granule(stream_id, rdt) done = True except: log.exception('An issue with coverage, retrying after a bit') if (time.time() - start) > MAX_RETRY_TIME: # After an hour just give up dataset_id = self.get_dataset(stream_id) log.error( "We're giving up, the coverage needs to be inspected %s", DatasetManagementService._get_coverage_path( dataset_id)) raise if stream_id in self._coverages: log.info('Popping coverage for stream %s', stream_id) self._coverages.pop(stream_id) gevent.sleep(timeout) if timeout > (60 * 5): timeout = 60 * 5 else: timeout *= 2 def expand_coverage(self, coverage, elements, stream_id): try: coverage.insert_timesteps(elements, oob=False) except IOError as e: log.error("Couldn't insert time steps for coverage: %s", coverage.persistence_dir, exc_info=True) try: coverage.close() finally: self._bad_coverages[stream_id] = 1 raise CorruptionError(e.message) def get_stored_values(self, lookup_value): if not self.new_lookups.empty(): new_values = self.new_lookups.get() self.lookup_docs = new_values + self.lookup_docs lookup_value_document_keys = self.lookup_docs for key in lookup_value_document_keys: try: document = self.stored_value_manager.read_value(key) if lookup_value in document: return document[lookup_value] except NotFound: log.warning('Specified lookup document does not exist') return None def fill_lookup_values(self, rdt): rdt.fetch_lookup_values() for field in rdt.lookup_values(): value = self.get_stored_values(rdt.context(field).lookup_value) if value: rdt[field] = value def insert_sparse_values(self, coverage, rdt, stream_id): self.fill_lookup_values(rdt) for field in rdt.fields: if rdt[field] is None: continue if not isinstance( rdt.context(field).param_type, SparseConstantType): # We only set sparse values before insert continue value = rdt[field] try: coverage.set_parameter_values(param_name=field, value=value) except ValueError as e: if "'lower_bound' cannot be >= 'upper_bound'" in e.message: continue else: raise except IOError as e: log.error("Couldn't insert values for coverage: %s", coverage.persistence_dir, exc_info=True) try: coverage.close() finally: self._bad_coverages[stream_id] = 1 raise CorruptionError(e.message) def insert_values(self, coverage, rdt, stream_id): elements = len(rdt) start_index = coverage.num_timesteps - elements for k, v in rdt.iteritems(): if isinstance(v, SparseConstantValue): continue slice_ = slice(start_index, None) 
try: coverage.set_parameter_values(param_name=k, tdoa=slice_, value=v) except IOError as e: log.error("Couldn't insert values for coverage: %s", coverage.persistence_dir, exc_info=True) try: coverage.close() finally: self._bad_coverages[stream_id] = 1 raise CorruptionError(e.message) if 'ingestion_timestamp' in coverage.list_parameters(): t_now = time.time() ntp_time = TimeUtils.ts_to_units( coverage.get_parameter_context('ingestion_timestamp').uom, t_now) coverage.set_parameter_values(param_name='ingestion_timestamp', tdoa=slice_, value=ntp_time) def add_granule(self, stream_id, rdt): ''' Appends the granule's data to the coverage and persists it. ''' debugging = log.isEnabledFor(DEBUG) timer = Timer() if debugging else None if stream_id in self._bad_coverages: log.info( 'Message attempting to be inserted into bad coverage: %s', DatasetManagementService._get_coverage_path( self.get_dataset(stream_id))) #-------------------------------------------------------------------------------- # Gap Analysis #-------------------------------------------------------------------------------- if not self.ignore_gaps: gap_found = self.has_gap(rdt.connection_id, rdt.connection_index) if gap_found: log.error( 'Gap Found! New connection: (%s,%s)\tOld Connection: (%s,%s)', rdt.connection_id, rdt.connection_index, self.connection_id, self.connection_index) self.gap_coverage(stream_id) #-------------------------------------------------------------------------------- # Coverage determiniation and appending #-------------------------------------------------------------------------------- dataset_id = self.get_dataset(stream_id) if not dataset_id: log.error('No dataset could be determined on this stream: %s', stream_id) return try: coverage = self.get_coverage(stream_id) except IOError as e: log.error( "Couldn't open coverage: %s", DatasetManagementService._get_coverage_path( self.get_dataset(stream_id))) raise CorruptionError(e.message) if debugging: path = DatasetManagementService._get_coverage_path(dataset_id) log.debug( '%s: add_granule stream %s dataset %s coverage %r file %s', self._id, stream_id, dataset_id, coverage, path) if not coverage: log.error( 'Could not persist coverage from granule, coverage is None') return #-------------------------------------------------------------------------------- # Actual persistence #-------------------------------------------------------------------------------- elements = len(rdt) if rdt[rdt.temporal_parameter] is None: elements = 0 self.insert_sparse_values(coverage, rdt, stream_id) if debugging: timer.complete_step('checks') # lightweight ops, should be zero self.expand_coverage(coverage, elements, stream_id) if debugging: timer.complete_step('insert') self.insert_values(coverage, rdt, stream_id) if debugging: timer.complete_step('keys') DatasetManagementService._save_coverage(coverage) if debugging: timer.complete_step('save') start_index = coverage.num_timesteps - elements self.dataset_changed(dataset_id, coverage.num_timesteps, (start_index, start_index + elements)) if not self.ignore_gaps and gap_found: self.splice_coverage(dataset_id, coverage) self.evaluate_qc(rdt, dataset_id) if debugging: timer.complete_step('notify') self._add_timing_stats(timer) self.update_connection_index(rdt.connection_id, rdt.connection_index) def _add_timing_stats(self, timer): """ add stats from latest coverage operation to Accumulator and periodically log results """ self.time_stats.add(timer) if self.time_stats.get_count() % REPORT_FREQUENCY > 0: return if log.isEnabledFor(TRACE): 
# report per step for step in 'checks', 'insert', 'keys', 'save', 'notify': log.debug('%s step %s times: %s', self._id, step, self.time_stats.to_string(step)) # report totals log.debug('%s total times: %s', self._id, self.time_stats)
class TestQCFunctions(DMTestCase): def setUp(self): DMTestCase.setUp(self) self.ph = ParameterHelper(self.dataset_management, self.addCleanup) self.pdict_id = self.ph.create_simple_qc_pdict() self.svm = StoredValueManager(self.container) def new_rdt(self,ref='QCTEST'): self.stream_def_id = self.create_stream_definition(uuid4().hex, parameter_dictionary_id=self.pdict_id, stream_configuration={'reference_designator':'QCTEST'}) self.rdt = RecordDictionaryTool(stream_definition_id=self.stream_def_id) def test_qc_functions(self): self.check_global_range() self.check_spike() self.check_stuck_value() self.check_trend() self.check_gradient() self.check_localrange() self.check_propagate() def check_global_range(self): log.info('check_global_range') self.new_rdt() self.svm.stored_value_cas('grt_QCTEST_TEMPWAT', {'grt_min_value':10., 'grt_max_value':20.}) self.rdt['time'] = np.arange(8) self.rdt['temp'] = [9, 10, 16, 17, 18, 19, 20, 25] self.rdt.fetch_lookup_values() np.testing.assert_array_almost_equal(self.rdt['tempwat_glblrng_qc'], [0, 1, 1, 1, 1, 1, 1, 0]) def check_spike(self): log.info('check_spike') self.new_rdt() self.svm.stored_value_cas('spike_QCTEST_TEMPWAT', {'acc':0.1, 'spike_n':5., 'spike_l':5.}) self.rdt['time'] = np.arange(8) self.rdt['temp'] = [-1, 3, 40, -1, 1, -6, -6, 1] self.rdt.fetch_lookup_values() np.testing.assert_array_almost_equal(self.rdt['tempwat_spketst_qc'], [1, 1, 0, 1, 1, 1, 1, 1]) def check_stuck_value(self): log.info('check_stuck_value') self.new_rdt() self.svm.stored_value_cas('svt_QCTEST_TEMPWAT', {'svt_resolution':0.001, 'svt_n': 4.}) self.rdt['time'] = np.arange(10) self.rdt['temp'] = [4.83, 1.40, 3.33, 3.33, 3.33, 3.33, 4.09, 2.97, 2.85, 3.67] self.rdt.fetch_lookup_values() np.testing.assert_array_almost_equal(self.rdt['tempwat_stuckvl_qc'], [1, 1, 0, 0, 0, 0, 1, 1, 1, 1]) def check_trend(self): log.info('check_trend') self.new_rdt() self.svm.stored_value_cas('trend_QCTEST_TEMPWAT', {'time_interval':0, 'polynomial_order': 1, 'standard_deviation': 3}) self.rdt['time'] = np.arange(10) self.rdt['temp'] = [0.8147, 0.9058, 0.1270, 0.9134, 0.6324, 0.0975, 0.2785, 0.5469, 0.9575, 0.9649] self.rdt.fetch_lookup_values() np.testing.assert_array_equal(self.rdt['tempwat_trndtst_qc'], [1] * 10) def check_propagate(self): log.info('check_propagate') self.new_rdt() self.rdt['time'] = np.arange(8) self.rdt['temp'] = [9, 10, 16, 17, 18, 19, 20, 25] self.rdt['tempwat_glblrng_qc'] = [0, 1, 1, 1, 1, 1, 1, 0] self.rdt['tempwat_spketst_qc'] = [0, 1, 1, 1, 1, 1, 1, 0] self.rdt['tempwat_stuckvl_qc'] = [0, 1, 1, 1, 1, 1, 1, 0] self.rdt['tempwat_gradtst_qc'] = [0, 1, 1, 1, 1, 1, 1, 0] self.rdt['tempwat_trndtst_qc'] = [0, 1, 1, 1, 1, 1, 1, 0] self.rdt['tempwat_loclrng_qc'] = [0, 1, 1, 1, 1, 1, 1, 0] self.rdt['preswat_glblrng_qc'] = [0, 1, 1, 1, 1, 1, 1, 0] self.rdt['preswat_spketst_qc'] = [0, 1, 1, 1, 1, 1, 1, 0] self.rdt['preswat_stuckvl_qc'] = [0, 1, 1, 1, 1, 1, 1, 0] self.rdt['preswat_gradtst_qc'] = [0, 1, 1, 1, 1, 1, 1, 0] self.rdt['preswat_trndtst_qc'] = [0, 1, 1, 1, 1, 1, 1, 0] self.rdt['preswat_loclrng_qc'] = [0, 1, 1, 1, 1, 1, 1, 0] np.testing.assert_array_equal(self.rdt['cmbnflg_qc'], [0, 1, 1, 1, 1, 1, 1, 0]) def check_gradient(self): log.info('check_gradient') self.new_rdt() self.svm.stored_value_cas('grad_QCTEST_TEMPWAT_time', {'d_dat_dx': 50, 'min_dx': 0, 'start_dat': 0, 'tol_dat': 5}) self.rdt['time'] = np.arange(5) self.rdt['temp'] = [3, 5, 98, 99, 4] self.rdt.fetch_lookup_values() np.testing.assert_array_equal(self.rdt['tempwat_gradtst_qc'], [1, 1, 0, 0, 1]) def 
check_localrange(self): log.info('check_localrange') self.new_rdt() t = np.array([3580144703.7555027, 3580144704.7555027, 3580144705.7555027, 3580144706.7555027, 3580144707.7555027, 3580144708.7555027, 3580144709.7555027, 3580144710.7555027, 3580144711.7555027, 3580144712.7555027]) pressure = np.random.rand(10) * 2 + 33.0 t_v = ntp_to_month(t) dat = t_v + pressure + np.arange(16,26) def lim1(p,m): return p+m+10 def lim2(p,m): return p+m+20 pressure_grid, month_grid = np.meshgrid(np.arange(0,150,10), np.arange(11)) points = np.column_stack([pressure_grid.flatten(), month_grid.flatten()]) datlim_0 = lim1(points[:,0], points[:,1]) datlim_1 = lim2(points[:,0], points[:,1]) datlim = np.column_stack([datlim_0, datlim_1]) datlimz = points self.svm.stored_value_cas('lrt_QCTEST_TEMPWAT', {'datlim':datlim.tolist(), 'datlimz':datlimz.tolist(), 'dims':['pressure', 'month']}) self.rdt['time'] = t self.rdt['temp'] = dat self.rdt['pressure'] = pressure self.rdt.fetch_lookup_values() np.testing.assert_array_equal(self.rdt['tempwat_loclrng_qc'], [1 ,1 ,1 ,1 ,1 ,0 ,0 ,0 ,0 ,0])
def test_lookup_values_ingest_replay(self): ph = ParameterHelper(self.dataset_management, self.addCleanup) pdict_id = ph.create_lookups() stream_def_id = self.pubsub_management.create_stream_definition( 'lookups', parameter_dictionary_id=pdict_id) self.addCleanup(self.pubsub_management.delete_stream_definition, stream_def_id) stream_id, route = self.pubsub_management.create_stream( 'example', exchange_point=self.exchange_point_name, stream_definition_id=stream_def_id) self.addCleanup(self.pubsub_management.delete_stream, stream_id) ingestion_config_id = self.get_ingestion_config() dataset_id = self.create_dataset(pdict_id) config = DotDict() config.process.lookup_docs = ['test1', 'test2'] self.ingestion_management.persist_data_stream( stream_id=stream_id, ingestion_configuration_id=ingestion_config_id, dataset_id=dataset_id, config=config) self.addCleanup(self.ingestion_management.unpersist_data_stream, stream_id, ingestion_config_id) stored_value_manager = StoredValueManager(self.container) stored_value_manager.stored_value_cas('test1', { 'offset_a': 10.0, 'offset_b': 13.1 }) publisher = StandaloneStreamPublisher(stream_id, route) rdt = RecordDictionaryTool(stream_definition_id=stream_def_id) rdt['time'] = np.arange(20) rdt['temp'] = [20.0] * 20 granule = rdt.to_granule() dataset_monitor = DatasetMonitor(dataset_id) self.addCleanup(dataset_monitor.stop) publisher.publish(granule) self.assertTrue(dataset_monitor.event.wait(30)) replay_granule = self.data_retriever.retrieve(dataset_id) rdt_out = RecordDictionaryTool.load_from_granule(replay_granule) np.testing.assert_array_almost_equal(rdt_out['time'], np.arange(20)) np.testing.assert_array_almost_equal(rdt_out['temp'], np.array([20.] * 20)) np.testing.assert_array_almost_equal(rdt_out['calibrated'], np.array([30.] * 20)) np.testing.assert_array_equal( rdt_out['offset_b'], np.array([rdt_out.fill_value('offset_b')] * 20)) rdt = RecordDictionaryTool(stream_definition_id=stream_def_id) rdt['time'] = np.arange(20, 40) rdt['temp'] = [20.0] * 20 granule = rdt.to_granule() dataset_monitor.event.clear() stored_value_manager.stored_value_cas('test1', {'offset_a': 20.0}) stored_value_manager.stored_value_cas('coefficient_document', {'offset_b': 10.0}) gevent.sleep(2) publisher.publish(granule) self.assertTrue(dataset_monitor.event.wait(30)) replay_granule = self.data_retriever.retrieve(dataset_id) rdt_out = RecordDictionaryTool.load_from_granule(replay_granule) np.testing.assert_array_almost_equal(rdt_out['time'], np.arange(40)) np.testing.assert_array_almost_equal(rdt_out['temp'], np.array([20.] * 20 + [20.] * 20)) np.testing.assert_array_equal(rdt_out['offset_b'], np.array([10.] * 40)) np.testing.assert_array_almost_equal(rdt_out['calibrated'], np.array([30.] * 20 + [40.] * 20)) np.testing.assert_array_almost_equal(rdt_out['calibrated_b'], np.array([40.] * 20 + [50.] * 20))
class TransformPrime(TransformDataProcess): binding=['output'] ''' Transforms which have an incoming stream and an outgoing stream. Parameters: process.stream_id Outgoing stream identifier. process.exchange_point Route's exchange point. process.routing_key Route's routing key. process.queue_name Name of the queue to listen on. process.routes streams,actor for each route {(stream_input_id, stream_output_id):actor} Either the stream_id or both the exchange_point and routing_key need to be provided. ''' def on_start(self): TransformDataProcess.on_start(self) self.pubsub_management = PubsubManagementServiceProcessClient(process=self) self.stored_values = StoredValueManager(self.container) self.input_data_product_ids = self.CFG.get_safe('process.input_products', []) self.output_data_product_ids = self.CFG.get_safe('process.output_products', []) self.lookup_docs = self.CFG.get_safe('process.lookup_docs',[]) self.new_lookups = Queue() self.lookup_monitor = EventSubscriber(event_type=OT.ExternalReferencesUpdatedEvent,callback=self._add_lookups, auto_delete=True) self.lookup_monitor.start() def on_quit(self): self.lookup_monitor.stop() TransformDataProcess.on_quit(self) def _add_lookups(self, event, *args, **kwargs): if event.origin in self.input_data_product_ids + self.output_data_product_ids: if isinstance(event.reference_keys, list): self.new_lookups.put(event.reference_keys) @memoize_lru(100) def read_stream_def(self,stream_id): return self.pubsub_management.read_stream_definition(stream_id=stream_id) def recv_packet(self, msg, stream_route, stream_id): process_routes = self.CFG.get_safe('process.routes', {}) for stream_in_id,routes in process_routes.iteritems(): if stream_id == stream_in_id: for stream_out_id, actor in routes.iteritems(): if actor is None: rdt_out = self._execute_transform(msg, (stream_in_id, stream_out_id)) self.publish(rdt_out.to_granule(), stream_out_id) else: outgoing = self._execute_actor(msg, actor, (stream_in_id, stream_out_id)) self.publish(outgoing, stream_out_id) def publish(self, msg, stream_out_id): publisher = getattr(self, stream_out_id) publisher.publish(msg) def _load_actor(self, actor): ''' Returns callable execute method if it exists, otherwise it raises a BadRequest ''' try: module = __import__(actor['module'], fromlist=['']) except ImportError: log.exception('Actor could not be loaded') raise try: cls = getattr(module, actor['class']) except AttributeError: log.exception('Module %s does not have class %s', repr(module), actor['class']) raise try: execute = getattr(cls,'execute') except AttributeError: log.exception('Actor class does not contain execute method') raise return execute def _execute_actor(self, msg, actor, streams): stream_in_id,stream_out_id = streams stream_def_out = self.read_stream_def(stream_out_id) params = self.CFG.get_safe('process.params', {}) config = self.CFG.get_safe('process') #do the stuff with the actor params['stream_def'] = stream_def_out._id executor = self._load_actor(actor) try: rdt_out = executor(msg, None, config, params, None) except: log.exception('Error running actor for %s', self.id) raise return rdt_out def _merge_pdicts(self, pdict1, pdict2): incoming_pdict = ParameterDictionary.load(pdict1) outgoing_pdict = ParameterDictionary.load(pdict2) merged_pdict = ParameterDictionary() for k,v in incoming_pdict.iteritems(): ordinal, v = v if k not in merged_pdict: merged_pdict.add_context(v) for k,v in outgoing_pdict.iteritems(): ordinal, v = v if k not in merged_pdict: merged_pdict.add_context(v) return merged_pdict def 
_merge_rdt(self, stream_def_in, stream_def_out): incoming_pdict_dump = stream_def_in.parameter_dictionary outgoing_pdict_dump = stream_def_out.parameter_dictionary merged_pdict = self._merge_pdicts(incoming_pdict_dump, outgoing_pdict_dump) rdt_temp = RecordDictionaryTool(param_dictionary=merged_pdict) return rdt_temp def _get_lookup_value(self, lookup_value): if not self.new_lookups.empty(): new_values = self.new_lookups.get() self.lookup_docs = new_values + self.lookup_docs lookup_value_document_keys = self.lookup_docs for key in lookup_value_document_keys: try: document = self.stored_values.read_value(key) if lookup_value in document: return document[lookup_value] except NotFound: log.warning('Specified lookup document does not exist') return None def _execute_transform(self, msg, streams): stream_in_id,stream_out_id = streams stream_def_in = self.read_stream_def(stream_in_id) stream_def_out = self.read_stream_def(stream_out_id) rdt_temp = self._merge_rdt(stream_def_in, stream_def_out) rdt_in = RecordDictionaryTool.load_from_granule(msg) for field in rdt_temp.fields: if not isinstance(rdt_temp._pdict.get_context(field).param_type, ParameterFunctionType): try: rdt_temp[field] = rdt_in[field] except KeyError: pass rdt_temp.fetch_lookup_values() for lookup_field in rdt_temp.lookup_values(): s = lookup_field stored_value = self._get_lookup_value(rdt_temp.context(s).lookup_value) if stored_value is not None: rdt_temp[s] = stored_value for field in rdt_temp.fields: if isinstance(rdt_temp._pdict.get_context(field).param_type, ParameterFunctionType): rdt_temp[field] = rdt_temp[field] rdt_out = RecordDictionaryTool(stream_definition_id=stream_def_out._id) for field in rdt_out.fields: rdt_out[field] = rdt_temp[field] return rdt_out
class ScienceGranuleIngestionWorker(TransformStreamListener): CACHE_LIMIT=CFG.get_safe('container.ingestion_cache',5) def __init__(self, *args,**kwargs): super(ScienceGranuleIngestionWorker, self).__init__(*args, **kwargs) #-------------------------------------------------------------------------------- # Ingestion Cache # - Datasets # - Coverage instances #-------------------------------------------------------------------------------- self._datasets = collections.OrderedDict() self._coverages = collections.OrderedDict() self._bad_coverages = {} self.time_stats = Accumulator(format='%3f') # unique ID to identify this worker in log msgs self._id = uuid.uuid1() def on_start(self): #pragma no cover super(ScienceGranuleIngestionWorker,self).on_start() self.event_publisher = EventPublisher(OT.DatasetModified) self.stored_value_manager = StoredValueManager(self.container) self.lookup_docs = self.CFG.get_safe('process.lookup_docs',[]) self.input_product = self.CFG.get_safe('process.input_product','') self.qc_enabled = self.CFG.get_safe('process.qc_enabled', True) self.new_lookups = Queue() self.lookup_monitor = EventSubscriber(event_type=OT.ExternalReferencesUpdatedEvent, callback=self._add_lookups, auto_delete=True) self.lookup_monitor.start() self.qc_publisher = EventPublisher(event_type=OT.ParameterQCEvent) self.connection_id = '' self.connection_index = None def on_quit(self): #pragma no cover super(ScienceGranuleIngestionWorker, self).on_quit() for stream, coverage in self._coverages.iteritems(): try: coverage.close(timeout=5) except: log.exception('Problems closing the coverage') def _add_lookups(self, event, *args, **kwargs): if event.origin == self.input_product: if isinstance(event.reference_keys, list): self.new_lookups.put(event.reference_keys) def _new_dataset(self, stream_id): ''' Adds a new dataset to the internal cache of the ingestion worker ''' rr_client = ResourceRegistryServiceClient() datasets, _ = rr_client.find_subjects(subject_type=RT.Dataset,predicate=PRED.hasStream,object=stream_id,id_only=True) if datasets: return datasets[0] return None def get_dataset(self,stream_id): ''' Memoization (LRU) of _new_dataset ''' try: result = self._datasets.pop(stream_id) except KeyError: result = self._new_dataset(stream_id) if result is None: return None if len(self._datasets) >= self.CACHE_LIMIT: self._datasets.popitem(0) self._datasets[stream_id] = result return result def get_coverage(self, stream_id): ''' Memoization (LRU) of _get_coverage ''' try: result = self._coverages.pop(stream_id) except KeyError: dataset_id = self.get_dataset(stream_id) if dataset_id is None: return None result = DatasetManagementService._get_simplex_coverage(dataset_id, mode='a') if result is None: return None if len(self._coverages) >= self.CACHE_LIMIT: k, coverage = self._coverages.popitem(0) coverage.close(timeout=5) self._coverages[stream_id] = result return result def gap_coverage(self,stream_id): try: old_cov = self._coverages.pop(stream_id) dataset_id = self.get_dataset(stream_id) sdom, tdom = time_series_domain() new_cov = DatasetManagementService._create_simplex_coverage(dataset_id, old_cov.parameter_dictionary, sdom, tdom, old_cov._persistence_layer.inline_data_writes) old_cov.close() result = new_cov except KeyError: result = self.get_coverage(stream_id) self._coverages[stream_id] = result return result def dataset_changed(self, dataset_id, extents, window): self.event_publisher.publish_event(origin=dataset_id, author=self.id, extents=extents, window=window) def evaluate_qc(self, rdt, 
dataset_id): if self.qc_enabled: for field in rdt.fields: if not field.endswith('_qc'): continue try: values = rdt[field] if values is not None: if not all(values): topology = np.nonzero(values) first_occurrence = topology[0][0] ts = rdt[rdt.temporal_parameter][first_occurrence] self.flag_qc_parameter(dataset_id, field, ts, {}) except: continue def flag_qc_parameter(self, dataset_id, parameter, temporal_value, configuration): self.qc_publisher.publish_event(origin=dataset_id, qc_parameter=parameter, temporal_value=temporal_value, configuration=configuration) def update_connection_index(self, connection_id, connection_index): self.connection_id = connection_id try: connection_index = int(connection_index) self.connection_index = connection_index except ValueError: pass def has_gap(self, connection_id, connection_index): if connection_id: if not self.connection_id: self.update_connection_index(connection_id, connection_index) return False else: if connection_id != self.connection_id: return True if connection_index: if self.connection_index is None: self.update_connection_index(connection_id, connection_index) return False try: connection_index = int(connection_index) if connection_index != self.connection_index+1: return True except ValueError: pass return False def splice_coverage(self, dataset_id, coverage): log.info('Splicing new coverage') DatasetManagementService._splice_coverage(dataset_id, coverage) @handle_stream_exception() def recv_packet(self, msg, stream_route, stream_id): ''' receive packet for ingestion ''' log.trace('received granule for stream %s', stream_id) if msg == {}: log.error('Received empty message from stream: %s', stream_id) return # Message validation if not isinstance(msg, Granule): log.error('Ingestion received a message that is not a granule: %s', msg) return rdt = RecordDictionaryTool.load_from_granule(msg) if rdt is None: log.error('Invalid granule (no RDT) for stream %s', stream_id) return if not len(rdt): log.debug('Empty granule for stream %s', stream_id) return self.persist_or_timeout(stream_id, rdt) def persist_or_timeout(self, stream_id, rdt): """ retry writing coverage multiple times and eventually time out """ done = False timeout = 2 start = time.time() while not done: try: self.add_granule(stream_id, rdt) done = True except: log.exception('An issue with coverage, retrying after a bit') if (time.time() - start) > MAX_RETRY_TIME: # After an hour just give up dataset_id = self.get_dataset(stream_id) log.error("We're giving up, the coverage needs to be inspected %s", DatasetManagementService._get_coverage_path(dataset_id)) raise if stream_id in self._coverages: log.info('Popping coverage for stream %s', stream_id) self._coverages.pop(stream_id) gevent.sleep(timeout) if timeout > (60 * 5): timeout = 60 * 5 else: timeout *= 2 def expand_coverage(self, coverage, elements, stream_id): try: coverage.insert_timesteps(elements, oob=False) except IOError as e: log.error("Couldn't insert time steps for coverage: %s", coverage.persistence_dir, exc_info=True) try: coverage.close() finally: self._bad_coverages[stream_id] = 1 raise CorruptionError(e.message) def get_stored_values(self, lookup_value): if not self.new_lookups.empty(): new_values = self.new_lookups.get() self.lookup_docs = new_values + self.lookup_docs lookup_value_document_keys = self.lookup_docs for key in lookup_value_document_keys: try: document = self.stored_value_manager.read_value(key) if lookup_value in document: return document[lookup_value] except NotFound: log.warning('Specified lookup 
document does not exist') return None def fill_lookup_values(self, rdt): rdt.fetch_lookup_values() for field in rdt.lookup_values(): value = self.get_stored_values(rdt.context(field).lookup_value) if value: rdt[field] = value def insert_sparse_values(self, coverage, rdt, stream_id): self.fill_lookup_values(rdt) for field in rdt._lookup_values(): if rdt[field] is None: continue if not isinstance(rdt.context(field).param_type, SparseConstantType): # We only set sparse values before insert continue value = rdt[field] try: coverage.set_parameter_values(param_name=field, value=value) except IOError as e: log.error("Couldn't insert values for coverage: %s", coverage.persistence_dir, exc_info=True) try: coverage.close() finally: self._bad_coverages[stream_id] = 1 raise CorruptionError(e.message) def insert_values(self, coverage, rdt, stream_id): elements = len(rdt) start_index = coverage.num_timesteps - elements for k,v in rdt.iteritems(): if isinstance(v, SparseConstantValue): continue slice_ = slice(start_index, None) try: coverage.set_parameter_values(param_name=k, tdoa=slice_, value=v) except IOError as e: log.error("Couldn't insert values for coverage: %s", coverage.persistence_dir, exc_info=True) try: coverage.close() finally: self._bad_coverages[stream_id] = 1 raise CorruptionError(e.message) if 'ingestion_timestamp' in coverage.list_parameters(): t_now = time.time() ntp_time = TimeUtils.ts_to_units(coverage.get_parameter_context('ingestion_timestamp').uom, t_now) coverage.set_parameter_values(param_name='ingestion_timestamp', tdoa=slice_, value=ntp_time) def add_granule(self,stream_id, rdt): ''' Appends the granule's data to the coverage and persists it. ''' debugging = log.isEnabledFor(DEBUG) timer = Timer() if debugging else None if stream_id in self._bad_coverages: log.info('Message attempting to be inserted into bad coverage: %s', DatasetManagementService._get_coverage_path(self.get_dataset(stream_id))) #-------------------------------------------------------------------------------- # Gap Analysis #-------------------------------------------------------------------------------- gap_found = self.has_gap(rdt.connection_id, rdt.connection_index) if gap_found: log.error('Gap Found! 
New connection: (%s,%s)\tOld Connection: (%s,%s)', rdt.connection_id, rdt.connection_index, self.connection_id, self.connection_index) self.gap_coverage(stream_id) #-------------------------------------------------------------------------------- # Coverage determination and appending #-------------------------------------------------------------------------------- dataset_id = self.get_dataset(stream_id) if not dataset_id: log.error('No dataset could be determined on this stream: %s', stream_id) return try: coverage = self.get_coverage(stream_id) except IOError as e: log.error("Couldn't open coverage: %s", DatasetManagementService._get_coverage_path(self.get_dataset(stream_id))) raise CorruptionError(e.message) if debugging: path = DatasetManagementService._get_coverage_path(dataset_id) log.debug('%s: add_granule stream %s dataset %s coverage %r file %s', self._id, stream_id, dataset_id, coverage, path) if not coverage: log.error('Could not persist coverage from granule, coverage is None') return #-------------------------------------------------------------------------------- # Actual persistence #-------------------------------------------------------------------------------- elements = len(rdt) self.insert_sparse_values(coverage,rdt,stream_id) if debugging: timer.complete_step('checks') # lightweight ops, should be zero self.expand_coverage(coverage, elements, stream_id) if debugging: timer.complete_step('insert') self.insert_values(coverage, rdt, stream_id) if debugging: timer.complete_step('keys') DatasetManagementService._save_coverage(coverage) if debugging: timer.complete_step('save') start_index = coverage.num_timesteps - elements self.dataset_changed(dataset_id,coverage.num_timesteps,(start_index,start_index+elements)) if gap_found: self.splice_coverage(dataset_id, coverage) self.evaluate_qc(rdt, dataset_id) if debugging: timer.complete_step('notify') self._add_timing_stats(timer) self.update_connection_index(rdt.connection_id, rdt.connection_index) def _add_timing_stats(self, timer): """ add stats from latest coverage operation to Accumulator and periodically log results """ self.time_stats.add(timer) if self.time_stats.get_count() % REPORT_FREQUENCY>0: return if log.isEnabledFor(TRACE): # report per step for step in 'checks', 'insert', 'keys', 'save', 'notify': log.debug('%s step %s times: %s', self._id, step, self.time_stats.to_string(step)) # report totals log.debug('%s total times: %s', self._id, self.time_stats)
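get_dataset and get_coverage above implement a small least-recently-used cache on top of collections.OrderedDict: a hit is popped and reinserted so it moves to the most-recently-used end, and on a miss the oldest entry is evicted with popitem(0) (the positional form of popitem(last=False)) once CACHE_LIMIT is reached. A standalone sketch of the same pattern, with a stand-in loader function:

import collections

CACHE_LIMIT = 5

def lru_get(cache, key, loader):
    # Same shape as get_dataset/get_coverage: pop on hit so the reinsertion below
    # refreshes recency; on miss, load and evict the least recently used entry.
    try:
        value = cache.pop(key)
    except KeyError:
        value = loader(key)
        if value is None:
            return None
        if len(cache) >= CACHE_LIMIT:
            cache.popitem(last=False)    # drop the oldest entry
    cache[key] = value                   # (re)insert as most recently used
    return value

cache = collections.OrderedDict()
lru_get(cache, 'stream-1', lambda k: 'dataset-for-%s' % k)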
def populate_qc_tables(self): svm = StoredValueManager(self.container) svm.stored_value_cas('grt_QCTEST_TEMPWAT', {'grt_min_value':-2., 'grt_max_value':40.}) svm.stored_value_cas('svt_QCTEST_TEMPWAT', {'svt_resolution':0.001, 'svt_n': 4}) svm.stored_value_cas('spike_QCTEST_TEMPWAT', {'acc': 0.1, 'spike_n':5, 'spike_l':5})
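The keys written above appear to follow a '<qc test prefix>_<reference designator>_<parameter>' convention ('grt' for the global range test, 'svt' for stuck value, 'spike' for the spike test), with 'QCTEST' standing in for a reference designator. A hedged helper for reading such a document back; the key format is inferred from the calls above rather than documented anywhere in this file.

def read_qc_document(container, prefix, designator, parameter):
    # Compose the lookup key the same way populate_qc_tables does and read the
    # stored document back; StoredValueManager.read_value raises NotFound if the
    # document was never written.
    svm = StoredValueManager(container)
    key = '%s_%s_%s' % (prefix, designator, parameter)
    return svm.read_value(key)

# e.g. read_qc_document(self.container, 'grt', 'QCTEST', 'TEMPWAT')
# -> {'grt_min_value': -2., 'grt_max_value': 40.}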
class TransformPrime(TransformDataProcess): binding = ['output'] ''' Transforms which have an incoming stream and an outgoing stream. Parameters: process.stream_id Outgoing stream identifier. process.exchange_point Route's exchange point. process.routing_key Route's routing key. process.queue_name Name of the queue to listen on. process.routes streams,actor for each route {(stream_input_id, stream_output_id):actor} Either the stream_id or both the exchange_point and routing_key need to be provided. ''' def on_start(self): TransformDataProcess.on_start(self) self.pubsub_management = PubsubManagementServiceProcessClient( process=self) self.stored_values = StoredValueManager(self.container) self.input_data_product_ids = self.CFG.get_safe( 'process.input_products', []) self.output_data_product_ids = self.CFG.get_safe( 'process.output_products', []) self.lookup_docs = self.CFG.get_safe('process.lookup_docs', []) self.new_lookups = Queue() self.lookup_monitor = EventSubscriber( event_type=OT.ExternalReferencesUpdatedEvent, callback=self._add_lookups, auto_delete=True) self.lookup_monitor.start() def on_quit(self): self.lookup_monitor.stop() TransformDataProcess.on_quit(self) def _add_lookups(self, event, *args, **kwargs): if event.origin in self.input_data_product_ids + self.output_data_product_ids: if isinstance(event.reference_keys, list): self.new_lookups.put(event.reference_keys) @memoize_lru(100) def read_stream_def(self, stream_id): return self.pubsub_management.read_stream_definition( stream_id=stream_id) def recv_packet(self, msg, stream_route, stream_id): process_routes = self.CFG.get_safe('process.routes', {}) for stream_in_id, routes in process_routes.iteritems(): if stream_id == stream_in_id: for stream_out_id, actor in routes.iteritems(): if actor is None: rdt_out = self._execute_transform( msg, (stream_in_id, stream_out_id)) self.publish(rdt_out.to_granule(), stream_out_id) else: outgoing = self._execute_actor( msg, actor, (stream_in_id, stream_out_id)) self.publish(outgoing, stream_out_id) def publish(self, msg, stream_out_id): publisher = getattr(self, stream_out_id) publisher.publish(msg) def _load_actor(self, actor): ''' Returns callable execute method if it exists, otherwise it raises a BadRequest ''' try: module = __import__(actor['module'], fromlist=['']) except ImportError: log.exception('Actor could not be loaded') raise try: cls = getattr(module, actor['class']) except AttributeError: log.exception('Module %s does not have class %s', repr(module), actor['class']) raise try: execute = getattr(cls, 'execute') except AttributeError: log.exception('Actor class does not contain execute method') raise return execute def _execute_actor(self, msg, actor, streams): stream_in_id, stream_out_id = streams stream_def_out = self.read_stream_def(stream_out_id) params = self.CFG.get_safe('process.params', {}) config = self.CFG.get_safe('process') #do the stuff with the actor params['stream_def'] = stream_def_out._id executor = self._load_actor(actor) try: rdt_out = executor(msg, None, config, params, None) except: log.exception('Error running actor for %s', self.id) raise return rdt_out def _merge_pdicts(self, pdict1, pdict2): incoming_pdict = ParameterDictionary.load(pdict1) outgoing_pdict = ParameterDictionary.load(pdict2) merged_pdict = ParameterDictionary() for k, v in incoming_pdict.iteritems(): ordinal, v = v if k not in merged_pdict: merged_pdict.add_context(v) for k, v in outgoing_pdict.iteritems(): ordinal, v = v if k not in merged_pdict: merged_pdict.add_context(v) return 
merged_pdict def _merge_rdt(self, stream_def_in, stream_def_out): incoming_pdict_dump = stream_def_in.parameter_dictionary outgoing_pdict_dump = stream_def_out.parameter_dictionary merged_pdict = self._merge_pdicts(incoming_pdict_dump, outgoing_pdict_dump) rdt_temp = RecordDictionaryTool(param_dictionary=merged_pdict) return rdt_temp def _get_lookup_value(self, lookup_value): if not self.new_lookups.empty(): new_values = self.new_lookups.get() self.lookup_docs = new_values + self.lookup_docs lookup_value_document_keys = self.lookup_docs for key in lookup_value_document_keys: try: document = self.stored_values.read_value(key) if lookup_value in document: return document[lookup_value] except NotFound: log.warning('Specified lookup document does not exist') return None def _execute_transform(self, msg, streams): stream_in_id, stream_out_id = streams stream_def_in = self.read_stream_def(stream_in_id) stream_def_out = self.read_stream_def(stream_out_id) rdt_temp = self._merge_rdt(stream_def_in, stream_def_out) rdt_in = RecordDictionaryTool.load_from_granule(msg) for field in rdt_temp.fields: if not isinstance( rdt_temp._pdict.get_context(field).param_type, ParameterFunctionType): try: rdt_temp[field] = rdt_in[field] except KeyError: pass rdt_temp.fetch_lookup_values() for lookup_field in rdt_temp.lookup_values(): s = lookup_field stored_value = self._get_lookup_value( rdt_temp.context(s).lookup_value) if stored_value is not None: rdt_temp[s] = stored_value for field in rdt_temp.fields: if isinstance( rdt_temp._pdict.get_context(field).param_type, ParameterFunctionType): rdt_temp[field] = rdt_temp[field] rdt_out = RecordDictionaryTool(stream_definition_id=stream_def_out._id) for field in rdt_out.fields: rdt_out[field] = rdt_temp[field] return rdt_out
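_get_lookup_value above, together with _add_lookups, forms a small refresh pattern: the event subscriber pushes newly referenced document keys onto a queue, and the lookup path drains that queue lazily so the newest documents are consulted first. A standalone sketch of that pattern, assuming a gevent Queue (the original import is not shown in this file) and any read_value callable such as StoredValueManager.read_value:

from gevent.queue import Queue   # assumed; the original import is not shown here

class LookupRefresher(object):
    def __init__(self, initial_keys, read_value):
        self.lookup_docs = list(initial_keys)
        self.new_lookups = Queue()
        self.read_value = read_value          # e.g. StoredValueManager(container).read_value

    def on_reference_update(self, reference_keys):
        # Called from the ExternalReferencesUpdatedEvent subscriber callback.
        if isinstance(reference_keys, list):
            self.new_lookups.put(reference_keys)

    def get(self, lookup_value):
        # Drain at most one pending update, prepending it so new keys win.
        if not self.new_lookups.empty():
            self.lookup_docs = self.new_lookups.get() + self.lookup_docs
        for key in self.lookup_docs:
            try:
                document = self.read_value(key)
            except Exception:                 # the transform narrows this to NotFound
                continue
            if lookup_value in document:
                return document[lookup_value]
        return None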
def on_start(self): TransformStreamListener.on_start(self) self.document_key = self.CFG.get_safe('process.document_key') self.stored_value_manager = StoredValueManager(self.container)
def test_lookup_values_ingest_replay(self): ph = ParameterHelper(self.dataset_management, self.addCleanup) pdict_id = ph.create_lookups() stream_def_id = self.pubsub_management.create_stream_definition('lookups', parameter_dictionary_id=pdict_id) self.addCleanup(self.pubsub_management.delete_stream_definition, stream_def_id) stream_id, route = self.pubsub_management.create_stream('example', exchange_point=self.exchange_point_name, stream_definition_id=stream_def_id) self.addCleanup(self.pubsub_management.delete_stream, stream_id) ingestion_config_id = self.get_ingestion_config() dataset_id = self.create_dataset(pdict_id) config = DotDict() config.process.lookup_docs = ['test1', 'test2'] self.ingestion_management.persist_data_stream(stream_id=stream_id, ingestion_configuration_id=ingestion_config_id, dataset_id=dataset_id, config=config) self.addCleanup(self.ingestion_management.unpersist_data_stream, stream_id, ingestion_config_id) stored_value_manager = StoredValueManager(self.container) stored_value_manager.stored_value_cas('test1',{'offset_a':10.0, 'offset_b':13.1}) publisher = StandaloneStreamPublisher(stream_id, route) rdt = RecordDictionaryTool(stream_definition_id=stream_def_id) rdt['time'] = np.arange(20) rdt['temp'] = [20.0] * 20 granule = rdt.to_granule() dataset_monitor = DatasetMonitor(dataset_id) self.addCleanup(dataset_monitor.stop) publisher.publish(granule) self.assertTrue(dataset_monitor.event.wait(30)) replay_granule = self.data_retriever.retrieve(dataset_id) rdt_out = RecordDictionaryTool.load_from_granule(replay_granule) np.testing.assert_array_almost_equal(rdt_out['time'], np.arange(20)) np.testing.assert_array_almost_equal(rdt_out['temp'], np.array([20.] * 20)) np.testing.assert_array_almost_equal(rdt_out['calibrated'], np.array([30.]*20)) np.testing.assert_array_equal(rdt_out['offset_b'], np.array([rdt_out.fill_value('offset_b')] * 20)) rdt = RecordDictionaryTool(stream_definition_id=stream_def_id) rdt['time'] = np.arange(20,40) rdt['temp'] = [20.0] * 20 granule = rdt.to_granule() dataset_monitor.event.clear() stored_value_manager.stored_value_cas('test1',{'offset_a':20.0}) stored_value_manager.stored_value_cas('coefficient_document',{'offset_b':10.0}) gevent.sleep(2) publisher.publish(granule) self.assertTrue(dataset_monitor.event.wait(30)) replay_granule = self.data_retriever.retrieve(dataset_id) rdt_out = RecordDictionaryTool.load_from_granule(replay_granule) np.testing.assert_array_almost_equal(rdt_out['time'], np.arange(40)) np.testing.assert_array_almost_equal(rdt_out['temp'], np.array([20.] * 20 + [20.] * 20)) np.testing.assert_array_equal(rdt_out['offset_b'], np.array([10.] * 40)) np.testing.assert_array_almost_equal(rdt_out['calibrated'], np.array([30.]*20 + [40.]*20)) np.testing.assert_array_almost_equal(rdt_out['calibrated_b'], np.array([40.] * 20 + [50.] * 20))
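The expected arrays in the assertions above follow if the 'calibrated' parameter function adds offset_a to temp and 'calibrated_b' adds offset_b to calibrated; that relationship is inferred from the asserted values, not stated in this test. With offset_a stored as 10.0 the first batch calibrates to 30.0, after the update to 20.0 the second batch calibrates to 40.0, and offset_b = 10.0 lifts those to 40.0 and 50.0:

import numpy as np

# Assumed relationships: calibrated = temp + offset_a, calibrated_b = calibrated + offset_b
temp = np.array([20.0] * 20)
first_batch = temp + 10.0                                 # offset_a from the first stored_value_cas
second_batch = temp + 20.0                                # offset_a after the update
calibrated = np.concatenate([first_batch, second_batch])  # [30.]*20 + [40.]*20
calibrated_b = calibrated + 10.0                          # offset_b -> [40.]*20 + [50.]*20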
class ScienceGranuleIngestionWorker(TransformStreamListener, BaseIngestionWorker): CACHE_LIMIT=CFG.get_safe('container.ingestion_cache',5) def __init__(self, *args,**kwargs): TransformStreamListener.__init__(self, *args, **kwargs) BaseIngestionWorker.__init__(self, *args, **kwargs) #-------------------------------------------------------------------------------- # Ingestion Cache # - Datasets # - Coverage instances #-------------------------------------------------------------------------------- self._datasets = collections.OrderedDict() self._coverages = collections.OrderedDict() self._bad_coverages = {} self.time_stats = Accumulator(format='%3f') # unique ID to identify this worker in log msgs self._id = uuid.uuid1() def on_start(self): #pragma no cover #-------------------------------------------------------------------------------- # Explicit on_start #-------------------------------------------------------------------------------- # Skip TransformStreamListener and go to StreamProcess to avoid the subscriber being created # We want explicit management of the thread and subscriber object for ingestion TransformStreamProcess.on_start(self) self.queue_name = self.CFG.get_safe('process.queue_name',self.id) self.subscriber = StreamSubscriber(process=self, exchange_name=self.queue_name, callback=self.receive_callback) self.thread_lock = RLock() #-------------------------------------------------------------------------------- # Normal on_start after this point #-------------------------------------------------------------------------------- BaseIngestionWorker.on_start(self) self._rpc_server = self.container.proc_manager._create_listening_endpoint(from_name=self.id, process=self) self.add_endpoint(self._rpc_server) self.event_publisher = EventPublisher(OT.DatasetModified) self.stored_value_manager = StoredValueManager(self.container) self.lookup_docs = self.CFG.get_safe('process.lookup_docs',[]) self.input_product = self.CFG.get_safe('process.input_product','') self.qc_enabled = self.CFG.get_safe('process.qc_enabled', True) self.ignore_gaps = self.CFG.get_safe('service.ingestion.ignore_gaps', True) if not self.ignore_gaps: log.warning("Gap handling is not supported in release 2") self.ignore_gaps = True self.new_lookups = Queue() self.lookup_monitor = EventSubscriber(event_type=OT.ExternalReferencesUpdatedEvent, callback=self._add_lookups, auto_delete=True) self.add_endpoint(self.lookup_monitor) self.qc_publisher = EventPublisher(event_type=OT.ParameterQCEvent) self.connection_id = '' self.connection_index = None self.start_listener() def on_quit(self): #pragma no cover self.event_publisher.close() self.qc_publisher.close() if self.subscriber_thread: self.stop_listener() for stream, coverage in self._coverages.iteritems(): try: coverage.close(timeout=5) except: log.exception('Problems closing the coverage') self._coverages.clear() TransformStreamListener.on_quit(self) BaseIngestionWorker.on_quit(self) def start_listener(self): # We use a lock here to prevent possible race conditions from starting multiple listeners and coverage clobbering with self.thread_lock: self.subscriber_thread = self._process.thread_manager.spawn(self.subscriber.listen, thread_name='%s-subscriber' % self.id) def stop_listener(self): # Avoid race conditions with coverage operations (Don't start a listener at the same time as closing one) with self.thread_lock: self.subscriber.close() self.subscriber_thread.join(timeout=10) for stream, coverage in self._coverages.iteritems(): try: coverage.close(timeout=5) except: 
log.exception('Problems closing the coverage') self._coverages.clear() self.subscriber_thread = None def pause(self): if self.subscriber_thread is not None: self.stop_listener() def resume(self): if self.subscriber_thread is None: self.start_listener() def _add_lookups(self, event, *args, **kwargs): if event.origin == self.input_product: if isinstance(event.reference_keys, list): self.new_lookups.put(event.reference_keys) def _new_dataset(self, stream_id): ''' Adds a new dataset to the internal cache of the ingestion worker ''' rr_client = self.container.resource_registry datasets, _ = rr_client.find_subjects(subject_type=RT.Dataset,predicate=PRED.hasStream,object=stream_id,id_only=True) if datasets: return datasets[0] return None def _get_data_products(self, dataset_id): rr_client = self.container.resource_registry data_products, _ = rr_client.find_subjects(object=dataset_id, predicate=PRED.hasDataset, subject_type=RT.DataProduct, id_only=False) return data_products def initialize_metadata(self, dataset_id, rdt): ''' Initializes a metadata document in the object store. The document contains information about the bounds and extents of the dataset as well as other metadata to improve performance. ''' object_store = self.container.object_store key = dataset_id bounds = {} extents = {} last_values = {} rough_size = 0 for k,v in rdt.iteritems(): v = v[:].flatten() if v.dtype.char not in ('S', 'O', 'U', 'V'): bounds[k] = (np.min(v), np.max(v)) last_values[k] = v[-1] extents[k] = len(rdt) rough_size += len(rdt) * 4 doc = {'bounds':bounds, 'extents':extents, 'last_values':last_values, 'size': rough_size} doc = numpy_walk(doc) object_store.create_doc(doc, object_id=key) return def update_metadata(self, dataset_id, rdt): ''' Updates the metadata document with the latest information available ''' self.update_data_product_metadata(dataset_id, rdt) # Grab the document object_store = self.container.object_store key = dataset_id try: doc = object_store.read_doc(key) except NotFound: return self.initialize_metadata(dataset_id, rdt) # These are the fields we're interested in bounds = doc['bounds'] extents = doc['extents'] last_values = doc['last_values'] rough_size = doc['size'] for k,v in rdt.iteritems(): if k not in bounds: continue v = v[:].flatten() # Get the numpy representation (dense array). if v.dtype.char not in ('S', 'O', 'U', 'V'): l_min = np.min(v) l_max = np.max(v) o_min, o_max = bounds[k] bounds[k] = (min(l_min, o_min), max(l_max, o_max)) last_values[k] = v[-1] # Update the bounds # Increase the extents extents[k] = extents[k] + len(rdt) # How about the last value? 
rough_size += len(rdt) * 4 doc['size'] = rough_size # Sanitize it doc = numpy_walk(doc) object_store.update_doc(doc) def update_data_product_metadata(self, dataset_id, rdt): data_products = self._get_data_products(dataset_id) for data_product in data_products: self.update_time(data_product, rdt[rdt.temporal_parameter][:]) self.update_geo(data_product, rdt) self.container.resource_registry.update(data_product) def update_time(self, data_product, t): #TODO: Account for non NTP-based timestamps t_min = np.min(t) t_min -= 2208988800 t_max = np.max(t) t_max -= 2208988800 if not data_product.nominal_datetime.start_datetime: data_product.nominal_datetime.start_datetime = t_min data_product.nominal_datetime.end_datetime = t_max def update_geo(self, data_product, rdt): lat = None lon = None for p in rdt: if rdt._rd[p] is None: continue # TODO: Not an all encompassing list of acceptable names for lat and lon if p.lower() in ('lat', 'latitude', 'y_axis'): lat = np.asscalar(rdt[p][-1]) elif p.lower() in ('lon', 'longitude', 'x_axis'): lon = np.asscalar(rdt[p][-1]) if lat and lon: break if lat and lon: data_product.geospatial_bounds.geospatial_latitude_limit_north = lat data_product.geospatial_bounds.geospatial_latitude_limit_south = lat data_product.geospatial_bounds.geospatial_longitude_limit_east = lon data_product.geospatial_bounds.geospatial_longitude_limit_west = lon def get_dataset(self,stream_id): ''' Memoization (LRU) of _new_dataset ''' try: result = self._datasets.pop(stream_id) except KeyError: result = self._new_dataset(stream_id) if result is None: return None if len(self._datasets) >= self.CACHE_LIMIT: self._datasets.popitem(0) self._datasets[stream_id] = result return result def get_coverage(self, stream_id): ''' Memoization (LRU) of _get_coverage ''' try: result = self._coverages.pop(stream_id) except KeyError: dataset_id = self.get_dataset(stream_id) if dataset_id is None: return None result = DatasetManagementService._get_simplex_coverage(dataset_id, mode='a') if result is None: return None if len(self._coverages) >= self.CACHE_LIMIT: k, coverage = self._coverages.popitem(0) coverage.close(timeout=5) self._coverages[stream_id] = result return result def gap_coverage(self,stream_id): try: old_cov = self._coverages.pop(stream_id) dataset_id = self.get_dataset(stream_id) sdom, tdom = time_series_domain() new_cov = DatasetManagementService._create_simplex_coverage(dataset_id, old_cov.parameter_dictionary, sdom, tdom, old_cov._persistence_layer.inline_data_writes) old_cov.close() result = new_cov except KeyError: result = self.get_coverage(stream_id) self._coverages[stream_id] = result return result def dataset_changed(self, dataset_id, extents, window): self.event_publisher.publish_event(origin=dataset_id, author=self.id, extents=extents, window=window) def evaluate_qc(self, rdt, dataset_id): if self.qc_enabled: for field in rdt.fields: if not (field.endswith('glblrng_qc') or field.endswith('loclrng_qc')): continue try: values = rdt[field] if values is not None: if not all(values): topology = np.where(values==0) timestamps = rdt[rdt.temporal_parameter][topology[0]] self.flag_qc_parameter(dataset_id, field, timestamps.tolist(), {}) except: continue def flag_qc_parameter(self, dataset_id, parameter, temporal_values, configuration): data_product_ids, _ = self.container.resource_registry.find_subjects(object=dataset_id, predicate=PRED.hasDataset, subject_type=RT.DataProduct, id_only=True) for data_product_id in data_product_ids: description = 'Automated Quality Control Alerted on %s' % 
parameter self.qc_publisher.publish_event(origin=data_product_id, qc_parameter=parameter, temporal_values=temporal_values, configuration=configuration, description=description) def update_connection_index(self, connection_id, connection_index): self.connection_id = connection_id try: connection_index = int(connection_index) self.connection_index = connection_index except ValueError: pass def has_gap(self, connection_id, connection_index): if connection_id: if not self.connection_id: self.update_connection_index(connection_id, connection_index) return False else: if connection_id != self.connection_id: return True if connection_index: if self.connection_index is None: self.update_connection_index(connection_id, connection_index) return False try: connection_index = int(connection_index) if connection_index != self.connection_index+1: return True except ValueError: pass return False def splice_coverage(self, dataset_id, coverage): log.info('Splicing new coverage') DatasetManagementService._splice_coverage(dataset_id, coverage) @handle_stream_exception() def recv_packet(self, msg, stream_route, stream_id): ''' receive packet for ingestion ''' log.trace('received granule for stream %s', stream_id) if msg == {}: log.error('Received empty message from stream: %s', stream_id) return # Message validation if not isinstance(msg, Granule): log.error('Ingestion received a message that is not a granule: %s', msg) return rdt = RecordDictionaryTool.load_from_granule(msg) if rdt is None: log.error('Invalid granule (no RDT) for stream %s', stream_id) return if not len(rdt): log.debug('Empty granule for stream %s', stream_id) return self.persist_or_timeout(stream_id, rdt) def persist_or_timeout(self, stream_id, rdt): """ retry writing coverage multiple times and eventually time out """ done = False timeout = 2 start = time.time() while not done: try: self.add_granule(stream_id, rdt) done = True except: log.exception('An issue with coverage, retrying after a bit') if (time.time() - start) > MAX_RETRY_TIME: # After an hour just give up dataset_id = self.get_dataset(stream_id) log.error("We're giving up, the coverage needs to be inspected %s", DatasetManagementService._get_coverage_path(dataset_id)) raise if stream_id in self._coverages: log.info('Popping coverage for stream %s', stream_id) self._coverages.pop(stream_id) gevent.sleep(timeout) if timeout > (60 * 5): timeout = 60 * 5 else: timeout *= 2 def expand_coverage(self, coverage, elements, stream_id): try: coverage.insert_timesteps(elements, oob=False) except IOError as e: log.error("Couldn't insert time steps for coverage: %s", coverage.persistence_dir, exc_info=True) try: coverage.close() finally: self._bad_coverages[stream_id] = 1 raise CorruptionError(e.message) def get_stored_values(self, lookup_value): if not self.new_lookups.empty(): new_values = self.new_lookups.get() self.lookup_docs = new_values + self.lookup_docs lookup_value_document_keys = self.lookup_docs for key in lookup_value_document_keys: try: document = self.stored_value_manager.read_value(key) if lookup_value in document: return document[lookup_value] except NotFound: log.warning('Specified lookup document does not exist') return None def fill_lookup_values(self, rdt): rdt.fetch_lookup_values() for field in rdt.lookup_values(): value = self.get_stored_values(rdt.context(field).lookup_value) if value: rdt[field] = value def insert_sparse_values(self, coverage, rdt, stream_id): self.fill_lookup_values(rdt) for field in rdt.fields: if rdt._rd[field] is None: continue if not 
isinstance(rdt.context(field).param_type, SparseConstantType): # We only set sparse values before insert continue value = rdt[field] try: coverage.set_parameter_values(param_name=field, value=value) except ValueError as e: if "'lower_bound' cannot be >= 'upper_bound'" in e.message: continue else: raise except IOError as e: log.error("Couldn't insert values for coverage: %s", coverage.persistence_dir, exc_info=True) try: coverage.close() finally: self._bad_coverages[stream_id] = 1 raise CorruptionError(e.message) def insert_values(self, coverage, rdt, stream_id): elements = len(rdt) start_index = coverage.num_timesteps - elements for k,v in rdt.iteritems(): if isinstance(v, SparseConstantValue): continue slice_ = slice(start_index, None) try: coverage.set_parameter_values(param_name=k, tdoa=slice_, value=v) except IOError as e: log.error("Couldn't insert values for coverage: %s", coverage.persistence_dir, exc_info=True) try: coverage.close() finally: self._bad_coverages[stream_id] = 1 raise CorruptionError(e.message) except IndexError as e: log.error("Value set: %s", v[:]) data_products, _ = self.container.resource_registry.find_subjects(object=stream_id, predicate=PRED.hasStream, subject_type=RT.DataProduct) for data_product in data_products: log.exception("Index exception with %s, trying to insert %s into coverage with shape %s", data_product.name, k, v.shape) if 'ingestion_timestamp' in coverage.list_parameters(): t_now = time.time() ntp_time = TimeUtils.ts_to_units(coverage.get_parameter_context('ingestion_timestamp').uom, t_now) coverage.set_parameter_values(param_name='ingestion_timestamp', tdoa=slice_, value=ntp_time) def add_granule(self,stream_id, rdt): ''' Appends the granule's data to the coverage and persists it. ''' debugging = log.isEnabledFor(DEBUG) timer = Timer() if debugging else None if stream_id in self._bad_coverages: log.info('Message attempting to be inserted into bad coverage: %s', DatasetManagementService._get_coverage_path(self.get_dataset(stream_id))) #-------------------------------------------------------------------------------- # Gap Analysis #-------------------------------------------------------------------------------- if not self.ignore_gaps: gap_found = self.has_gap(rdt.connection_id, rdt.connection_index) if gap_found: log.warning('Gap Found! 
New connection: (%s,%s)\tOld Connection: (%s,%s)', rdt.connection_id, rdt.connection_index, self.connection_id, self.connection_index) self.gap_coverage(stream_id) #-------------------------------------------------------------------------------- # Coverage determination and appending #-------------------------------------------------------------------------------- dataset_id = self.get_dataset(stream_id) if not dataset_id: log.error('No dataset could be determined on this stream: %s', stream_id) return try: coverage = self.get_coverage(stream_id) except IOError as e: log.error("Couldn't open coverage: %s", DatasetManagementService._get_coverage_path(self.get_dataset(stream_id))) raise CorruptionError(e.message) if debugging: path = DatasetManagementService._get_coverage_path(dataset_id) log.debug('%s: add_granule stream %s dataset %s coverage %r file %s', self._id, stream_id, dataset_id, coverage, path) if not coverage: log.error('Could not persist coverage from granule, coverage is None') return #-------------------------------------------------------------------------------- # Actual persistence #-------------------------------------------------------------------------------- elements = len(rdt) if rdt[rdt.temporal_parameter] is None: elements = 0 self.insert_sparse_values(coverage,rdt,stream_id) if debugging: timer.complete_step('checks') # lightweight ops, should be zero self.expand_coverage(coverage, elements, stream_id) if debugging: timer.complete_step('insert') self.insert_values(coverage, rdt, stream_id) if debugging: timer.complete_step('keys') DatasetManagementService._save_coverage(coverage) if debugging: timer.complete_step('save') start_index = coverage.num_timesteps - elements if not self.ignore_gaps and gap_found: self.splice_coverage(dataset_id, coverage) self.evaluate_qc(rdt, dataset_id) if debugging: timer.complete_step('notify') self._add_timing_stats(timer) self.update_connection_index(rdt.connection_id, rdt.connection_index) self.update_metadata(dataset_id, rdt) self.dataset_changed(dataset_id,coverage.num_timesteps,(start_index,start_index+elements)) def _add_timing_stats(self, timer): """ add stats from latest coverage operation to Accumulator and periodically log results """ self.time_stats.add(timer) if self.time_stats.get_count() % REPORT_FREQUENCY>0: return if log.isEnabledFor(TRACE): # report per step for step in 'checks', 'insert', 'keys', 'save', 'notify': log.debug('%s step %s times: %s', self._id, step, self.time_stats.to_string(step)) # report totals log.debug('%s total times: %s', self._id, self.time_stats)
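The has_gap/update_connection_index pair above reduces to a small amount of connection bookkeeping: a gap is declared when the producer's connection id changes, or when the packet counter does not advance by exactly one from the last packet recorded. A standalone sketch with the same checks, driven the way add_granule drives it (check first, then record):

class ConnectionTracker(object):
    def __init__(self):
        self.connection_id = ''
        self.connection_index = None

    def update(self, connection_id, connection_index):
        self.connection_id = connection_id
        try:
            self.connection_index = int(connection_index)
        except ValueError:
            pass

    def has_gap(self, connection_id, connection_index):
        if connection_id:
            if not self.connection_id:
                self.update(connection_id, connection_index)
                return False
            if connection_id != self.connection_id:
                return True                    # the producer reconnected
        if connection_index:
            if self.connection_index is None:
                self.update(connection_id, connection_index)
                return False
            try:
                if int(connection_index) != self.connection_index + 1:
                    return True                # the counter skipped or repeated a value
            except ValueError:
                pass
        return False

tracker = ConnectionTracker()
for conn, idx, expect in [('c-1', '1', False), ('c-1', '2', False), ('c-1', '4', True)]:
    assert tracker.has_gap(conn, idx) is expect
    tracker.update(conn, idx)                  # add_granule records the tuple after each granule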
def test_lookup_values(self): ph = ParameterHelper(self.dataset_management, self.addCleanup) pdict_id = ph.create_lookups() stream_def_id = self.pubsubcli.create_stream_definition('lookup', parameter_dictionary_id=pdict_id) self.addCleanup(self.pubsubcli.delete_stream_definition, stream_def_id) data_product = DataProduct(name='lookup data product') tdom, sdom = time_series_domain() data_product.temporal_domain = tdom.dump() data_product.spatial_domain = sdom.dump() data_product_id = self.dpsc_cli.create_data_product(data_product, stream_definition_id=stream_def_id) self.addCleanup(self.dpsc_cli.delete_data_product, data_product_id) data_producer = DataProducer(name='producer') data_producer.producer_context = DataProcessProducerContext() data_producer.producer_context.configuration['qc_keys'] = ['offset_document'] data_producer_id, _ = self.rrclient.create(data_producer) self.addCleanup(self.rrclient.delete, data_producer_id) assoc,_ = self.rrclient.create_association(subject=data_product_id, object=data_producer_id, predicate=PRED.hasDataProducer) self.addCleanup(self.rrclient.delete_association, assoc) document_keys = self.damsclient.list_qc_references(data_product_id) self.assertEquals(document_keys, ['offset_document']) svm = StoredValueManager(self.container) svm.stored_value_cas('offset_document', {'offset_a':2.0}) self.dpsc_cli.activate_data_product_persistence(data_product_id) dataset_ids, _ = self.rrclient.find_objects(subject=data_product_id, predicate=PRED.hasDataset, id_only=True) dataset_id = dataset_ids[0] dataset_monitor = DatasetMonitor(dataset_id) self.addCleanup(dataset_monitor.stop) rdt = RecordDictionaryTool(stream_definition_id=stream_def_id) rdt['time'] = [0] rdt['temp'] = [20.] granule = rdt.to_granule() stream_ids, _ = self.rrclient.find_objects(subject=data_product_id, predicate=PRED.hasStream, id_only=True) stream_id = stream_ids[0] route = self.pubsubcli.read_stream_route(stream_id=stream_id) publisher = StandaloneStreamPublisher(stream_id, route) publisher.publish(granule) self.assertTrue(dataset_monitor.event.wait(10)) granule = self.data_retriever.retrieve(dataset_id) rdt2 = RecordDictionaryTool.load_from_granule(granule) np.testing.assert_array_equal(rdt['temp'], rdt2['temp']) np.testing.assert_array_almost_equal(rdt2['calibrated'], np.array([22.0])) svm.stored_value_cas('updated_document', {'offset_a':3.0}) dataset_monitor = DatasetMonitor(dataset_id) self.addCleanup(dataset_monitor.stop) ep = EventPublisher(event_type=OT.ExternalReferencesUpdatedEvent) ep.publish_event(origin=data_product_id, reference_keys=['updated_document']) rdt = RecordDictionaryTool(stream_definition_id=stream_def_id) rdt['time'] = [1] rdt['temp'] = [20.] granule = rdt.to_granule() gevent.sleep(2) # Yield so that the event goes through publisher.publish(granule) self.assertTrue(dataset_monitor.event.wait(10)) granule = self.data_retriever.retrieve(dataset_id) rdt2 = RecordDictionaryTool.load_from_granule(granule) np.testing.assert_array_equal(rdt2['temp'],np.array([20.,20.])) np.testing.assert_array_almost_equal(rdt2['calibrated'], np.array([22.0,23.0]))