    def test_qc_attachment(self):
        instrument_device = InstrumentDevice(name='whatever')
        instrument_device_id,_ = self.rrclient.create(instrument_device)
        self.addCleanup(self.rrclient.delete, instrument_device_id)
        self.client.register_instrument(instrument_device_id)
        self.addCleanup(self.client.unregister_instrument, instrument_device_id)
        dp = DataProduct(name='instrument output')

        dp_id,_ = self.rrclient.create(dp)
        self.addCleanup(self.rrclient.delete, dp_id)

        parser_id = self.make_grt_parser()
        attachment = Attachment(name='qc ref', attachment_type=AttachmentType.REFERENCE,content=global_range_test_document, context=ReferenceAttachmentContext(parser_id=parser_id))
        att_id = self.rrclient.create_attachment(dp_id, attachment)
        self.addCleanup(self.rrclient.delete_attachment, att_id)

        attachment2 = Attachment(name='qc ref2', attachment_type=AttachmentType.REFERENCE, content=global_range_test_document2, context=ReferenceAttachmentContext(parser_id=parser_id))
        att2_id = self.rrclient.create_attachment(dp_id, attachment2)
        self.addCleanup(self.rrclient.delete_attachment, att2_id)

        self.client.assign_data_product(instrument_device_id, dp_id)
        self.addCleanup(self.client.unassign_data_product, instrument_device_id, dp_id)
        svm = StoredValueManager(self.container)
        doc = svm.read_value('grt_CE01ISSM-MF005-01-CTDBPC999_TEMPWAT')
        np.testing.assert_array_almost_equal(doc['grt_min_value'], -2.)
Example #2
    def fetch_lookup_values(self):
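        # First pass: collect a document key for every lookup parameter,
        # substituting '$designator' with this stream's reference designator,
        # so that read_value_mult() can fetch all documents in one call.
        # A second pass below assigns the values to the parameters.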
        doc_keys = []
        for lv in self._lookup_values():
            context = self.context(lv)
            if context.document_key:
                document_key = context.document_key
                if '$designator' in context.document_key and 'reference_designator' in self._stream_config:
                    document_key = document_key.replace(
                        '$designator',
                        self._stream_config['reference_designator'])
                doc_keys.append(document_key)

        lookup_docs = {}
        if doc_keys:
            svm = StoredValueManager(Container.instance)
            doc_list = svm.read_value_mult(doc_keys)
            lookup_docs = dict(zip(doc_keys, doc_list))

        for lv in self._lookup_values():
            context = self.context(lv)
            if context.document_key:
                document_key = context.document_key
                if '$designator' in context.document_key and 'reference_designator' in self._stream_config:
                    document_key = document_key.replace(
                        '$designator',
                        self._stream_config['reference_designator'])
                doc = lookup_docs[document_key]
                if doc is None:
                    log.debug('Reference Document for %s not found',
                              document_key)
                    continue
                if context.lookup_value in doc:
                    self[lv] = [doc[context.lookup_value]
                                ] * self._shp[0] if self._shp else doc[
                                    context.lookup_value]
    def fetch_lookup_values(self):
        doc_keys = []
        for lv in self._lookup_values():
            context = self.context(lv)
            if context.document_key:
                document_key = context.document_key
                if "$designator" in context.document_key and "reference_designator" in self._stream_config:
                    document_key = document_key.replace("$designator", self._stream_config["reference_designator"])
                doc_keys.append(document_key)

        lookup_docs = {}
        if doc_keys:
            svm = StoredValueManager(Container.instance)
            doc_list = svm.read_value_mult(doc_keys)
            lookup_docs = dict(zip(doc_keys, doc_list))

        for lv in self._lookup_values():
            context = self.context(lv)
            if context.document_key:
                document_key = context.document_key
                if "$designator" in context.document_key and "reference_designator" in self._stream_config:
                    document_key = document_key.replace("$designator", self._stream_config["reference_designator"])
                doc = lookup_docs[document_key]
                if doc is None:
                    log.debug("Reference Document for %s not found", document_key)
                    continue
                if context.lookup_value in doc:
                    self[lv] = [doc[context.lookup_value]] * self._shp[0] if self._shp else doc[context.lookup_value]
class StoredValueTransform(TransformStreamListener):
    '''
    Receives granules from a stream and persists the latest value in the object store

    Background:
        Platforms publish geospatial information on a separate stream from the 
        instruments. Various data products require geospatial information about
        the instrument to calculate the variables. This component persists the
        latest value in a simple data storage container where complex data
        containers can access it.
    '''
        
    def on_start(self):
        TransformStreamListener.on_start(self)
        self.document_key = self.CFG.get_safe('process.document_key')
        self.stored_value_manager = StoredValueManager(self.container)

    def recv_packet(self, msg, route, stream_id):
        rdt = RecordDictionaryTool.load_from_granule(msg)

        document = {}

        for k,v in rdt.iteritems():
            value_array = np.atleast_1d(v[:])
            if 'f' in value_array.dtype.str:
                document[k] = float(value_array[-1])
            elif 'i' in value_array.dtype.str:
                document[k] = int(value_array[-1])

        self.stored_value_manager.stored_value_cas(self.document_key, document)
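
# A minimal read-back sketch (not part of the original examples, hypothetical
# helper name): recv_packet above keeps only numeric parameters and persists
# just the latest sample of each, so a consumer with access to the same
# container and the same 'process.document_key' could fetch the stored
# document like this.
def read_latest_stored_value(container, document_key):
    svm = StoredValueManager(container)
    try:
        # The document most recently written by StoredValueTransform for this key
        return svm.read_value(document_key)
    except NotFound:
        return None
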
Example #5
class StoredValueTransform(TransformStreamListener):
    '''
    Receives granules from a stream and persists the latest value in the object store

    Background:
        Platforms publish geospatial information on a separate stream from the 
        instruments. Various data products require geospatial information about
        the instrument to calculate the variables. This component persists the
        latest value in a simple data storage container where complex data
        containers can access it.
    '''
    def on_start(self):
        TransformStreamListener.on_start(self)
        self.document_key = self.CFG.get_safe('process.document_key')
        self.stored_value_manager = StoredValueManager(self.container)

    def recv_packet(self, msg, route, stream_id):
        rdt = RecordDictionaryTool.load_from_granule(msg)

        document = {}

        for k, v in rdt.iteritems():
            value_array = np.atleast_1d(v[:])
            if 'f' in value_array.dtype.str:
                document[k] = float(value_array[-1])
            elif 'i' in value_array.dtype.str:
                document[k] = int(value_array[-1])

        self.stored_value_manager.stored_value_cas(self.document_key, document)
Example #6
    def parse_document(cls, container, parser, document_path):
        svm = StoredValueManager(container)
        document = ''
        with open(document_path, 'r') as f:
            document = f.read()
        for k, v in parser(document):
            svm.stored_value_cas(k, v)
        return
Example #7
    def parse_document(cls, container, parser, document_path):
        svm = StoredValueManager(container)
        document = ''
        with open(document_path, 'r') as f:
            document = f.read()
        for k, v in parser(document):
            svm.stored_value_cas(k, v)
        return
Example #8
class TestParsers(IonIntegrationTestCase):
    def setUp(self):
        self._start_container()
        self.svm = StoredValueManager(self.container)

    @classmethod
    def parse_document(cls, container, parser, document_path):
        svm = StoredValueManager(container)
        document = ''
        with open(document_path,'r') as f:
            document = f.read()
        for k,v in parser(document):
            svm.stored_value_cas(k,v)
        return


    def test_grt_parser(self):
        self.parse_document(self.container, grt_parser, qc_paths['grt'])

        ret_doc = self.svm.read_value('grt_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'CONDWAT'))
        np.testing.assert_almost_equal(ret_doc['grt_min_value'], 0.)
        np.testing.assert_almost_equal(ret_doc['grt_max_value'], 66000.)

    def test_spike_parser(self):
        self.parse_document(self.container, spike_parser, qc_paths['spike'])

        ret_doc = self.svm.read_value('spike_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'PRACSAL'))
        np.testing.assert_almost_equal(ret_doc['acc'], 0.005)
        np.testing.assert_almost_equal(ret_doc['spike_n'], 11.)
        np.testing.assert_almost_equal(ret_doc['spike_l'], 15.)

        self.assertRaises(NotFound, self.svm.read_value,'spike_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'CLOWN'))


    def test_stuck_value_parser(self):
        self.parse_document(self.container, stuck_value_test_parser, qc_paths['stuck'])

        ret_doc = self.svm.read_value('svt_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'PRACSAL'))
        np.testing.assert_almost_equal(ret_doc['svt_resolution'], 1e-9)
        np.testing.assert_almost_equal(ret_doc['svt_n'], 1000000)

    def test_trend_value_parser(self):
        self.parse_document(self.container, trend_parser, qc_paths['trend'])

        ret_doc = self.svm.read_value('trend_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'PRACSAL'))
        np.testing.assert_almost_equal(ret_doc['time_interval'], 365.0)
        np.testing.assert_almost_equal(ret_doc['polynomial_order'], 1.0)
        np.testing.assert_almost_equal(ret_doc['standard_deviation'], 0.0)

    
    def test_lrt_parser(self):
        self.parse_document(self.container, lrt_parser, qc_paths['lrt'])

        ret_doc = self.svm.read_value('lrt_%s_%s' %('GP02HYPM-SP001-04-CTDPF0999', 'PRACSAL'))

        np.testing.assert_array_equal(ret_doc['datlim'][0], np.array([32.289, 32.927]))
        self.assertEquals(ret_doc['dims'], ['pressure', 'month'])

    def test_global_range_test_parser(self):
        svm = StoredValueManager(self.container)
        for key,doc in grt_parser(grt_sample_doc):
            svm.stored_value_cas(key,doc)
            self.addCleanup(svm.delete_stored_value,key)

        doc = svm.read_value('grt_sbe37-abc123_TEMPWAT_UHHH')
        self.assertEquals(doc['grt_min_value'], 0.)
        self.assertEquals(doc['array'],'array 1')
        doc = svm.read_value('grt_sbe37-abc123_PRESWAT_flagged')
        self.assertEquals(doc['grt_max_value'], 689.47)
Example #10
    def fetch_lookup_values(self):
        for lv in self._lookup_values():
            context = self.context(lv)
            if context.document_key:
                svm = StoredValueManager(Container.instance)
                try:
                    doc = svm.read_value(context.document_key)
                except NotFound:
                    continue
                if context.lookup_value in doc:
                    self[lv] = doc[context.lookup_value]
Example #11
    def test_trend_value_test(self):
        svm = StoredValueManager(self.container)
        for key, doc in trend_parser(trend_value_test_sample_doc):
            svm.stored_value_cas(key, doc)
            self.addCleanup(svm.delete_stored_value, key)

        doc = svm.read_value("trend_ssxbt-ssn719_PRESSURE")
        self.assertEquals(doc["time_interval"], 1.0)
        self.assertEquals(doc["standard_deviation"], 0.0)

        doc = svm.read_value("trend_ssxbt-ssn719_COND")
        self.assertEquals(doc["time_interval"], 3.0)
        self.assertEquals(doc["polynomial_order"], "third")

    def test_gradient_test(self):
        svm = StoredValueManager(self.container)
        for key,doc in gradient_test_parser(gradient_test_sample_doc):
            svm.stored_value_cas(key,doc)
            self.addCleanup(svm.delete_stored_value,key)

        doc = svm.read_value('grad_CHEL-C-32-12_PRESSURE_DENSITY')
        self.assertEquals(doc['units_dat'], 'dbar')
        self.assertEquals(doc['d_dat_dx'], 2.781)

        doc = svm.read_value('grad_CHEW-BA-C-A12_PRESSURE_DENSITY')
        self.assertEquals(doc['units_x'], 'kg m-3')
        self.assertEquals(doc['tol_dat'], 12.)

    def test_stuck_value_test(self):
        svm = StoredValueManager(self.container)
        for key,doc in stuck_value_test_parser(stuck_value_test_sample_doc):
            svm.stored_value_cas(key,doc)
            self.addCleanup(svm.delete_stored_value,key)

        doc = svm.read_value('svt_ssxbt-ssn719_PRESSURE')
        self.assertEquals(doc['svt_resolution'], 1.2)
        self.assertEquals(doc['svt_n'], 10.)

        doc = svm.read_value('svt_ssxbt-ssn719_COND')
        self.assertEquals(doc['svt_resolution'], 0.2)
        self.assertEquals(doc['units'], 'S m-1')

    def fetch_lookup_values(self):
        for lv in self._lookup_values():
            context = self.context(lv)
            if context.document_key:
                document_key = context.document_key
                if '$designator' in context.document_key and 'reference_designator' in self._stream_config:
                    document_key = document_key.replace('$designator', self._stream_config['reference_designator'])
                svm = StoredValueManager(Container.instance)
                try:
                    doc = svm.read_value(document_key)
                except NotFound:
                    log.debug('Reference Document for %s not found', document_key)
                    continue
                if context.lookup_value in doc:
                    self[lv] = [doc[context.lookup_value]] * self._shp[0] if self._shp else doc[context.lookup_value]
Example #15
class TestParsers(IonIntegrationTestCase):
    def setUp(self):
        self._start_container()
        self.svm = StoredValueManager(self.container)

    @classmethod
    def parse_document(cls, container, parser, document_path):
        svm = StoredValueManager(container)
        document = ''
        with open(document_path,'r') as f:
            document = f.read()
        for k,v in parser(document):
            svm.stored_value_cas(k,v)
        return


    def test_grt_parser(self):
        self.parse_document(self.container, grt_parser, qc_paths['grt'])

        ret_doc = self.svm.read_value('grt_%s_%s' % ('GA01SUMO-RI003-03-CTDMOQ999', 'CONDWAT'))
        np.testing.assert_almost_equal(ret_doc['grt_min_value'], 0.)
        np.testing.assert_almost_equal(ret_doc['grt_max_value'], 66000.)

    def test_spike_parser(self):
        self.parse_document(self.container, spike_parser, qc_paths['spike'])

        ret_doc = self.svm.read_value('spike_%s_%s' % ('GA01SUMO-RI003-03-CTDMOQ999', 'PRACSAL'))
        np.testing.assert_almost_equal(ret_doc['acc'], 0.005)
        np.testing.assert_almost_equal(ret_doc['spike_n'], 11.)
        np.testing.assert_almost_equal(ret_doc['spike_l'], 15.)

        self.assertRaises(NotFound, self.svm.read_value,'spike_%s_%s' % ('GA01SUMO-RI003-03-CTDMOQ999', 'DENSITY'))


    def test_stuck_value_parser(self):
        self.parse_document(self.container, stuck_value_test_parser, qc_paths['stuck'])

        ret_doc = self.svm.read_value('svt_%s_%s' % ('GA01SUMO-RI003-03-CTDMOQ999', 'PRACSAL'))
        np.testing.assert_almost_equal(ret_doc['svt_resolution'], 0.0001)
        np.testing.assert_almost_equal(ret_doc['svt_n'], 20.)

    def test_trend_value_parser(self):
        self.parse_document(self.container, trend_parser, qc_paths['trend'])

        ret_doc = self.svm.read_value('trend_%s_%s' % ('RS01SBVM-LJ01A-05-HPIESA101', 'IESPRES'))
        np.testing.assert_almost_equal(ret_doc['time_interval'], 90.)
        np.testing.assert_almost_equal(ret_doc['polynomial_order'], 1.0)
        np.testing.assert_almost_equal(ret_doc['standard_deviation'], 5.0)
Example #16
    def on_start(self): #pragma no cover
        #--------------------------------------------------------------------------------
        # Explicit on_start
        #--------------------------------------------------------------------------------

        # Skip TransformStreamListener and go to StreamProcess to avoid the subscriber being created
        # We want explicit management of the thread and subscriber object for ingestion

        TransformStreamProcess.on_start(self)
        
        self.queue_name = self.CFG.get_safe('process.queue_name',self.id)
        self.subscriber = StreamSubscriber(process=self, exchange_name=self.queue_name, callback=self.receive_callback)
        self.thread_lock = RLock()
        
        #--------------------------------------------------------------------------------
        # Normal on_start after this point
        #--------------------------------------------------------------------------------

        BaseIngestionWorker.on_start(self)
        self._rpc_server = self.container.proc_manager._create_listening_endpoint(from_name=self.id, process=self)
        self.add_endpoint(self._rpc_server)

        self.event_publisher = EventPublisher(OT.DatasetModified)
        self.stored_value_manager = StoredValueManager(self.container)

        self.lookup_docs = self.CFG.get_safe('process.lookup_docs',[])
        self.input_product = self.CFG.get_safe('process.input_product','')
        self.new_lookups = Queue()
        self.lookup_monitor = EventSubscriber(event_type=OT.ExternalReferencesUpdatedEvent, callback=self._add_lookups, auto_delete=True)
        self.add_endpoint(self.lookup_monitor)
        self.connection_id = ''
        self.connection_index = None
        
        self.start_listener()
Example #17
    def on_start(self):
        TransformDataProcess.on_start(self)
        self.pubsub_management = PubsubManagementServiceProcessClient(
            process=self)
        self.stored_values = StoredValueManager(self.container)
        self.input_data_product_ids = self.CFG.get_safe(
            'process.input_products', [])
        self.output_data_product_ids = self.CFG.get_safe(
            'process.output_products', [])
        self.lookup_docs = self.CFG.get_safe('process.lookup_docs', [])
        self.new_lookups = Queue()
        self.lookup_monitor = EventSubscriber(
            event_type=OT.ExternalReferencesUpdatedEvent,
            callback=self._add_lookups,
            auto_delete=True)
        self.lookup_monitor.start()
Example #18
    def test_rdt_lookup(self):
        rdt = self.create_lookup_rdt()

        self.assertTrue('offset_a' in rdt.lookup_values())
        self.assertFalse('offset_b' in rdt.lookup_values())

        rdt['time'] = [0]
        rdt['temp'] = [10.0]
        rdt['offset_a'] = [2.0]
        self.assertEquals(rdt['offset_b'], None)
        self.assertEquals(rdt.lookup_values(), ['offset_a'])
        np.testing.assert_array_almost_equal(rdt['calibrated'], np.array([12.0]))

        svm = StoredValueManager(self.container)
        svm.stored_value_cas('coefficient_document', {'offset_b':2.0})
        rdt.fetch_lookup_values()
        np.testing.assert_array_equal(rdt['offset_b'], np.array([2.0]))
        np.testing.assert_array_equal(rdt['calibrated_b'], np.array([14.0]))
Example #19
    def setUp(self):
        DMTestCase.setUp(self)
        self.ph = ParameterHelper(self.dataset_management, self.addCleanup)
        pdict_id = self.ph.create_simple_qc_pdict()

        self.stream_def_id = self.pubsub_management.create_stream_definition('global range', parameter_dictionary_id=pdict_id, stream_configuration={'reference_designator':'QCTEST'})
        self.addCleanup(self.pubsub_management.delete_stream_definition, self.stream_def_id)

        self.rdt = RecordDictionaryTool(stream_definition_id=self.stream_def_id)
        self.svm = StoredValueManager(self.container)
Example #20
    def on_start(self):
        TransformDataProcess.on_start(self)
        self.pubsub_management = PubsubManagementServiceProcessClient(process=self)
        self.stored_values = StoredValueManager(self.container)
        self.input_data_product_ids = self.CFG.get_safe('process.input_products', [])
        self.output_data_product_ids = self.CFG.get_safe('process.output_products', [])
        self.lookup_docs = self.CFG.get_safe('process.lookup_docs', [])
        self.new_lookups = Queue()
        self.lookup_monitor = EventSubscriber(event_type=OT.ExternalReferencesUpdatedEvent, callback=self._add_lookups, auto_delete=True)
        self.lookup_monitor.start()
Example #21
    def fetch_lookup_values(self):
        for lv in self._lookup_values():
            context = self.context(lv)
            if context.document_key:
                document_key = context.document_key
                if '$designator' in context.document_key and 'reference_designator' in self._stream_config:
                    document_key = document_key.replace(
                        '$designator',
                        self._stream_config['reference_designator'])
                svm = StoredValueManager(Container.instance)
                try:
                    doc = svm.read_value(document_key)
                except NotFound:
                    log.debug('Reference Document for %s not found',
                              document_key)
                    continue
                if context.lookup_value in doc:
                    self[lv] = ([doc[context.lookup_value]] * self._shp[0]
                                if self._shp else doc[context.lookup_value])
Example #22
    def test_global_range_lookup(self):
        reference_designator = "CE01ISSM-MF005-01-CTDBPC999"
        ph = ParameterHelper(self.dataset_management, self.addCleanup)
        pdict_id = ph.create_simple_qc_pdict()
        svm = StoredValueManager(self.container)
        doc_key = 'grt_%s_TEMPWAT' % reference_designator
        svm.stored_value_cas(doc_key, {'grt_min_value':-2, 'grt_max_value':40})
        
        stream_def_id = self.pubsub_management.create_stream_definition('qc parsed', parameter_dictionary_id=pdict_id, stream_configuration={'reference_designator':reference_designator})
        self.addCleanup(self.pubsub_management.delete_stream_definition,stream_def_id)
        rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
        rdt['time'] = [0]
        rdt['temp'] = [20]
        rdt.fetch_lookup_values()
        min_field = [i for i in rdt.fields if 'grt_min_value' in i][0]
        max_field = [i for i in rdt.fields if 'grt_max_value' in i][0]

        np.testing.assert_array_almost_equal(rdt[min_field], [-2.])
        np.testing.assert_array_almost_equal(rdt[max_field], [40.])

        np.testing.assert_array_almost_equal(rdt['tempwat_glblrng_qc'],[1])
Example #23
    def populate_qc_tables(self):
        svm = StoredValueManager(self.container)
        svm.stored_value_cas('grt_QCTEST_TEMPWAT', {
            'grt_min_value': -2.,
            'grt_max_value': 40.
        })
        svm.stored_value_cas('svt_QCTEST_TEMPWAT', {
            'svt_resolution': 0.001,
            'svt_n': 4
        })
        svm.stored_value_cas('spike_QCTEST_TEMPWAT', {
            'acc': 0.1,
            'spike_n': 5,
            'spike_l': 5
        })

    def on_start(self): #pragma no cover
        super(ScienceGranuleIngestionWorker,self).on_start()
        self.event_publisher = EventPublisher(OT.DatasetModified)
        self.stored_value_manager = StoredValueManager(self.container)

        self.lookup_docs = self.CFG.get_safe('process.lookup_docs',[])
        self.input_product = self.CFG.get_safe('process.input_product','')
        self.qc_enabled = self.CFG.get_safe('process.qc_enabled', True)
        self.new_lookups = Queue()
        self.lookup_monitor = EventSubscriber(event_type=OT.ExternalReferencesUpdatedEvent, callback=self._add_lookups, auto_delete=True)
        self.lookup_monitor.start()
        self.qc_publisher = EventPublisher(event_type=OT.ParameterQCEvent)
        self.connection_id = ''
        self.connection_index = None

    def test_basic_stored_docs(self):

        doc = {'key':'value'}
        doc_key = 'doc_id'

        svm = StoredValueManager(self.container)
        doc_id, rev = svm.stored_value_cas(doc_key, doc)
        self.addCleanup(svm.delete_stored_value, doc_key)

        doc = svm.read_value(doc_key)
        self.assertTrue('key' in doc and doc['key']=='value')
        svm.stored_value_cas(doc_key,{'key2':'value2'})
        doc = svm.read_value(doc_key)
        self.assertTrue('key' in doc and doc['key']=='value')
        self.assertTrue('key2' in doc and doc['key2']=='value2')

    def on_start(self): #pragma no cover
        #--------------------------------------------------------------------------------
        # Explicit on_start
        #--------------------------------------------------------------------------------

        # Skip TransformStreamListener and go to StreamProcess to avoid the subscriber being created
        # We want explicit management of the thread and subscriber object for ingestion

        TransformStreamProcess.on_start(self)
        
        self.queue_name = self.CFG.get_safe('process.queue_name',self.id)
        self.subscriber = StreamSubscriber(process=self, exchange_name=self.queue_name, callback=self.receive_callback)
        self.thread_lock = RLock()
        
        #--------------------------------------------------------------------------------
        # Normal on_start after this point
        #--------------------------------------------------------------------------------

        BaseIngestionWorker.on_start(self)
        self._rpc_server = self.container.proc_manager._create_listening_endpoint(from_name=self.id, process=self)
        self.add_endpoint(self._rpc_server)

        self.event_publisher = EventPublisher(OT.DatasetModified)
        self.stored_value_manager = StoredValueManager(self.container)

        self.lookup_docs = self.CFG.get_safe('process.lookup_docs',[])
        self.input_product = self.CFG.get_safe('process.input_product','')
        self.qc_enabled = self.CFG.get_safe('process.qc_enabled', True)
        self.ignore_gaps = self.CFG.get_safe('service.ingestion.ignore_gaps', True)
        if not self.ignore_gaps:
            log.warning("Gap handling is not supported in release 2")
        self.ignore_gaps = True
        self.new_lookups = Queue()
        self.lookup_monitor = EventSubscriber(event_type=OT.ExternalReferencesUpdatedEvent, callback=self._add_lookups, auto_delete=True)
        self.add_endpoint(self.lookup_monitor)
        self.qc_publisher = EventPublisher(event_type=OT.ParameterQCEvent)
        self.connection_id = ''
        self.connection_index = None
        
        self.start_listener()
Example #27
    def test_rdt_lookup(self):
        rdt = self.create_lookup_rdt()

        self.assertTrue('offset_a' in rdt.lookup_values())
        self.assertFalse('offset_b' in rdt.lookup_values())

        rdt['time'] = [0]
        rdt['temp'] = [10.0]
        rdt['offset_a'] = [2.0]
        self.assertEquals(rdt['offset_b'], None)
        self.assertEquals(rdt.lookup_values(), ['offset_a'])
        np.testing.assert_array_almost_equal(rdt['calibrated'], np.array([12.0]))

        svm = StoredValueManager(self.container)
        svm.stored_value_cas('coefficient_document', {'offset_b':2.0})
        svm.stored_value_cas("GA03FLMA-RI001-13-CTDMOG999_OFFSETC", {'offset_c':3.0})
        rdt.fetch_lookup_values()
        np.testing.assert_array_equal(rdt['offset_b'], np.array([2.0]))
        np.testing.assert_array_equal(rdt['calibrated_b'], np.array([14.0]))
        np.testing.assert_array_equal(rdt['offset_c'], np.array([3.0]))
Example #28
class TestQCFunctions(DMTestCase):
    def setUp(self):
        DMTestCase.setUp(self)
        self.ph = ParameterHelper(self.dataset_management, self.addCleanup)
        pdict_id = self.ph.create_simple_qc_pdict()

        self.stream_def_id = self.pubsub_management.create_stream_definition('global range', parameter_dictionary_id=pdict_id, stream_configuration={'reference_designator':'QCTEST'})
        self.addCleanup(self.pubsub_management.delete_stream_definition, self.stream_def_id)

        self.rdt = RecordDictionaryTool(stream_definition_id=self.stream_def_id)
        self.svm = StoredValueManager(self.container)

    def test_global_range_test(self):
        self.svm.stored_value_cas('grt_QCTEST_TEMPWAT', {'grt_min_value':10., 'grt_max_value':20.})

        self.rdt['time'] = np.arange(8)
        self.rdt['temp'] = [9, 10, 16, 17, 18, 19, 20, 25]
        self.rdt.fetch_lookup_values()
        np.testing.assert_array_almost_equal(self.rdt['tempwat_glblrng_qc'], [0, 1, 1, 1, 1, 1, 1, 0])

    def test_spike_test(self): # I know how redundant this sounds
        self.svm.stored_value_cas('spike_QCTEST_TEMPWAT', {'acc':0.1, 'spike_n':5., 'spike_l':5.})

        self.rdt['time'] = np.arange(8)
        self.rdt['temp'] = [-1, 3, 40, -1, 1, -6, -6, 1]
        self.rdt.fetch_lookup_values()

        np.testing.assert_array_almost_equal(self.rdt['tempwat_spketst_qc'], [1, 1, 0, 1, 1, 1, 1, 1])

    def test_stuck_value_test(self):
        self.svm.stored_value_cas('svt_QCTEST_TEMPWAT', {'svt_resolution':0.001, 'svt_n': 4.})

        self.rdt['time'] = np.arange(10)
        self.rdt['temp'] = [4.83, 1.40, 3.33, 3.33, 3.33, 3.33, 4.09, 2.97, 2.85, 3.67]
        self.rdt.fetch_lookup_values()

        np.testing.assert_array_almost_equal(self.rdt['tempwat_stuckvl_qc'], [1, 1, 0, 0, 0, 0, 1, 1, 1, 1])
Example #29
    def setUp(self):
        self._start_container()
        self.svm = StoredValueManager(self.container)
Example #30
class TestParsers(IonIntegrationTestCase):
    def setUp(self):
        self._start_container()
        self.svm = StoredValueManager(self.container)

    @classmethod
    def parse_document(cls, container, parser, document_path):
        svm = StoredValueManager(container)
        document = ''
        with open(document_path, 'r') as f:
            document = f.read()
        for k, v in parser(document):
            svm.stored_value_cas(k, v)
        return

    def test_grt_parser(self):
        self.parse_document(self.container, grt_parser, qc_paths['grt'])

        ret_doc = self.svm.read_value(
            'grt_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'CONDWAT'))
        np.testing.assert_almost_equal(ret_doc['grt_min_value'], 0.)
        np.testing.assert_almost_equal(ret_doc['grt_max_value'], 66000.)

    def test_spike_parser(self):
        self.parse_document(self.container, spike_parser, qc_paths['spike'])

        ret_doc = self.svm.read_value(
            'spike_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'PRACSAL'))
        np.testing.assert_almost_equal(ret_doc['acc'], 0.005)
        np.testing.assert_almost_equal(ret_doc['spike_n'], 11.)
        np.testing.assert_almost_equal(ret_doc['spike_l'], 15.)

        self.assertRaises(
            NotFound, self.svm.read_value,
            'spike_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'CLOWN'))

    def test_stuck_value_parser(self):
        self.parse_document(self.container, stuck_value_test_parser,
                            qc_paths['stuck'])

        ret_doc = self.svm.read_value(
            'svt_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'PRACSAL'))
        np.testing.assert_almost_equal(ret_doc['svt_resolution'], 1e-9)
        np.testing.assert_almost_equal(ret_doc['svt_n'], 1000000)

    def test_trend_value_parser(self):
        self.parse_document(self.container, trend_parser, qc_paths['trend'])

        ret_doc = self.svm.read_value(
            'trend_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'PRACSAL'))
        np.testing.assert_almost_equal(ret_doc['time_interval'], 365.0)
        np.testing.assert_almost_equal(ret_doc['polynomial_order'], 1.0)
        np.testing.assert_almost_equal(ret_doc['standard_deviation'], 0.0)

    def test_lrt_parser(self):
        self.parse_document(self.container, lrt_parser, qc_paths['lrt'])

        ret_doc = self.svm.read_value(
            'lrt_%s_%s' % ('GP02HYPM-SP001-04-CTDPF0999', 'PRACSAL'))

        np.testing.assert_array_equal(ret_doc['datlim'][0],
                                      np.array([32.289, 32.927]))
        self.assertEquals(ret_doc['dims'], ['pressure', 'month'])
Example #31
    def setUp(self):
        self._start_container()
        self.svm = StoredValueManager(self.container)

    def test_lookup_values(self):
        ph = ParameterHelper(self.dataset_management, self.addCleanup)
        pdict_id = ph.create_lookups()
        stream_def_id = self.pubsubcli.create_stream_definition(
            'lookup', parameter_dictionary_id=pdict_id)
        self.addCleanup(self.pubsubcli.delete_stream_definition, stream_def_id)

        data_product = DataProduct(name='lookup data product')
        tdom, sdom = time_series_domain()
        data_product.temporal_domain = tdom.dump()
        data_product.spatial_domain = sdom.dump()

        data_product_id = self.dpsc_cli.create_data_product(
            data_product, stream_definition_id=stream_def_id)
        self.addCleanup(self.dpsc_cli.delete_data_product, data_product_id)
        data_producer = DataProducer(name='producer')
        data_producer.producer_context = DataProcessProducerContext()
        data_producer.producer_context.configuration['qc_keys'] = [
            'offset_document'
        ]
        data_producer_id, _ = self.rrclient.create(data_producer)
        self.addCleanup(self.rrclient.delete, data_producer_id)
        assoc, _ = self.rrclient.create_association(
            subject=data_product_id,
            object=data_producer_id,
            predicate=PRED.hasDataProducer)
        self.addCleanup(self.rrclient.delete_association, assoc)

        document_keys = self.damsclient.list_qc_references(data_product_id)

        self.assertEquals(document_keys, ['offset_document'])
        svm = StoredValueManager(self.container)
        svm.stored_value_cas('offset_document', {'offset_a': 2.0})
        self.dpsc_cli.activate_data_product_persistence(data_product_id)
        dataset_ids, _ = self.rrclient.find_objects(subject=data_product_id,
                                                    predicate=PRED.hasDataset,
                                                    id_only=True)
        dataset_id = dataset_ids[0]

        dataset_monitor = DatasetMonitor(dataset_id)
        self.addCleanup(dataset_monitor.stop)

        rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
        rdt['time'] = [0]
        rdt['temp'] = [20.]
        granule = rdt.to_granule()

        stream_ids, _ = self.rrclient.find_objects(subject=data_product_id,
                                                   predicate=PRED.hasStream,
                                                   id_only=True)
        stream_id = stream_ids[0]
        route = self.pubsubcli.read_stream_route(stream_id=stream_id)

        publisher = StandaloneStreamPublisher(stream_id, route)
        publisher.publish(granule)

        self.assertTrue(dataset_monitor.event.wait(10))

        granule = self.data_retriever.retrieve(dataset_id)
        rdt2 = RecordDictionaryTool.load_from_granule(granule)
        np.testing.assert_array_equal(rdt['temp'], rdt2['temp'])
        np.testing.assert_array_almost_equal(rdt2['calibrated'],
                                             np.array([22.0]))

        svm.stored_value_cas('updated_document', {'offset_a': 3.0})
        dataset_monitor = DatasetMonitor(dataset_id)
        self.addCleanup(dataset_monitor.stop)
        ep = EventPublisher(event_type=OT.ExternalReferencesUpdatedEvent)
        ep.publish_event(origin=data_product_id,
                         reference_keys=['updated_document'])

        rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
        rdt['time'] = [1]
        rdt['temp'] = [20.]
        granule = rdt.to_granule()
        gevent.sleep(2)  # Yield so that the event goes through
        publisher.publish(granule)
        self.assertTrue(dataset_monitor.event.wait(10))

        granule = self.data_retriever.retrieve(dataset_id)
        rdt2 = RecordDictionaryTool.load_from_granule(granule)
        np.testing.assert_array_equal(rdt2['temp'], np.array([20., 20.]))
        np.testing.assert_array_almost_equal(rdt2['calibrated'],
                                             np.array([22.0, 23.0]))

    def setUp(self):
        DMTestCase.setUp(self)
        self.ph = ParameterHelper(self.dataset_management, self.addCleanup)
        self.pdict_id = self.ph.create_simple_qc_pdict()
        self.svm = StoredValueManager(self.container)

class ScienceGranuleIngestionWorker(TransformStreamListener,
                                    BaseIngestionWorker):
    CACHE_LIMIT = CFG.get_safe('container.ingestion_cache', 5)

    def __init__(self, *args, **kwargs):
        TransformStreamListener.__init__(self, *args, **kwargs)
        BaseIngestionWorker.__init__(self, *args, **kwargs)

        #--------------------------------------------------------------------------------
        # Ingestion Cache
        # - Datasets
        # - Coverage instances
        #--------------------------------------------------------------------------------
        self._datasets = collections.OrderedDict()
        self._coverages = collections.OrderedDict()

        self._bad_coverages = {}

        self.time_stats = Accumulator(format='%3f')
        # unique ID to identify this worker in log msgs
        self._id = uuid.uuid1()

    def on_start(self):  #pragma no cover
        #--------------------------------------------------------------------------------
        # Explicit on_start
        #--------------------------------------------------------------------------------

        # Skip TransformStreamListener and go to StreamProcess to avoid the subscriber being created
        # We want explicit management of the thread and subscriber object for ingestion

        TransformStreamProcess.on_start(self)

        self.queue_name = self.CFG.get_safe('process.queue_name', self.id)
        self.subscriber = StreamSubscriber(process=self,
                                           exchange_name=self.queue_name,
                                           callback=self.receive_callback)
        self.thread_lock = RLock()

        #--------------------------------------------------------------------------------
        # Normal on_start after this point
        #--------------------------------------------------------------------------------

        BaseIngestionWorker.on_start(self)
        self._rpc_server = self.container.proc_manager._create_listening_endpoint(
            from_name=self.id, process=self)
        self.add_endpoint(self._rpc_server)

        self.event_publisher = EventPublisher(OT.DatasetModified)
        self.stored_value_manager = StoredValueManager(self.container)

        self.lookup_docs = self.CFG.get_safe('process.lookup_docs', [])
        self.input_product = self.CFG.get_safe('process.input_product', '')
        self.qc_enabled = self.CFG.get_safe('process.qc_enabled', True)
        self.ignore_gaps = self.CFG.get_safe('service.ingestion.ignore_gaps',
                                             False)
        self.new_lookups = Queue()
        self.lookup_monitor = EventSubscriber(
            event_type=OT.ExternalReferencesUpdatedEvent,
            callback=self._add_lookups,
            auto_delete=True)
        self.add_endpoint(self.lookup_monitor)
        self.qc_publisher = EventPublisher(event_type=OT.ParameterQCEvent)
        self.connection_id = ''
        self.connection_index = None

        self.start_listener()

    def on_quit(self):  #pragma no cover
        self.event_publisher.close()
        self.qc_publisher.close()
        if self.subscriber_thread:
            self.stop_listener()
        for stream, coverage in self._coverages.iteritems():
            try:
                coverage.close(timeout=5)
            except:
                log.exception('Problems closing the coverage')
        self._coverages.clear()
        TransformStreamListener.on_quit(self)
        BaseIngestionWorker.on_quit(self)

    def start_listener(self):
        # We use a lock here to prevent possible race conditions from starting multiple listeners and coverage clobbering
        with self.thread_lock:
            self.subscriber_thread = self._process.thread_manager.spawn(
                self.subscriber.listen, thread_name='%s-subscriber' % self.id)

    def stop_listener(self):
        # Avoid race conditions with coverage operations (Don't start a listener at the same time as closing one)
        with self.thread_lock:
            self.subscriber.close()
            self.subscriber_thread.join(timeout=10)
            for stream, coverage in self._coverages.iteritems():
                try:
                    coverage.close(timeout=5)
                except:
                    log.exception('Problems closing the coverage')
            self._coverages.clear()
            self.subscriber_thread = None

    def pause(self):
        if self.subscriber_thread is not None:
            self.stop_listener()

    def resume(self):
        if self.subscriber_thread is None:
            self.start_listener()

    def _add_lookups(self, event, *args, **kwargs):
        if event.origin == self.input_product:
            if isinstance(event.reference_keys, list):
                self.new_lookups.put(event.reference_keys)

    def _new_dataset(self, stream_id):
        '''
        Adds a new dataset to the internal cache of the ingestion worker
        '''
        rr_client = ResourceRegistryServiceClient()
        datasets, _ = rr_client.find_subjects(subject_type=RT.Dataset,
                                              predicate=PRED.hasStream,
                                              object=stream_id,
                                              id_only=True)
        if datasets:
            return datasets[0]
        return None

    def get_dataset(self, stream_id):
        '''
        Memoization (LRU) of _new_dataset
        '''
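        # self._datasets is used as a small LRU cache: a hit is popped and
        # re-inserted at the end, and popitem(0) drops the oldest entry once
        # CACHE_LIMIT is reached.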
        try:
            result = self._datasets.pop(stream_id)
        except KeyError:
            result = self._new_dataset(stream_id)
            if result is None:
                return None
            if len(self._datasets) >= self.CACHE_LIMIT:
                self._datasets.popitem(0)
        self._datasets[stream_id] = result
        return result

    def get_coverage(self, stream_id):
        '''
        Memoization (LRU) of _get_coverage
        '''
        try:
            result = self._coverages.pop(stream_id)
        except KeyError:
            dataset_id = self.get_dataset(stream_id)
            if dataset_id is None:
                return None
            result = DatasetManagementService._get_simplex_coverage(dataset_id,
                                                                    mode='a')
            if result is None:
                return None
            if len(self._coverages) >= self.CACHE_LIMIT:
                k, coverage = self._coverages.popitem(0)
                coverage.close(timeout=5)
        self._coverages[stream_id] = result
        return result

    def gap_coverage(self, stream_id):
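        # When a gap is detected the cached coverage is replaced with a fresh
        # simplex coverage; add_granule() splices it in later (splice_coverage).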
        try:
            old_cov = self._coverages.pop(stream_id)
            dataset_id = self.get_dataset(stream_id)
            sdom, tdom = time_series_domain()
            new_cov = DatasetManagementService._create_simplex_coverage(
                dataset_id, old_cov.parameter_dictionary, sdom, tdom,
                old_cov._persistence_layer.inline_data_writes)
            old_cov.close()
            result = new_cov
        except KeyError:
            result = self.get_coverage(stream_id)
        self._coverages[stream_id] = result
        return result

    def dataset_changed(self, dataset_id, extents, window):
        self.event_publisher.publish_event(origin=dataset_id,
                                           author=self.id,
                                           extents=extents,
                                           window=window)

    def evaluate_qc(self, rdt, dataset_id):
        if self.qc_enabled:
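            # Only *_glblrng_qc and *_loclrng_qc flags are inspected here; any
            # sample flagged 0 results in a ParameterQCEvent carrying the
            # offending timestamps (see flag_qc_parameter).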
            for field in rdt.fields:
                if not (field.endswith('glblrng_qc')
                        or field.endswith('loclrng_qc')):
                    continue
                try:
                    values = rdt[field]
                    if values is not None:
                        if not all(values):
                            topology = np.where(values == 0)
                            timestamps = rdt[rdt.temporal_parameter][
                                topology[0]]
                            self.flag_qc_parameter(dataset_id, field,
                                                   timestamps.tolist(), {})
                except:
                    continue

    def flag_qc_parameter(self, dataset_id, parameter, temporal_values,
                          configuration):
        data_product_ids, _ = self.container.resource_registry.find_subjects(
            object=dataset_id,
            predicate=PRED.hasDataset,
            subject_type=RT.DataProduct,
            id_only=True)
        for data_product_id in data_product_ids:
            description = 'Automated Quality Control Alerted on %s' % parameter
            self.qc_publisher.publish_event(origin=data_product_id,
                                            qc_parameter=parameter,
                                            temporal_values=temporal_values,
                                            configuration=configuration,
                                            description=description)

    def update_connection_index(self, connection_id, connection_index):
        self.connection_id = connection_id
        try:
            connection_index = int(connection_index)
            self.connection_index = connection_index
        except ValueError:
            pass

    def has_gap(self, connection_id, connection_index):
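        # A gap is declared when the connection id changes, or when the
        # connection index is not exactly one greater than the last index seen.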
        if connection_id:
            if not self.connection_id:
                self.update_connection_index(connection_id, connection_index)
                return False
            else:
                if connection_id != self.connection_id:
                    return True
        if connection_index:
            if self.connection_index is None:
                self.update_connection_index(connection_id, connection_index)
                return False
            try:
                connection_index = int(connection_index)
                if connection_index != self.connection_index + 1:
                    return True
            except ValueError:
                pass

        return False

    def splice_coverage(self, dataset_id, coverage):
        log.info('Splicing new coverage')
        DatasetManagementService._splice_coverage(dataset_id, coverage)

    @handle_stream_exception()
    def recv_packet(self, msg, stream_route, stream_id):
        ''' receive packet for ingestion '''
        log.trace('received granule for stream %s', stream_id)

        if msg == {}:
            log.error('Received empty message from stream: %s', stream_id)
            return
        # Message validation
        if not isinstance(msg, Granule):
            log.error('Ingestion received a message that is not a granule: %s',
                      msg)
            return

        rdt = RecordDictionaryTool.load_from_granule(msg)
        if rdt is None:
            log.error('Invalid granule (no RDT) for stream %s', stream_id)
            return
        if not len(rdt):
            log.debug('Empty granule for stream %s', stream_id)
            return

        self.persist_or_timeout(stream_id, rdt)

    def persist_or_timeout(self, stream_id, rdt):
        """ retry writing coverage multiple times and eventually time out """
        done = False
        timeout = 2
        start = time.time()
        while not done:
            try:
                self.add_granule(stream_id, rdt)
                done = True
            except:
                log.exception('An issue with coverage, retrying after a bit')
                if (time.time() -
                        start) > MAX_RETRY_TIME:  # After an hour just give up
                    dataset_id = self.get_dataset(stream_id)
                    log.error(
                        "We're giving up, the coverage needs to be inspected %s",
                        DatasetManagementService._get_coverage_path(
                            dataset_id))
                    raise

                if stream_id in self._coverages:
                    log.info('Popping coverage for stream %s', stream_id)
                    self._coverages.pop(stream_id)

                gevent.sleep(timeout)
                if timeout > (60 * 5):
                    timeout = 60 * 5
                else:
                    timeout *= 2

    def expand_coverage(self, coverage, elements, stream_id):
        try:
            coverage.insert_timesteps(elements, oob=False)
        except IOError as e:
            log.error("Couldn't insert time steps for coverage: %s",
                      coverage.persistence_dir,
                      exc_info=True)
            try:
                coverage.close()
            finally:
                self._bad_coverages[stream_id] = 1
                raise CorruptionError(e.message)

    def get_stored_values(self, lookup_value):
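        # Lookup documents announced via ExternalReferencesUpdatedEvent are
        # prepended, so the newest documents take precedence when searching.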
        if not self.new_lookups.empty():
            new_values = self.new_lookups.get()
            self.lookup_docs = new_values + self.lookup_docs
        lookup_value_document_keys = self.lookup_docs
        for key in lookup_value_document_keys:
            try:
                document = self.stored_value_manager.read_value(key)
                if lookup_value in document:
                    return document[lookup_value]
            except NotFound:
                log.warning('Specified lookup document does not exist')
        return None

    def fill_lookup_values(self, rdt):
        rdt.fetch_lookup_values()
        for field in rdt.lookup_values():
            value = self.get_stored_values(rdt.context(field).lookup_value)
            if value:
                rdt[field] = value

    def insert_sparse_values(self, coverage, rdt, stream_id):

        self.fill_lookup_values(rdt)
        for field in rdt.fields:
            if rdt[field] is None:
                continue
            if not isinstance(
                    rdt.context(field).param_type, SparseConstantType):
                # We only set sparse values before insert
                continue
            value = rdt[field]
            try:
                coverage.set_parameter_values(param_name=field, value=value)
            except ValueError as e:
                if "'lower_bound' cannot be >= 'upper_bound'" in e.message:
                    continue
                else:
                    raise
            except IOError as e:
                log.error("Couldn't insert values for coverage: %s",
                          coverage.persistence_dir,
                          exc_info=True)
                try:
                    coverage.close()
                finally:
                    self._bad_coverages[stream_id] = 1
                    raise CorruptionError(e.message)

    def insert_values(self, coverage, rdt, stream_id):
        elements = len(rdt)

        start_index = coverage.num_timesteps - elements

        for k, v in rdt.iteritems():
            if isinstance(v, SparseConstantValue):
                continue
            slice_ = slice(start_index, None)
            try:
                coverage.set_parameter_values(param_name=k,
                                              tdoa=slice_,
                                              value=v)
            except IOError as e:
                log.error("Couldn't insert values for coverage: %s",
                          coverage.persistence_dir,
                          exc_info=True)
                try:
                    coverage.close()
                finally:
                    self._bad_coverages[stream_id] = 1
                    raise CorruptionError(e.message)

        if 'ingestion_timestamp' in coverage.list_parameters():
            t_now = time.time()
            ntp_time = TimeUtils.ts_to_units(
                coverage.get_parameter_context('ingestion_timestamp').uom,
                t_now)
            coverage.set_parameter_values(param_name='ingestion_timestamp',
                                          tdoa=slice_,
                                          value=ntp_time)

    def add_granule(self, stream_id, rdt):
        ''' Appends the granule's data to the coverage and persists it. '''
        debugging = log.isEnabledFor(DEBUG)
        timer = Timer() if debugging else None
        if stream_id in self._bad_coverages:
            log.info(
                'Message attempting to be inserted into bad coverage: %s',
                DatasetManagementService._get_coverage_path(
                    self.get_dataset(stream_id)))

        #--------------------------------------------------------------------------------
        # Gap Analysis
        #--------------------------------------------------------------------------------
        if not self.ignore_gaps:
            gap_found = self.has_gap(rdt.connection_id, rdt.connection_index)
            if gap_found:
                log.error(
                    'Gap Found!   New connection: (%s,%s)\tOld Connection: (%s,%s)',
                    rdt.connection_id, rdt.connection_index,
                    self.connection_id, self.connection_index)
                self.gap_coverage(stream_id)

        #--------------------------------------------------------------------------------
        # Coverage determiniation and appending
        #--------------------------------------------------------------------------------
        dataset_id = self.get_dataset(stream_id)
        if not dataset_id:
            log.error('No dataset could be determined on this stream: %s',
                      stream_id)
            return

        try:
            coverage = self.get_coverage(stream_id)
        except IOError as e:
            log.error(
                "Couldn't open coverage: %s",
                DatasetManagementService._get_coverage_path(
                    self.get_dataset(stream_id)))
            raise CorruptionError(e.message)

        if debugging:
            path = DatasetManagementService._get_coverage_path(dataset_id)
            log.debug(
                '%s: add_granule stream %s dataset %s coverage %r file %s',
                self._id, stream_id, dataset_id, coverage, path)

        if not coverage:
            log.error(
                'Could not persist coverage from granule, coverage is None')
            return
        #--------------------------------------------------------------------------------
        # Actual persistence
        #--------------------------------------------------------------------------------

        elements = len(rdt)
        if rdt[rdt.temporal_parameter] is None:
            elements = 0

        self.insert_sparse_values(coverage, rdt, stream_id)

        if debugging:
            timer.complete_step('checks')  # lightweight ops, should be zero

        self.expand_coverage(coverage, elements, stream_id)

        if debugging:
            timer.complete_step('insert')

        self.insert_values(coverage, rdt, stream_id)

        if debugging:
            timer.complete_step('keys')

        DatasetManagementService._save_coverage(coverage)

        if debugging:
            timer.complete_step('save')

        start_index = coverage.num_timesteps - elements
        self.dataset_changed(dataset_id, coverage.num_timesteps,
                             (start_index, start_index + elements))

        if not self.ignore_gaps and gap_found:
            self.splice_coverage(dataset_id, coverage)

        self.evaluate_qc(rdt, dataset_id)

        if debugging:
            timer.complete_step('notify')
            self._add_timing_stats(timer)

        self.update_connection_index(rdt.connection_id, rdt.connection_index)

    def _add_timing_stats(self, timer):
        """ add stats from latest coverage operation to Accumulator and periodically log results """
        self.time_stats.add(timer)
        if self.time_stats.get_count() % REPORT_FREQUENCY > 0:
            return

        if log.isEnabledFor(TRACE):
            # report per step
            for step in 'checks', 'insert', 'keys', 'save', 'notify':
                log.debug('%s step %s times: %s', self._id, step,
                          self.time_stats.to_string(step))
        # report totals
        log.debug('%s total times: %s', self._id, self.time_stats)
Example #35
class TestQCFunctions(DMTestCase):
    def setUp(self):
        DMTestCase.setUp(self)
        self.ph = ParameterHelper(self.dataset_management, self.addCleanup)
        self.pdict_id = self.ph.create_simple_qc_pdict()
        self.svm = StoredValueManager(self.container)

    def new_rdt(self,ref='QCTEST'):
        self.stream_def_id = self.create_stream_definition(uuid4().hex, parameter_dictionary_id=self.pdict_id, stream_configuration={'reference_designator':ref})
        self.rdt = RecordDictionaryTool(stream_definition_id=self.stream_def_id)

    def test_qc_functions(self):
        self.check_global_range()
        self.check_spike()
        self.check_stuck_value()
        self.check_trend()
        self.check_gradient()
        self.check_localrange()
        self.check_propagate()


    def check_global_range(self):
        log.info('check_global_range')
        self.new_rdt()
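        # Stored-value keys appear to follow the pattern '<qc test>_<reference designator>_<PARAMETER>',
        # e.g. 'grt_QCTEST_TEMPWAT' holds the global range bounds for TEMPWAT on the QCTEST designator.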
        self.svm.stored_value_cas('grt_QCTEST_TEMPWAT', {'grt_min_value':10., 'grt_max_value':20.})

        self.rdt['time'] = np.arange(8)
        self.rdt['temp'] = [9, 10, 16, 17, 18, 19, 20, 25]
        self.rdt.fetch_lookup_values()
        np.testing.assert_array_almost_equal(self.rdt['tempwat_glblrng_qc'], [0, 1, 1, 1, 1, 1, 1, 0])

    def check_spike(self): 
        log.info('check_spike')
        self.new_rdt()
        self.svm.stored_value_cas('spike_QCTEST_TEMPWAT', {'acc':0.1, 'spike_n':5., 'spike_l':5.})

        self.rdt['time'] = np.arange(8)
        self.rdt['temp'] = [-1, 3, 40, -1, 1, -6, -6, 1]
        self.rdt.fetch_lookup_values()

        np.testing.assert_array_almost_equal(self.rdt['tempwat_spketst_qc'], [1, 1, 0, 1, 1, 1, 1, 1])

    def check_stuck_value(self):
        log.info('check_stuck_value')
        self.new_rdt()
        self.svm.stored_value_cas('svt_QCTEST_TEMPWAT', {'svt_resolution':0.001, 'svt_n': 4.})

        self.rdt['time'] = np.arange(10)
        self.rdt['temp'] = [4.83, 1.40, 3.33, 3.33, 3.33, 3.33, 4.09, 2.97, 2.85, 3.67]
        self.rdt.fetch_lookup_values()

        np.testing.assert_array_almost_equal(self.rdt['tempwat_stuckvl_qc'], [1, 1, 0, 0, 0, 0, 1, 1, 1, 1])

    def check_trend(self):
        log.info('check_trend')
        self.new_rdt()
        self.svm.stored_value_cas('trend_QCTEST_TEMPWAT', {'time_interval':0, 'polynomial_order': 1, 'standard_deviation': 3})
        self.rdt['time'] = np.arange(10)
        self.rdt['temp'] = [0.8147, 0.9058, 0.1270, 0.9134, 0.6324, 0.0975, 0.2785, 0.5469, 0.9575, 0.9649]

        self.rdt.fetch_lookup_values()

        np.testing.assert_array_equal(self.rdt['tempwat_trndtst_qc'], [1] * 10)


    def check_propagate(self):
        log.info('check_propagate')
        self.new_rdt()
        self.rdt['time'] = np.arange(8)
        self.rdt['temp'] = [9, 10, 16, 17, 18, 19, 20, 25]
        self.rdt['tempwat_glblrng_qc'] = [0, 1, 1, 1, 1, 1, 1, 0]
        self.rdt['tempwat_spketst_qc'] = [0, 1, 1, 1, 1, 1, 1, 0]
        self.rdt['tempwat_stuckvl_qc'] = [0, 1, 1, 1, 1, 1, 1, 0]
        self.rdt['tempwat_gradtst_qc'] = [0, 1, 1, 1, 1, 1, 1, 0]
        self.rdt['tempwat_trndtst_qc'] = [0, 1, 1, 1, 1, 1, 1, 0]
        self.rdt['tempwat_loclrng_qc'] = [0, 1, 1, 1, 1, 1, 1, 0]
        self.rdt['preswat_glblrng_qc'] = [0, 1, 1, 1, 1, 1, 1, 0]
        self.rdt['preswat_spketst_qc'] = [0, 1, 1, 1, 1, 1, 1, 0]
        self.rdt['preswat_stuckvl_qc'] = [0, 1, 1, 1, 1, 1, 1, 0]
        self.rdt['preswat_gradtst_qc'] = [0, 1, 1, 1, 1, 1, 1, 0]
        self.rdt['preswat_trndtst_qc'] = [0, 1, 1, 1, 1, 1, 1, 0]
        self.rdt['preswat_loclrng_qc'] = [0, 1, 1, 1, 1, 1, 1, 0]
        np.testing.assert_array_equal(self.rdt['cmbnflg_qc'], [0, 1, 1, 1, 1, 1, 1, 0])
    
    def check_gradient(self):
        log.info('check_gradient')
        self.new_rdt()
        self.svm.stored_value_cas('grad_QCTEST_TEMPWAT_time', {'d_dat_dx': 50, 'min_dx': 0, 'start_dat': 0, 'tol_dat': 5})
        self.rdt['time'] = np.arange(5)
        self.rdt['temp'] = [3, 5, 98, 99, 4]
        self.rdt.fetch_lookup_values()

        np.testing.assert_array_equal(self.rdt['tempwat_gradtst_qc'], [1, 1, 0, 0, 1])

    def check_localrange(self):
        log.info('check_localrange')
        self.new_rdt()
        t = np.array([3580144703.7555027, 3580144704.7555027, 3580144705.7555027, 3580144706.7555027, 3580144707.7555027, 3580144708.7555027, 3580144709.7555027, 3580144710.7555027, 3580144711.7555027, 3580144712.7555027])
        pressure = np.random.rand(10) * 2 + 33.0
        t_v = ntp_to_month(t)
        dat = t_v + pressure + np.arange(16,26)
        def lim1(p,m):
            return p+m+10
        def lim2(p,m):
            return p+m+20

        pressure_grid, month_grid = np.meshgrid(np.arange(0,150,10), np.arange(11))
        points = np.column_stack([pressure_grid.flatten(), month_grid.flatten()])
        datlim_0 = lim1(points[:,0], points[:,1])
        datlim_1 = lim2(points[:,0], points[:,1])
        datlim = np.column_stack([datlim_0, datlim_1])
        datlimz = points

        self.svm.stored_value_cas('lrt_QCTEST_TEMPWAT', {'datlim':datlim.tolist(), 'datlimz':datlimz.tolist(), 'dims':['pressure', 'month']})
        self.rdt['time'] = t
        self.rdt['temp'] = dat
        self.rdt['pressure'] = pressure
        
        self.rdt.fetch_lookup_values()

        np.testing.assert_array_equal(self.rdt['tempwat_loclrng_qc'], [1 ,1 ,1 ,1 ,1 ,0 ,0 ,0 ,0 ,0])
Example #36
    def test_lookup_values_ingest_replay(self):
        ph = ParameterHelper(self.dataset_management, self.addCleanup)
        pdict_id = ph.create_lookups()
        stream_def_id = self.pubsub_management.create_stream_definition(
            'lookups', parameter_dictionary_id=pdict_id)
        self.addCleanup(self.pubsub_management.delete_stream_definition,
                        stream_def_id)

        stream_id, route = self.pubsub_management.create_stream(
            'example',
            exchange_point=self.exchange_point_name,
            stream_definition_id=stream_def_id)
        self.addCleanup(self.pubsub_management.delete_stream, stream_id)

        ingestion_config_id = self.get_ingestion_config()
        dataset_id = self.create_dataset(pdict_id)
        config = DotDict()
        config.process.lookup_docs = ['test1', 'test2']
        self.ingestion_management.persist_data_stream(
            stream_id=stream_id,
            ingestion_configuration_id=ingestion_config_id,
            dataset_id=dataset_id,
            config=config)
        self.addCleanup(self.ingestion_management.unpersist_data_stream,
                        stream_id, ingestion_config_id)

        stored_value_manager = StoredValueManager(self.container)
        stored_value_manager.stored_value_cas('test1', {
            'offset_a': 10.0,
            'offset_b': 13.1
        })
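        # 'test1' and 'test2' are stored-value document keys consulted during ingestion to fill
        # lookup parameters; the assertions below imply calibrated = temp + offset_a and
        # calibrated_b = calibrated + offset_b.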

        publisher = StandaloneStreamPublisher(stream_id, route)
        rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
        rdt['time'] = np.arange(20)
        rdt['temp'] = [20.0] * 20

        granule = rdt.to_granule()

        dataset_monitor = DatasetMonitor(dataset_id)
        self.addCleanup(dataset_monitor.stop)

        publisher.publish(granule)
        self.assertTrue(dataset_monitor.event.wait(30))

        replay_granule = self.data_retriever.retrieve(dataset_id)
        rdt_out = RecordDictionaryTool.load_from_granule(replay_granule)

        np.testing.assert_array_almost_equal(rdt_out['time'], np.arange(20))
        np.testing.assert_array_almost_equal(rdt_out['temp'],
                                             np.array([20.] * 20))
        np.testing.assert_array_almost_equal(rdt_out['calibrated'],
                                             np.array([30.] * 20))
        np.testing.assert_array_equal(
            rdt_out['offset_b'],
            np.array([rdt_out.fill_value('offset_b')] * 20))

        rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
        rdt['time'] = np.arange(20, 40)
        rdt['temp'] = [20.0] * 20
        granule = rdt.to_granule()

        dataset_monitor.event.clear()

        stored_value_manager.stored_value_cas('test1', {'offset_a': 20.0})
        stored_value_manager.stored_value_cas('coefficient_document',
                                              {'offset_b': 10.0})
        gevent.sleep(2)

        publisher.publish(granule)
        self.assertTrue(dataset_monitor.event.wait(30))

        replay_granule = self.data_retriever.retrieve(dataset_id)
        rdt_out = RecordDictionaryTool.load_from_granule(replay_granule)

        np.testing.assert_array_almost_equal(rdt_out['time'], np.arange(40))
        np.testing.assert_array_almost_equal(rdt_out['temp'],
                                             np.array([20.] * 20 + [20.] * 20))
        np.testing.assert_array_equal(rdt_out['offset_b'],
                                      np.array([10.] * 40))
        np.testing.assert_array_almost_equal(rdt_out['calibrated'],
                                             np.array([30.] * 20 + [40.] * 20))
        np.testing.assert_array_almost_equal(rdt_out['calibrated_b'],
                                             np.array([40.] * 20 + [50.] * 20))
Example #37
class TransformPrime(TransformDataProcess):
    binding=['output']
    '''
    Transforms which have an incoming stream and an outgoing stream.

    Parameters:
      process.stream_id      Outgoing stream identifier.
      process.exchange_point Route's exchange point.
      process.routing_key    Route's routing key.
      process.queue_name     Name of the queue to listen on.
      process.routes         Mapping {(stream_input_id, stream_output_id): actor} to apply for each route.
    Either the stream_id or both the exchange_point and routing_key need to be provided.
    '''    
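    # A minimal, hypothetical process configuration for this transform (the stream
    # identifiers below are placeholders, not real resource IDs):
    #
    #   config = DotDict()
    #   config.process.queue_name = 'transform_prime_queue'
    #   config.process.routes = {('stream_in_id', 'stream_out_id'): None}  # None -> default transform
    #   config.process.lookup_docs = ['coefficient_document']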
    def on_start(self):
        TransformDataProcess.on_start(self)
        self.pubsub_management = PubsubManagementServiceProcessClient(process=self)
        self.stored_values = StoredValueManager(self.container)
        self.input_data_product_ids = self.CFG.get_safe('process.input_products', [])
        self.output_data_product_ids = self.CFG.get_safe('process.output_products', [])
        self.lookup_docs = self.CFG.get_safe('process.lookup_docs',[])
        self.new_lookups = Queue()
        self.lookup_monitor = EventSubscriber(event_type=OT.ExternalReferencesUpdatedEvent,callback=self._add_lookups, auto_delete=True)
        self.lookup_monitor.start()

    def on_quit(self):
        self.lookup_monitor.stop()
        TransformDataProcess.on_quit(self)

    def _add_lookups(self, event, *args, **kwargs):
        if event.origin in self.input_data_product_ids + self.output_data_product_ids:
            if isinstance(event.reference_keys, list):
                self.new_lookups.put(event.reference_keys)


    @memoize_lru(100)
    def read_stream_def(self,stream_id):
        return self.pubsub_management.read_stream_definition(stream_id=stream_id)

    
    def recv_packet(self, msg, stream_route, stream_id):
        process_routes = self.CFG.get_safe('process.routes', {})
        for stream_in_id,routes in process_routes.iteritems():
            if stream_id == stream_in_id:
                for stream_out_id, actor in routes.iteritems():
                    if actor is None:
                        rdt_out = self._execute_transform(msg, (stream_in_id, stream_out_id))
                        self.publish(rdt_out.to_granule(), stream_out_id)
                    else:
                        outgoing = self._execute_actor(msg, actor, (stream_in_id, stream_out_id))
                        self.publish(outgoing, stream_out_id)

    def publish(self, msg, stream_out_id):
        publisher = getattr(self, stream_out_id)
        publisher.publish(msg)

    def _load_actor(self, actor):
        '''
        Returns the actor's callable execute method if it can be loaded; otherwise the ImportError or AttributeError is logged and re-raised.
        '''
        try:
            module = __import__(actor['module'], fromlist=[''])
        except ImportError:
            log.exception('Actor could not be loaded')
            raise
        try:
            cls = getattr(module, actor['class'])
        except AttributeError:
            log.exception('Module %s does not have class %s', repr(module), actor['class'])
            raise
        try:
            execute = getattr(cls,'execute')
        except AttributeError:
            log.exception('Actor class does not contain execute method')
            raise
        return execute
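
    # Hypothetical actor descriptor accepted by _load_actor (the module path is illustrative):
    #   actor = {'module': 'ion.processes.data.example_actor', 'class': 'ExampleActor'}
    #   execute = self._load_actor(actor)
    #   rdt_out = execute(msg, None, config, params, None)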

   
    def _execute_actor(self, msg, actor, streams):
        stream_in_id,stream_out_id = streams
        stream_def_out = self.read_stream_def(stream_out_id)
        params = self.CFG.get_safe('process.params', {})
        config = self.CFG.get_safe('process')
        #do the stuff with the actor
        params['stream_def'] = stream_def_out._id
        executor = self._load_actor(actor)
        try:
            rdt_out = executor(msg, None, config, params, None)
        except:
            log.exception('Error running actor for %s', self.id)
            raise
        return rdt_out

    def _merge_pdicts(self, pdict1, pdict2):
        incoming_pdict = ParameterDictionary.load(pdict1)
        outgoing_pdict = ParameterDictionary.load(pdict2)
        
        merged_pdict = ParameterDictionary()
        for k,v in incoming_pdict.iteritems():
            ordinal, v = v
            if k not in merged_pdict:
                merged_pdict.add_context(v)
        for k,v in outgoing_pdict.iteritems():
            ordinal, v = v
            if k not in merged_pdict:
                merged_pdict.add_context(v)
        return merged_pdict

    def _merge_rdt(self, stream_def_in, stream_def_out):
        incoming_pdict_dump = stream_def_in.parameter_dictionary
        outgoing_pdict_dump = stream_def_out.parameter_dictionary

        merged_pdict = self._merge_pdicts(incoming_pdict_dump, outgoing_pdict_dump)
        rdt_temp = RecordDictionaryTool(param_dictionary=merged_pdict)
        return rdt_temp


    def _get_lookup_value(self, lookup_value):
        if not self.new_lookups.empty():
            new_values = self.new_lookups.get()
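            # Prepend newly announced documents so they are searched first and take
            # precedence over older keys.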
            self.lookup_docs = new_values + self.lookup_docs

        lookup_value_document_keys = self.lookup_docs
        for key in lookup_value_document_keys:
            try:
                document = self.stored_values.read_value(key)
                if lookup_value in document:
                    return document[lookup_value]
            except NotFound:
                log.warning('Specified lookup document does not exist')

        return None

    def _execute_transform(self, msg, streams):
        stream_in_id,stream_out_id = streams
        stream_def_in = self.read_stream_def(stream_in_id)
        stream_def_out = self.read_stream_def(stream_out_id)

        rdt_temp = self._merge_rdt(stream_def_in, stream_def_out)
        
        rdt_in = RecordDictionaryTool.load_from_granule(msg)
        for field in rdt_temp.fields:
            if not isinstance(rdt_temp._pdict.get_context(field).param_type, ParameterFunctionType):
                try:
                    rdt_temp[field] = rdt_in[field]
                except KeyError:
                    pass

        rdt_temp.fetch_lookup_values()

        for lookup_field in rdt_temp.lookup_values():
            s = lookup_field
            stored_value = self._get_lookup_value(rdt_temp.context(s).lookup_value)
            if stored_value is not None:
                rdt_temp[s] = stored_value
        
        for field in rdt_temp.fields:
            if isinstance(rdt_temp._pdict.get_context(field).param_type, ParameterFunctionType):
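                # Self-assignment appears to force evaluation of the parameter function
                # so its computed values are materialized in rdt_temp.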
                rdt_temp[field] = rdt_temp[field]

        
        rdt_out = RecordDictionaryTool(stream_definition_id=stream_def_out._id)

        for field in rdt_out.fields:
            rdt_out[field] = rdt_temp[field]
        
        return rdt_out 
class ScienceGranuleIngestionWorker(TransformStreamListener):
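    # Maximum number of dataset ids and coverage objects retained in the worker's
    # in-memory LRU caches (see get_dataset / get_coverage below).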
    CACHE_LIMIT=CFG.get_safe('container.ingestion_cache',5)

    def __init__(self, *args,**kwargs):
        super(ScienceGranuleIngestionWorker, self).__init__(*args, **kwargs)
        #--------------------------------------------------------------------------------
        # Ingestion Cache
        # - Datasets
        # - Coverage instances
        #--------------------------------------------------------------------------------
        self._datasets  = collections.OrderedDict()
        self._coverages = collections.OrderedDict()

        self._bad_coverages = {}

        self.time_stats = Accumulator(format='%3f')
        # unique ID to identify this worker in log msgs
        self._id = uuid.uuid1()

    def on_start(self): #pragma no cover
        super(ScienceGranuleIngestionWorker,self).on_start()
        self.event_publisher = EventPublisher(OT.DatasetModified)
        self.stored_value_manager = StoredValueManager(self.container)

        self.lookup_docs = self.CFG.get_safe('process.lookup_docs',[])
        self.input_product = self.CFG.get_safe('process.input_product','')
        self.qc_enabled = self.CFG.get_safe('process.qc_enabled', True)
        self.new_lookups = Queue()
        self.lookup_monitor = EventSubscriber(event_type=OT.ExternalReferencesUpdatedEvent, callback=self._add_lookups, auto_delete=True)
        self.lookup_monitor.start()
        self.qc_publisher = EventPublisher(event_type=OT.ParameterQCEvent)
        self.connection_id = ''
        self.connection_index = None


    def on_quit(self): #pragma no cover
        super(ScienceGranuleIngestionWorker, self).on_quit()
        for stream, coverage in self._coverages.iteritems():
            try:
                coverage.close(timeout=5)
            except:
                log.exception('Problems closing the coverage')
    
    def _add_lookups(self, event, *args, **kwargs):
        if event.origin == self.input_product:
            if isinstance(event.reference_keys, list):
                self.new_lookups.put(event.reference_keys)

    def _new_dataset(self, stream_id):
        '''
        Adds a new dataset to the internal cache of the ingestion worker
        '''
        rr_client = ResourceRegistryServiceClient()
        datasets, _ = rr_client.find_subjects(subject_type=RT.Dataset,predicate=PRED.hasStream,object=stream_id,id_only=True)
        if datasets:
            return datasets[0]
        return None
    
    def get_dataset(self,stream_id):
        '''
        Memoization (LRU) of _new_dataset
        '''
        try:
            result = self._datasets.pop(stream_id)
        except KeyError:
            result = self._new_dataset(stream_id)
            if result is None:
                return None
            if len(self._datasets) >= self.CACHE_LIMIT:
                self._datasets.popitem(0)
        self._datasets[stream_id] = result
        return result

    def get_coverage(self, stream_id):
        '''
        Memoization (LRU) of _get_coverage
        '''
        try:
            result = self._coverages.pop(stream_id)
        except KeyError:
            dataset_id = self.get_dataset(stream_id)
            if dataset_id is None:
                return None
            result = DatasetManagementService._get_simplex_coverage(dataset_id, mode='a')
            if result is None:
                return None
            if len(self._coverages) >= self.CACHE_LIMIT:
                k, coverage = self._coverages.popitem(0)
                coverage.close(timeout=5)
        self._coverages[stream_id] = result
        return result

    def gap_coverage(self,stream_id):
        try:
            old_cov = self._coverages.pop(stream_id)
            dataset_id = self.get_dataset(stream_id)
            sdom, tdom = time_series_domain()
            new_cov = DatasetManagementService._create_simplex_coverage(dataset_id, old_cov.parameter_dictionary, sdom, tdom, old_cov._persistence_layer.inline_data_writes)
            old_cov.close()
            result = new_cov
        except KeyError:
            result = self.get_coverage(stream_id)
        self._coverages[stream_id] = result
        return result


    def dataset_changed(self, dataset_id, extents, window):
        self.event_publisher.publish_event(origin=dataset_id, author=self.id, extents=extents, window=window)

    def evaluate_qc(self, rdt, dataset_id):
        if self.qc_enabled:
            for field in rdt.fields:
                if not field.endswith('_qc'):
                    continue
                try:
                    values = rdt[field]
                    if values is not None:
                        if not all(values):
                            failed_indices = np.where(values == 0)
                            first_occurrence = failed_indices[0][0]
                            ts = rdt[rdt.temporal_parameter][first_occurrence]
                            self.flag_qc_parameter(dataset_id, field, ts, {})
                except:
                    continue

    def flag_qc_parameter(self, dataset_id, parameter, temporal_value, configuration):
        self.qc_publisher.publish_event(origin=dataset_id, qc_parameter=parameter, temporal_value=temporal_value, configuration=configuration)

    def update_connection_index(self, connection_id, connection_index):
        self.connection_id = connection_id
        try:
            connection_index = int(connection_index)
            self.connection_index = connection_index
        except ValueError:
            pass

    def has_gap(self, connection_id, connection_index):
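        # A gap: the connection_id changed, or the connection_index is not exactly
        # one greater than the last index seen for this connection.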
        if connection_id:
            if not self.connection_id:
                self.update_connection_index(connection_id, connection_index)
                return False
            else:
                if connection_id != self.connection_id:
                    return True
        if connection_index:
            if self.connection_index is None:
                self.update_connection_index(connection_id, connection_index)
                return False
            try:
                connection_index = int(connection_index)
                if connection_index != self.connection_index+1:
                    return True
            except ValueError:
                pass

        return False

    def splice_coverage(self, dataset_id, coverage):
        log.info('Splicing new coverage')
        DatasetManagementService._splice_coverage(dataset_id, coverage)

    @handle_stream_exception()
    def recv_packet(self, msg, stream_route, stream_id):
        ''' receive packet for ingestion '''
        log.trace('received granule for stream %s', stream_id)

        if msg == {}:
            log.error('Received empty message from stream: %s', stream_id)
            return
        # Message validation
        if not isinstance(msg, Granule):
            log.error('Ingestion received a message that is not a granule: %s', msg)
            return


        rdt = RecordDictionaryTool.load_from_granule(msg)
        if rdt is None:
            log.error('Invalid granule (no RDT) for stream %s', stream_id)
            return
        if not len(rdt):
            log.debug('Empty granule for stream %s', stream_id)
            return

        self.persist_or_timeout(stream_id, rdt)

    def persist_or_timeout(self, stream_id, rdt):
        """ retry writing coverage multiple times and eventually time out """
        done = False
        timeout = 2
        start = time.time()
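        # Retry with exponential backoff (2s, 4s, 8s, ... capped at five minutes),
        # re-raising once MAX_RETRY_TIME has elapsed.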
        while not done:
            try:
                self.add_granule(stream_id, rdt)
                done = True
            except:
                log.exception('An issue with coverage, retrying after a bit')
                if (time.time() - start) > MAX_RETRY_TIME: # After an hour just give up
                    dataset_id = self.get_dataset(stream_id)
                    log.error("We're giving up, the coverage needs to be inspected %s", DatasetManagementService._get_coverage_path(dataset_id))
                    raise

                if stream_id in self._coverages:
                    log.info('Popping coverage for stream %s', stream_id)
                    self._coverages.pop(stream_id)

                gevent.sleep(timeout)
                if timeout > (60 * 5):
                    timeout = 60 * 5
                else:
                    timeout *= 2


    def expand_coverage(self, coverage, elements, stream_id):
        try:
            coverage.insert_timesteps(elements, oob=False)
        except IOError as e:
            log.error("Couldn't insert time steps for coverage: %s",
                      coverage.persistence_dir, exc_info=True)
            try:
                coverage.close()
            finally:
                self._bad_coverages[stream_id] = 1
                raise CorruptionError(e.message)
    
    def get_stored_values(self, lookup_value):
        if not self.new_lookups.empty():
            new_values = self.new_lookups.get()
            self.lookup_docs = new_values + self.lookup_docs
        lookup_value_document_keys = self.lookup_docs
        for key in lookup_value_document_keys:
            try:
                document = self.stored_value_manager.read_value(key)
                if lookup_value in document:
                    return document[lookup_value] 
            except NotFound:
                log.warning('Specified lookup document does not exist')
        return None


    def fill_lookup_values(self, rdt):
        rdt.fetch_lookup_values()
        for field in rdt.lookup_values():
            value = self.get_stored_values(rdt.context(field).lookup_value)
            if value:
                rdt[field] = value

    def insert_sparse_values(self, coverage, rdt, stream_id):

        self.fill_lookup_values(rdt)
        for field in rdt._lookup_values():
            if rdt[field] is None:
                continue
            if not isinstance(rdt.context(field).param_type, SparseConstantType):
                # We only set sparse values before insert
                continue 
            value = rdt[field]
            try:
                coverage.set_parameter_values(param_name=field, value=value)
            except IOError as e:
                log.error("Couldn't insert values for coverage: %s",
                          coverage.persistence_dir, exc_info=True)
                try:
                    coverage.close()
                finally:
                    self._bad_coverages[stream_id] = 1
                    raise CorruptionError(e.message)

    def insert_values(self, coverage, rdt, stream_id):
        elements = len(rdt)

        start_index = coverage.num_timesteps - elements

        for k,v in rdt.iteritems():
            if isinstance(v, SparseConstantValue):
                continue
            slice_ = slice(start_index, None)
            try:
                coverage.set_parameter_values(param_name=k, tdoa=slice_, value=v)
            except IOError as e:
                log.error("Couldn't insert values for coverage: %s",
                          coverage.persistence_dir, exc_info=True)
                try:
                    coverage.close()
                finally:
                    self._bad_coverages[stream_id] = 1
                    raise CorruptionError(e.message)
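
        # Stamp the rows just written with the wall-clock ingestion time, converted
        # to the units of the 'ingestion_timestamp' parameter.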
    
        if 'ingestion_timestamp' in coverage.list_parameters():
            t_now = time.time()
            ntp_time = TimeUtils.ts_to_units(coverage.get_parameter_context('ingestion_timestamp').uom, t_now)
            coverage.set_parameter_values(param_name='ingestion_timestamp', tdoa=slice_, value=ntp_time)
    
    def add_granule(self,stream_id, rdt):
        ''' Appends the granule's data to the coverage and persists it. '''
        debugging = log.isEnabledFor(DEBUG)
        timer = Timer() if debugging else None
        if stream_id in self._bad_coverages:
            log.info('Message attempting to be inserted into bad coverage: %s',
                     DatasetManagementService._get_coverage_path(self.get_dataset(stream_id)))
            
        #--------------------------------------------------------------------------------
        # Gap Analysis
        #--------------------------------------------------------------------------------
        gap_found = self.has_gap(rdt.connection_id, rdt.connection_index)
        if gap_found:
            log.error('Gap Found!   New connection: (%s,%s)\tOld Connection: (%s,%s)', rdt.connection_id, rdt.connection_index, self.connection_id, self.connection_index)
            self.gap_coverage(stream_id)



        #--------------------------------------------------------------------------------
        # Coverage determination and appending
        #--------------------------------------------------------------------------------
        dataset_id = self.get_dataset(stream_id)
        if not dataset_id:
            log.error('No dataset could be determined on this stream: %s', stream_id)
            return

        try:
            coverage = self.get_coverage(stream_id)
        except IOError as e:
            log.error("Couldn't open coverage: %s",
                      DatasetManagementService._get_coverage_path(self.get_dataset(stream_id)))
            raise CorruptionError(e.message)

        if debugging:
            path = DatasetManagementService._get_coverage_path(dataset_id)
            log.debug('%s: add_granule stream %s dataset %s coverage %r file %s',
                      self._id, stream_id, dataset_id, coverage, path)

        if not coverage:
            log.error('Could not persist coverage from granule, coverage is None')
            return
        #--------------------------------------------------------------------------------
        # Actual persistence
        #--------------------------------------------------------------------------------

        elements = len(rdt)

        self.insert_sparse_values(coverage,rdt,stream_id)
        
        if debugging:
            timer.complete_step('checks') # lightweight ops, should be zero
        
        self.expand_coverage(coverage, elements, stream_id)
        
        if debugging:
            timer.complete_step('insert')

        self.insert_values(coverage, rdt, stream_id)
        
        if debugging:
            timer.complete_step('keys')
        
        DatasetManagementService._save_coverage(coverage)
        
        if debugging:
            timer.complete_step('save')
        
        start_index = coverage.num_timesteps - elements
        self.dataset_changed(dataset_id,coverage.num_timesteps,(start_index,start_index+elements))

        if gap_found:
            self.splice_coverage(dataset_id, coverage)

        self.evaluate_qc(rdt, dataset_id)
        
        if debugging:
            timer.complete_step('notify')
            self._add_timing_stats(timer)

        self.update_connection_index(rdt.connection_id, rdt.connection_index)

    def _add_timing_stats(self, timer):
        """ add stats from latest coverage operation to Accumulator and periodically log results """
        self.time_stats.add(timer)
        if self.time_stats.get_count() % REPORT_FREQUENCY>0:
            return

        if log.isEnabledFor(TRACE):
            # report per step
            for step in 'checks', 'insert', 'keys', 'save', 'notify':
                log.debug('%s step %s times: %s', self._id, step, self.time_stats.to_string(step))
        # report totals
        log.debug('%s total times: %s', self._id, self.time_stats)
Example #39
    def populate_qc_tables(self):
        svm = StoredValueManager(self.container)
        svm.stored_value_cas('grt_QCTEST_TEMPWAT', {'grt_min_value':-2., 'grt_max_value':40.})
        svm.stored_value_cas('svt_QCTEST_TEMPWAT', {'svt_resolution':0.001, 'svt_n': 4})
        svm.stored_value_cas('spike_QCTEST_TEMPWAT', {'acc': 0.1, 'spike_n':5, 'spike_l':5})
Example #40
class TransformPrime(TransformDataProcess):
    binding = ['output']
    '''
    Transforms which have an incoming stream and an outgoing stream.

    Parameters:
      process.stream_id      Outgoing stream identifier.
      process.exchange_point Route's exchange point.
      process.routing_key    Route's routing key.
      process.queue_name     Name of the queue to listen on.
      process.routes         Mapping {(stream_input_id, stream_output_id): actor} to apply for each route.
    Either the stream_id or both the exchange_point and routing_key need to be provided.
    '''
    def on_start(self):
        TransformDataProcess.on_start(self)
        self.pubsub_management = PubsubManagementServiceProcessClient(
            process=self)
        self.stored_values = StoredValueManager(self.container)
        self.input_data_product_ids = self.CFG.get_safe(
            'process.input_products', [])
        self.output_data_product_ids = self.CFG.get_safe(
            'process.output_products', [])
        self.lookup_docs = self.CFG.get_safe('process.lookup_docs', [])
        self.new_lookups = Queue()
        self.lookup_monitor = EventSubscriber(
            event_type=OT.ExternalReferencesUpdatedEvent,
            callback=self._add_lookups,
            auto_delete=True)
        self.lookup_monitor.start()

    def on_quit(self):
        self.lookup_monitor.stop()
        TransformDataProcess.on_quit(self)

    def _add_lookups(self, event, *args, **kwargs):
        if event.origin in self.input_data_product_ids + self.output_data_product_ids:
            if isinstance(event.reference_keys, list):
                self.new_lookups.put(event.reference_keys)

    @memoize_lru(100)
    def read_stream_def(self, stream_id):
        return self.pubsub_management.read_stream_definition(
            stream_id=stream_id)

    def recv_packet(self, msg, stream_route, stream_id):
        process_routes = self.CFG.get_safe('process.routes', {})
        for stream_in_id, routes in process_routes.iteritems():
            if stream_id == stream_in_id:
                for stream_out_id, actor in routes.iteritems():
                    if actor is None:
                        rdt_out = self._execute_transform(
                            msg, (stream_in_id, stream_out_id))
                        self.publish(rdt_out.to_granule(), stream_out_id)
                    else:
                        outgoing = self._execute_actor(
                            msg, actor, (stream_in_id, stream_out_id))
                        self.publish(outgoing, stream_out_id)

    def publish(self, msg, stream_out_id):
        publisher = getattr(self, stream_out_id)
        publisher.publish(msg)

    def _load_actor(self, actor):
        '''
        Returns the actor's callable execute method if it can be loaded; otherwise the ImportError or AttributeError is logged and re-raised.
        '''
        try:
            module = __import__(actor['module'], fromlist=[''])
        except ImportError:
            log.exception('Actor could not be loaded')
            raise
        try:
            cls = getattr(module, actor['class'])
        except AttributeError:
            log.exception('Module %s does not have class %s', repr(module),
                          actor['class'])
            raise
        try:
            execute = getattr(cls, 'execute')
        except AttributeError:
            log.exception('Actor class does not contain execute method')
            raise
        return execute

    def _execute_actor(self, msg, actor, streams):
        stream_in_id, stream_out_id = streams
        stream_def_out = self.read_stream_def(stream_out_id)
        params = self.CFG.get_safe('process.params', {})
        config = self.CFG.get_safe('process')
        #do the stuff with the actor
        params['stream_def'] = stream_def_out._id
        executor = self._load_actor(actor)
        try:
            rdt_out = executor(msg, None, config, params, None)
        except:
            log.exception('Error running actor for %s', self.id)
            raise
        return rdt_out

    def _merge_pdicts(self, pdict1, pdict2):
        incoming_pdict = ParameterDictionary.load(pdict1)
        outgoing_pdict = ParameterDictionary.load(pdict2)

        merged_pdict = ParameterDictionary()
        for k, v in incoming_pdict.iteritems():
            ordinal, v = v
            if k not in merged_pdict:
                merged_pdict.add_context(v)
        for k, v in outgoing_pdict.iteritems():
            ordinal, v = v
            if k not in merged_pdict:
                merged_pdict.add_context(v)
        return merged_pdict

    def _merge_rdt(self, stream_def_in, stream_def_out):
        incoming_pdict_dump = stream_def_in.parameter_dictionary
        outgoing_pdict_dump = stream_def_out.parameter_dictionary

        merged_pdict = self._merge_pdicts(incoming_pdict_dump,
                                          outgoing_pdict_dump)
        rdt_temp = RecordDictionaryTool(param_dictionary=merged_pdict)
        return rdt_temp

    def _get_lookup_value(self, lookup_value):
        if not self.new_lookups.empty():
            new_values = self.new_lookups.get()
            self.lookup_docs = new_values + self.lookup_docs

        lookup_value_document_keys = self.lookup_docs
        for key in lookup_value_document_keys:
            try:
                document = self.stored_values.read_value(key)
                if lookup_value in document:
                    return document[lookup_value]
            except NotFound:
                log.warning('Specified lookup document does not exist')

        return None

    def _execute_transform(self, msg, streams):
        stream_in_id, stream_out_id = streams
        stream_def_in = self.read_stream_def(stream_in_id)
        stream_def_out = self.read_stream_def(stream_out_id)

        rdt_temp = self._merge_rdt(stream_def_in, stream_def_out)

        rdt_in = RecordDictionaryTool.load_from_granule(msg)
        for field in rdt_temp.fields:
            if not isinstance(
                    rdt_temp._pdict.get_context(field).param_type,
                    ParameterFunctionType):
                try:
                    rdt_temp[field] = rdt_in[field]
                except KeyError:
                    pass

        rdt_temp.fetch_lookup_values()

        for lookup_field in rdt_temp.lookup_values():
            s = lookup_field
            stored_value = self._get_lookup_value(
                rdt_temp.context(s).lookup_value)
            if stored_value is not None:
                rdt_temp[s] = stored_value

        for field in rdt_temp.fields:
            if isinstance(
                    rdt_temp._pdict.get_context(field).param_type,
                    ParameterFunctionType):
                rdt_temp[field] = rdt_temp[field]

        rdt_out = RecordDictionaryTool(stream_definition_id=stream_def_out._id)

        for field in rdt_out.fields:
            rdt_out[field] = rdt_temp[field]

        return rdt_out
Example #41
    def on_start(self):
        TransformStreamListener.on_start(self)
        self.document_key = self.CFG.get_safe('process.document_key')
        self.stored_value_manager = StoredValueManager(self.container)
Example #42
    def on_start(self):
        TransformStreamListener.on_start(self)
        self.document_key = self.CFG.get_safe('process.document_key')
        self.stored_value_manager = StoredValueManager(self.container)
class TestQCFunctions(DMTestCase):
    def setUp(self):
        DMTestCase.setUp(self)
        self.ph = ParameterHelper(self.dataset_management, self.addCleanup)
        self.pdict_id = self.ph.create_simple_qc_pdict()
        self.svm = StoredValueManager(self.container)

    def new_rdt(self,ref='QCTEST'):
        self.stream_def_id = self.create_stream_definition(uuid4().hex, parameter_dictionary_id=self.pdict_id, stream_configuration={'reference_designator':ref})
        self.rdt = RecordDictionaryTool(stream_definition_id=self.stream_def_id)

    def test_qc_functions(self):
        self.check_global_range()
        self.check_spike()
        self.check_stuck_value()
        self.check_trend()
        self.check_gradient()
        self.check_localrange()
        self.check_propagate()


    def check_global_range(self):
        log.info('check_global_range')
        self.new_rdt()
        self.svm.stored_value_cas('grt_QCTEST_TEMPWAT', {'grt_min_value':10., 'grt_max_value':20.})

        self.rdt['time'] = np.arange(8)
        self.rdt['temp'] = [9, 10, 16, 17, 18, 19, 20, 25]
        self.rdt.fetch_lookup_values()
        np.testing.assert_array_almost_equal(self.rdt['tempwat_glblrng_qc'], [0, 1, 1, 1, 1, 1, 1, 0])

    def check_spike(self): 
        log.info('check_spike')
        self.new_rdt()
        self.svm.stored_value_cas('spike_QCTEST_TEMPWAT', {'acc':0.1, 'spike_n':5., 'spike_l':5.})

        self.rdt['time'] = np.arange(8)
        self.rdt['temp'] = [-1, 3, 40, -1, 1, -6, -6, 1]
        self.rdt.fetch_lookup_values()

        np.testing.assert_array_almost_equal(self.rdt['tempwat_spketst_qc'], [1, 1, 0, 1, 1, 1, 1, 1])

    def check_stuck_value(self):
        log.info('check_stuck_value')
        self.new_rdt()
        self.svm.stored_value_cas('svt_QCTEST_TEMPWAT', {'svt_resolution':0.001, 'svt_n': 4.})

        self.rdt['time'] = np.arange(10)
        self.rdt['temp'] = [4.83, 1.40, 3.33, 3.33, 3.33, 3.33, 4.09, 2.97, 2.85, 3.67]
        self.rdt.fetch_lookup_values()

        np.testing.assert_array_almost_equal(self.rdt['tempwat_stuckvl_qc'], [1, 1, 0, 0, 0, 0, 1, 1, 1, 1])

    def check_trend(self):
        log.info('check_trend')
        self.new_rdt()
        self.svm.stored_value_cas('trend_QCTEST_TEMPWAT', {'time_interval':0, 'polynomial_order': 1, 'standard_deviation': 3})
        self.rdt['time'] = np.arange(10)
        self.rdt['temp'] = [0.8147, 0.9058, 0.1270, 0.9134, 0.6324, 0.0975, 0.2785, 0.5469, 0.9575, 0.9649]

        self.rdt.fetch_lookup_values()

        np.testing.assert_array_equal(self.rdt['tempwat_trndtst_qc'], [1] * 10)


    def check_propagate(self):
        log.info('check_propagate')
        self.new_rdt()
        self.rdt['time'] = np.arange(8)
        self.rdt['temp'] = [9, 10, 16, 17, 18, 19, 20, 25]
        self.rdt['tempwat_glblrng_qc'] = [0, 1, 1, 1, 1, 1, 1, 0]
        self.rdt['tempwat_spketst_qc'] = [0, 1, 1, 1, 1, 1, 1, 0]
        self.rdt['tempwat_stuckvl_qc'] = [0, 1, 1, 1, 1, 1, 1, 0]
        self.rdt['tempwat_gradtst_qc'] = [0, 1, 1, 1, 1, 1, 1, 0]
        self.rdt['tempwat_trndtst_qc'] = [0, 1, 1, 1, 1, 1, 1, 0]
        self.rdt['tempwat_loclrng_qc'] = [0, 1, 1, 1, 1, 1, 1, 0]
        self.rdt['preswat_glblrng_qc'] = [0, 1, 1, 1, 1, 1, 1, 0]
        self.rdt['preswat_spketst_qc'] = [0, 1, 1, 1, 1, 1, 1, 0]
        self.rdt['preswat_stuckvl_qc'] = [0, 1, 1, 1, 1, 1, 1, 0]
        self.rdt['preswat_gradtst_qc'] = [0, 1, 1, 1, 1, 1, 1, 0]
        self.rdt['preswat_trndtst_qc'] = [0, 1, 1, 1, 1, 1, 1, 0]
        self.rdt['preswat_loclrng_qc'] = [0, 1, 1, 1, 1, 1, 1, 0]
        np.testing.assert_array_equal(self.rdt['cmbnflg_qc'], [0, 1, 1, 1, 1, 1, 1, 0])
    
    def check_gradient(self):
        log.info('check_gradient')
        self.new_rdt()
        self.svm.stored_value_cas('grad_QCTEST_TEMPWAT_time', {'d_dat_dx': 50, 'min_dx': 0, 'start_dat': 0, 'tol_dat': 5})
        self.rdt['time'] = np.arange(5)
        self.rdt['temp'] = [3, 5, 98, 99, 4]
        self.rdt.fetch_lookup_values()

        np.testing.assert_array_equal(self.rdt['tempwat_gradtst_qc'], [1, 1, 0, 0, 1])

    def check_localrange(self):
        log.info('check_localrange')
        self.new_rdt()
        t = np.array([3580144703.7555027, 3580144704.7555027, 3580144705.7555027, 3580144706.7555027, 3580144707.7555027, 3580144708.7555027, 3580144709.7555027, 3580144710.7555027, 3580144711.7555027, 3580144712.7555027])
        pressure = np.random.rand(10) * 2 + 33.0
        t_v = ntp_to_month(t)
        dat = t_v + pressure + np.arange(16,26)
        def lim1(p,m):
            return p+m+10
        def lim2(p,m):
            return p+m+20

        pressure_grid, month_grid = np.meshgrid(np.arange(0,150,10), np.arange(11))
        points = np.column_stack([pressure_grid.flatten(), month_grid.flatten()])
        datlim_0 = lim1(points[:,0], points[:,1])
        datlim_1 = lim2(points[:,0], points[:,1])
        datlim = np.column_stack([datlim_0, datlim_1])
        datlimz = points

        self.svm.stored_value_cas('lrt_QCTEST_TEMPWAT', {'datlim':datlim.tolist(), 'datlimz':datlimz.tolist(), 'dims':['pressure', 'month']})
        self.rdt['time'] = t
        self.rdt['temp'] = dat
        self.rdt['pressure'] = pressure
        
        self.rdt.fetch_lookup_values()

        np.testing.assert_array_equal(self.rdt['tempwat_loclrng_qc'], [1 ,1 ,1 ,1 ,1 ,0 ,0 ,0 ,0 ,0])
Example #44
    def test_lookup_values_ingest_replay(self):
        ph = ParameterHelper(self.dataset_management, self.addCleanup)
        pdict_id = ph.create_lookups()
        stream_def_id = self.pubsub_management.create_stream_definition('lookups', parameter_dictionary_id=pdict_id)
        self.addCleanup(self.pubsub_management.delete_stream_definition, stream_def_id)

        stream_id, route = self.pubsub_management.create_stream('example', exchange_point=self.exchange_point_name, stream_definition_id=stream_def_id)
        self.addCleanup(self.pubsub_management.delete_stream, stream_id)

        ingestion_config_id = self.get_ingestion_config()
        dataset_id = self.create_dataset(pdict_id)
        config = DotDict()
        config.process.lookup_docs = ['test1', 'test2']
        self.ingestion_management.persist_data_stream(stream_id=stream_id, ingestion_configuration_id=ingestion_config_id, dataset_id=dataset_id, config=config)
        self.addCleanup(self.ingestion_management.unpersist_data_stream, stream_id, ingestion_config_id)

        stored_value_manager = StoredValueManager(self.container)
        stored_value_manager.stored_value_cas('test1',{'offset_a':10.0, 'offset_b':13.1})
        
        publisher = StandaloneStreamPublisher(stream_id, route)
        rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
        rdt['time'] = np.arange(20)
        rdt['temp'] = [20.0] * 20

        granule = rdt.to_granule()

        dataset_monitor = DatasetMonitor(dataset_id)
        self.addCleanup(dataset_monitor.stop)

        publisher.publish(granule)
        self.assertTrue(dataset_monitor.event.wait(30))
        
        replay_granule = self.data_retriever.retrieve(dataset_id)
        rdt_out = RecordDictionaryTool.load_from_granule(replay_granule)

        np.testing.assert_array_almost_equal(rdt_out['time'], np.arange(20))
        np.testing.assert_array_almost_equal(rdt_out['temp'], np.array([20.] * 20))
        np.testing.assert_array_almost_equal(rdt_out['calibrated'], np.array([30.]*20))
        np.testing.assert_array_equal(rdt_out['offset_b'], np.array([rdt_out.fill_value('offset_b')] * 20))

        rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
        rdt['time'] = np.arange(20,40)
        rdt['temp'] = [20.0] * 20
        granule = rdt.to_granule()

        dataset_monitor.event.clear()

        stored_value_manager.stored_value_cas('test1',{'offset_a':20.0})
        stored_value_manager.stored_value_cas('coefficient_document',{'offset_b':10.0})
        gevent.sleep(2)

        publisher.publish(granule)
        self.assertTrue(dataset_monitor.event.wait(30))

        replay_granule = self.data_retriever.retrieve(dataset_id)
        rdt_out = RecordDictionaryTool.load_from_granule(replay_granule)

        np.testing.assert_array_almost_equal(rdt_out['time'], np.arange(40))
        np.testing.assert_array_almost_equal(rdt_out['temp'], np.array([20.] * 20 + [20.] * 20))
        np.testing.assert_array_equal(rdt_out['offset_b'], np.array([10.] * 40))
        np.testing.assert_array_almost_equal(rdt_out['calibrated'], np.array([30.]*20 + [40.]*20))
        np.testing.assert_array_almost_equal(rdt_out['calibrated_b'], np.array([40.] * 20 + [50.] * 20))
class ScienceGranuleIngestionWorker(TransformStreamListener, BaseIngestionWorker):
    CACHE_LIMIT=CFG.get_safe('container.ingestion_cache',5)

    def __init__(self, *args,**kwargs):
        TransformStreamListener.__init__(self, *args, **kwargs)
        BaseIngestionWorker.__init__(self, *args, **kwargs)

        #--------------------------------------------------------------------------------
        # Ingestion Cache
        # - Datasets
        # - Coverage instances
        #--------------------------------------------------------------------------------
        self._datasets  = collections.OrderedDict()
        self._coverages = collections.OrderedDict()

        self._bad_coverages = {}

        self.time_stats = Accumulator(format='%3f')
        # unique ID to identify this worker in log msgs
        self._id = uuid.uuid1()



    def on_start(self): #pragma no cover
        #--------------------------------------------------------------------------------
        # Explicit on_start
        #--------------------------------------------------------------------------------

        # Skip TransformStreamListener and go to StreamProcess to avoid the subscriber being created
        # We want explicit management of the thread and subscriber object for ingestion

        TransformStreamProcess.on_start(self)
        
        self.queue_name = self.CFG.get_safe('process.queue_name',self.id)
        self.subscriber = StreamSubscriber(process=self, exchange_name=self.queue_name, callback=self.receive_callback)
        self.thread_lock = RLock()
        
        #--------------------------------------------------------------------------------
        # Normal on_start after this point
        #--------------------------------------------------------------------------------

        BaseIngestionWorker.on_start(self)
        self._rpc_server = self.container.proc_manager._create_listening_endpoint(from_name=self.id, process=self)
        self.add_endpoint(self._rpc_server)

        self.event_publisher = EventPublisher(OT.DatasetModified)
        self.stored_value_manager = StoredValueManager(self.container)

        self.lookup_docs = self.CFG.get_safe('process.lookup_docs',[])
        self.input_product = self.CFG.get_safe('process.input_product','')
        self.qc_enabled = self.CFG.get_safe('process.qc_enabled', True)
        self.ignore_gaps = self.CFG.get_safe('service.ingestion.ignore_gaps', True)
        if not self.ignore_gaps:
            log.warning("Gap handling is not supported in release 2")
        self.ignore_gaps = True
        self.new_lookups = Queue()
        self.lookup_monitor = EventSubscriber(event_type=OT.ExternalReferencesUpdatedEvent, callback=self._add_lookups, auto_delete=True)
        self.add_endpoint(self.lookup_monitor)
        self.qc_publisher = EventPublisher(event_type=OT.ParameterQCEvent)
        self.connection_id = ''
        self.connection_index = None
        
        self.start_listener()

    def on_quit(self): #pragma no cover
        self.event_publisher.close()
        self.qc_publisher.close()
        if self.subscriber_thread:
            self.stop_listener()
        for stream, coverage in self._coverages.iteritems():
            try:
                coverage.close(timeout=5)
            except:
                log.exception('Problems closing the coverage')
        self._coverages.clear()
        TransformStreamListener.on_quit(self)
        BaseIngestionWorker.on_quit(self)

    
    def start_listener(self):
        # We use a lock here to prevent possible race conditions from starting multiple listeners and coverage clobbering
        with self.thread_lock:
            self.subscriber_thread = self._process.thread_manager.spawn(self.subscriber.listen, thread_name='%s-subscriber' % self.id)

    def stop_listener(self):
        # Avoid race conditions with coverage operations (Don't start a listener at the same time as closing one)
        with self.thread_lock:
            self.subscriber.close()
            self.subscriber_thread.join(timeout=10)
            for stream, coverage in self._coverages.iteritems():
                try:
                    coverage.close(timeout=5)
                except:
                    log.exception('Problems closing the coverage')
            self._coverages.clear()
            self.subscriber_thread = None

    def pause(self):
        if self.subscriber_thread is not None:
            self.stop_listener()


    def resume(self):
        if self.subscriber_thread is None:
            self.start_listener()


    def _add_lookups(self, event, *args, **kwargs):
        if event.origin == self.input_product:
            if isinstance(event.reference_keys, list):
                self.new_lookups.put(event.reference_keys)

    def _new_dataset(self, stream_id):
        '''
        Adds a new dataset to the internal cache of the ingestion worker
        '''
        rr_client = self.container.resource_registry
        datasets, _ = rr_client.find_subjects(subject_type=RT.Dataset,predicate=PRED.hasStream,object=stream_id,id_only=True)
        if datasets:
            return datasets[0]
        return None

    def _get_data_products(self, dataset_id):
        rr_client = self.container.resource_registry
        data_products, _ = rr_client.find_subjects(object=dataset_id, predicate=PRED.hasDataset, subject_type=RT.DataProduct, id_only=False)
        return data_products


    def initialize_metadata(self, dataset_id, rdt):
        '''
        Initializes a metadata document in the object store. The document
        contains information about the bounds and extents of the dataset as
        well as other metadata to improve performance.

        '''
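        # Illustrative shape of the stored document:
        #   {'bounds':      {param: (min, max)},
        #    'extents':     {param: number_of_rows},
        #    'last_values': {param: most_recent_value},
        #    'size':        rough_size}   # roughly 4 bytes assumed per value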

        object_store = self.container.object_store
        key = dataset_id
        bounds = {}
        extents = {}
        last_values = {}
        rough_size = 0
        for k,v in rdt.iteritems():
            v = v[:].flatten()
            if v.dtype.char not in ('S', 'O', 'U', 'V'):
                bounds[k] = (np.min(v), np.max(v))
                last_values[k] = v[-1]
            extents[k] = len(rdt)
            rough_size += len(rdt) * 4

        doc = {'bounds':bounds, 'extents':extents, 'last_values':last_values, 'size': rough_size}
        doc = numpy_walk(doc)
        object_store.create_doc(doc, object_id=key)
        return 

    def update_metadata(self, dataset_id, rdt):
        '''
        Updates the metadata document with the latest information available.
        '''

        self.update_data_product_metadata(dataset_id, rdt)

        # Grab the document
        object_store = self.container.object_store
        key = dataset_id
        try:
            doc = object_store.read_doc(key)
        except NotFound:
            return self.initialize_metadata(dataset_id, rdt)
        # These are the fields we're interested in
        bounds = doc['bounds']
        extents = doc['extents']
        last_values = doc['last_values']
        rough_size = doc['size']
        for k,v in rdt.iteritems():
            if k not in bounds:
                continue

            v = v[:].flatten() # Get the numpy representation (dense array).
            if v.dtype.char not in ('S', 'O', 'U', 'V'):
                l_min = np.min(v)
                l_max = np.max(v)
                o_min, o_max = bounds[k]
                bounds[k] = (min(l_min, o_min), max(l_max, o_max))
                last_values[k] = v[-1]
            # Update the bounds
            # Increase the extents
            extents[k] = extents[k] + len(rdt)
            # How about the last value?

            rough_size += len(rdt) * 4
            doc['size'] = rough_size
        # Sanitize it
        doc = numpy_walk(doc)
        object_store.update_doc(doc)

    def update_data_product_metadata(self, dataset_id, rdt):
        data_products = self._get_data_products(dataset_id)
        for data_product in data_products:
            self.update_time(data_product, rdt[rdt.temporal_parameter][:])
            self.update_geo(data_product, rdt)
            self.container.resource_registry.update(data_product)

    def update_time(self, data_product, t):
        #TODO: Account for non NTP-based timestamps
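        # 2208988800 is the offset in seconds between the NTP epoch (1900-01-01)
        # and the Unix epoch (1970-01-01).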
        t_min = np.min(t)
        t_min -= 2208988800
        t_max = np.max(t)
        t_max -= 2208988800

        if not data_product.nominal_datetime.start_datetime:
            data_product.nominal_datetime.start_datetime = t_min
        data_product.nominal_datetime.end_datetime = t_max

    def update_geo(self, data_product, rdt):
        lat = None
        lon = None
        for p in rdt:
            if rdt._rd[p] is None:
                continue
            # TODO: Not an all encompassing list of acceptable names for lat and lon
            if p.lower() in ('lat', 'latitude', 'y_axis'):
                lat = np.asscalar(rdt[p][-1])
            elif p.lower() in ('lon', 'longitude', 'x_axis'):
                lon = np.asscalar(rdt[p][-1])
            if lat is not None and lon is not None:
                break

        if lat is not None and lon is not None:
            data_product.geospatial_bounds.geospatial_latitude_limit_north = lat
            data_product.geospatial_bounds.geospatial_latitude_limit_south = lat
            data_product.geospatial_bounds.geospatial_longitude_limit_east = lon
            data_product.geospatial_bounds.geospatial_longitude_limit_west = lon


    
    def get_dataset(self,stream_id):
        '''
        Memoization (LRU) of _new_dataset
        '''
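        # Popping and re-inserting the key moves it to the most recently used
        # end; popitem(0) evicts the oldest entry (assuming self._datasets is
        # an OrderedDict-style container).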
        try:
            result = self._datasets.pop(stream_id)
        except KeyError:
            result = self._new_dataset(stream_id)
            if result is None:
                return None
            if len(self._datasets) >= self.CACHE_LIMIT:
                self._datasets.popitem(0)
        self._datasets[stream_id] = result
        return result

    def get_coverage(self, stream_id):
        '''
        Memoization (LRU) of _get_coverage
        '''
        try:
            result = self._coverages.pop(stream_id)
        except KeyError:
            dataset_id = self.get_dataset(stream_id)
            if dataset_id is None:
                return None
            result = DatasetManagementService._get_simplex_coverage(dataset_id, mode='a')
            if result is None:
                return None
            if len(self._coverages) >= self.CACHE_LIMIT:
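                # Evict the least recently used coverage and close it to release its resources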
                k, coverage = self._coverages.popitem(0)
                coverage.close(timeout=5)
        self._coverages[stream_id] = result
        return result

    def gap_coverage(self,stream_id):
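        # When a gap is detected, the cached coverage for this stream is
        # replaced with a fresh simplex coverage over the same dataset and
        # parameter dictionary, and the old coverage is closed; add_granule
        # splices coverages after the gap has been handled.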
        try:
            old_cov = self._coverages.pop(stream_id)
            dataset_id = self.get_dataset(stream_id)
            sdom, tdom = time_series_domain()
            new_cov = DatasetManagementService._create_simplex_coverage(dataset_id, old_cov.parameter_dictionary, sdom, tdom, old_cov._persistence_layer.inline_data_writes)
            old_cov.close()
            result = new_cov
        except KeyError:
            result = self.get_coverage(stream_id)
        self._coverages[stream_id] = result
        return result


    def dataset_changed(self, dataset_id, extents, window):
        self.event_publisher.publish_event(origin=dataset_id, author=self.id, extents=extents, window=window)

    def evaluate_qc(self, rdt, dataset_id):
        if self.qc_enabled:
            for field in rdt.fields:
                if not (field.endswith('glblrng_qc') or field.endswith('loclrng_qc')):
                    continue
                try:
                    values = rdt[field]
                    if values is not None:
                        if not all(values):
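                            # A QC value of 0 marks a failed check; collect the
                            # timestamps of the failing samples so they can be flagged.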
                            topology = np.where(values==0)
                            timestamps = rdt[rdt.temporal_parameter][topology[0]]
                            self.flag_qc_parameter(dataset_id, field, timestamps.tolist(), {})
                except:
                    continue

    def flag_qc_parameter(self, dataset_id, parameter, temporal_values, configuration):
        data_product_ids, _ = self.container.resource_registry.find_subjects(object=dataset_id, predicate=PRED.hasDataset, subject_type=RT.DataProduct, id_only=True)
        for data_product_id in data_product_ids:
            description = 'Automated Quality Control Alerted on %s' % parameter
            self.qc_publisher.publish_event(origin=data_product_id, qc_parameter=parameter, temporal_values=temporal_values, configuration=configuration, description=description)

    def update_connection_index(self, connection_id, connection_index):
        self.connection_id = connection_id
        try:
            connection_index = int(connection_index)
            self.connection_index = connection_index
        except ValueError:
            pass

    def has_gap(self, connection_id, connection_index):
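        # A gap is declared when the connection id changes, or when the
        # connection index does not follow the last one seen (previous index + 1).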
        if connection_id:
            if not self.connection_id:
                self.update_connection_index(connection_id, connection_index)
                return False
            else:
                if connection_id != self.connection_id:
                    return True
        if connection_index:
            if self.connection_index is None:
                self.update_connection_index(connection_id, connection_index)
                return False
            try:
                connection_index = int(connection_index)
                if connection_index != self.connection_index+1:
                    return True
            except ValueError:
                pass

        return False

    def splice_coverage(self, dataset_id, coverage):
        log.info('Splicing new coverage')
        DatasetManagementService._splice_coverage(dataset_id, coverage)

    @handle_stream_exception()
    def recv_packet(self, msg, stream_route, stream_id):
        ''' receive packet for ingestion '''
        log.trace('received granule for stream %s', stream_id)

        if msg == {}:
            log.error('Received empty message from stream: %s', stream_id)
            return
        # Message validation
        if not isinstance(msg, Granule):
            log.error('Ingestion received a message that is not a granule: %s', msg)
            return


        rdt = RecordDictionaryTool.load_from_granule(msg)
        if rdt is None:
            log.error('Invalid granule (no RDT) for stream %s', stream_id)
            return
        if not len(rdt):
            log.debug('Empty granule for stream %s', stream_id)
            return

        self.persist_or_timeout(stream_id, rdt)

    def persist_or_timeout(self, stream_id, rdt):
        """ retry writing coverage multiple times and eventually time out """
        done = False
        timeout = 2
        start = time.time()
        while not done:
            try:
                self.add_granule(stream_id, rdt)
                done = True
            except:
                log.exception('An issue with coverage, retrying after a bit')
                if (time.time() - start) > MAX_RETRY_TIME: # After an hour just give up
                    dataset_id = self.get_dataset(stream_id)
                    log.error("We're giving up, the coverage needs to be inspected %s", DatasetManagementService._get_coverage_path(dataset_id))
                    raise

                if stream_id in self._coverages:
                    log.info('Popping coverage for stream %s', stream_id)
                    self._coverages.pop(stream_id)

                gevent.sleep(timeout)
                # Exponential backoff between retries, capped at five minutes
                timeout = min(timeout * 2, 60 * 5)


    def expand_coverage(self, coverage, elements, stream_id):
        try:
            coverage.insert_timesteps(elements, oob=False)
        except IOError as e:
            log.error("Couldn't insert time steps for coverage: %s",
                      coverage.persistence_dir, exc_info=True)
            try:
                coverage.close()
            finally:
                self._bad_coverages[stream_id] = 1
                raise CorruptionError(e.message)
    
    def get_stored_values(self, lookup_value):
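        # Drain any newly announced lookup document keys (queued elsewhere,
        # presumably when an external-references-updated event arrives) and
        # give them precedence over the existing keys.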
        if not self.new_lookups.empty():
            new_values = self.new_lookups.get()
            self.lookup_docs = new_values + self.lookup_docs
        lookup_value_document_keys = self.lookup_docs
        for key in lookup_value_document_keys:
            try:
                document = self.stored_value_manager.read_value(key)
                if lookup_value in document:
                    return document[lookup_value] 
            except NotFound:
                log.warning('Specified lookup document does not exist')
        return None


    def fill_lookup_values(self, rdt):
        rdt.fetch_lookup_values()
        for field in rdt.lookup_values():
            value = self.get_stored_values(rdt.context(field).lookup_value)
            if value:
                rdt[field] = value

    def insert_sparse_values(self, coverage, rdt, stream_id):

        self.fill_lookup_values(rdt)
        for field in rdt.fields:
            if rdt._rd[field] is None:
                continue
            if not isinstance(rdt.context(field).param_type, SparseConstantType):
                # We only set sparse values before insert
                continue 
            value = rdt[field]
            try:
                coverage.set_parameter_values(param_name=field, value=value)
            except ValueError as e:
                if "'lower_bound' cannot be >= 'upper_bound'" in e.message:
                    continue
                else:
                    raise
            except IOError as e:
                log.error("Couldn't insert values for coverage: %s",
                          coverage.persistence_dir, exc_info=True)
                try:
                    coverage.close()
                finally:
                    self._bad_coverages[stream_id] = 1
                    raise CorruptionError(e.message)

    def insert_values(self, coverage, rdt, stream_id):
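        # Dense parameters are written into the tail slice of timesteps that
        # expand_coverage just appended; sparse constants were already handled
        # in insert_sparse_values.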
        elements = len(rdt)

        start_index = coverage.num_timesteps - elements

        slice_ = slice(start_index, None)
        for k,v in rdt.iteritems():
            if isinstance(v, SparseConstantValue):
                continue
            try:
                coverage.set_parameter_values(param_name=k, tdoa=slice_, value=v)
            except IOError as e:
                log.error("Couldn't insert values for coverage: %s",
                          coverage.persistence_dir, exc_info=True)
                try:
                    coverage.close()
                finally:
                    self._bad_coverages[stream_id] = 1
                    raise CorruptionError(e.message)
            except IndexError as e:
                log.error("Value set: %s", v[:])
                data_products, _ = self.container.resource_registry.find_subjects(object=stream_id, predicate=PRED.hasStream, subject_type=RT.DataProduct)
                for data_product in data_products:
                    log.exception("Index exception with %s, trying to insert %s into coverage with shape %s", 
                                  data_product.name,
                                  k,
                                  v.shape)

    
        if 'ingestion_timestamp' in coverage.list_parameters():
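            # Stamp the newly appended timesteps with the wall-clock ingestion
            # time, converted to the parameter's units.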
            t_now = time.time()
            ntp_time = TimeUtils.ts_to_units(coverage.get_parameter_context('ingestion_timestamp').uom, t_now)
            coverage.set_parameter_values(param_name='ingestion_timestamp', tdoa=slice_, value=ntp_time)
    
    def add_granule(self,stream_id, rdt):
        ''' Appends the granule's data to the coverage and persists it. '''
        debugging = log.isEnabledFor(DEBUG)
        timer = Timer() if debugging else None
        if stream_id in self._bad_coverages:
            log.info('Message attempting to be inserted into bad coverage: %s',
                     DatasetManagementService._get_coverage_path(self.get_dataset(stream_id)))
            
        #--------------------------------------------------------------------------------
        # Gap Analysis
        #--------------------------------------------------------------------------------
        if not self.ignore_gaps:
            gap_found = self.has_gap(rdt.connection_id, rdt.connection_index)
            if gap_found:
                log.warning('Gap Found!   New connection: (%s,%s)\tOld Connection: (%s,%s)', rdt.connection_id, rdt.connection_index, self.connection_id, self.connection_index)
                self.gap_coverage(stream_id)



        #--------------------------------------------------------------------------------
        # Coverage determination and appending
        #--------------------------------------------------------------------------------
        dataset_id = self.get_dataset(stream_id)
        if not dataset_id:
            log.error('No dataset could be determined on this stream: %s', stream_id)
            return

        try:
            coverage = self.get_coverage(stream_id)
        except IOError as e:
            log.error("Couldn't open coverage: %s",
                      DatasetManagementService._get_coverage_path(self.get_dataset(stream_id)))
            raise CorruptionError(e.message)

        if debugging:
            path = DatasetManagementService._get_coverage_path(dataset_id)
            log.debug('%s: add_granule stream %s dataset %s coverage %r file %s',
                      self._id, stream_id, dataset_id, coverage, path)

        if not coverage:
            log.error('Could not persist coverage from granule, coverage is None')
            return
        #--------------------------------------------------------------------------------
        # Actual persistence
        #--------------------------------------------------------------------------------

        elements = len(rdt)
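        # A granule with no temporal data contributes no new timesteps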
        if rdt[rdt.temporal_parameter] is None:
            elements = 0 

        self.insert_sparse_values(coverage,rdt,stream_id)
        
        if debugging:
            timer.complete_step('checks') # lightweight ops, should be zero
        
        self.expand_coverage(coverage, elements, stream_id)
        
        if debugging:
            timer.complete_step('insert')

        self.insert_values(coverage, rdt, stream_id)
        
        if debugging:
            timer.complete_step('keys')
        
        DatasetManagementService._save_coverage(coverage)
        
        if debugging:
            timer.complete_step('save')
        
        start_index = coverage.num_timesteps - elements

        if not self.ignore_gaps and gap_found:
            self.splice_coverage(dataset_id, coverage)

        self.evaluate_qc(rdt, dataset_id)
        
        if debugging:
            timer.complete_step('notify')
            self._add_timing_stats(timer)

        self.update_connection_index(rdt.connection_id, rdt.connection_index)

        self.update_metadata(dataset_id, rdt)
        self.dataset_changed(dataset_id,coverage.num_timesteps,(start_index,start_index+elements))

    def _add_timing_stats(self, timer):
        """ add stats from latest coverage operation to Accumulator and periodically log results """
        self.time_stats.add(timer)
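        # Only emit a timing report every REPORT_FREQUENCY operations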
        if self.time_stats.get_count() % REPORT_FREQUENCY>0:
            return

        if log.isEnabledFor(TRACE):
            # report per step
            for step in 'checks', 'insert', 'keys', 'save', 'notify':
                log.debug('%s step %s times: %s', self._id, step, self.time_stats.to_string(step))
        # report totals
        log.debug('%s total times: %s', self._id, self.time_stats)

    def test_lookup_values(self):
        ph = ParameterHelper(self.dataset_management, self.addCleanup)
        pdict_id = ph.create_lookups()
        stream_def_id = self.pubsubcli.create_stream_definition('lookup', parameter_dictionary_id=pdict_id)
        self.addCleanup(self.pubsubcli.delete_stream_definition, stream_def_id)

        data_product = DataProduct(name='lookup data product')
        tdom, sdom = time_series_domain()
        data_product.temporal_domain = tdom.dump()
        data_product.spatial_domain = sdom.dump()

        data_product_id = self.dpsc_cli.create_data_product(data_product, stream_definition_id=stream_def_id)
        self.addCleanup(self.dpsc_cli.delete_data_product, data_product_id)
        data_producer = DataProducer(name='producer')
        data_producer.producer_context = DataProcessProducerContext()
        data_producer.producer_context.configuration['qc_keys'] = ['offset_document']
        data_producer_id, _ = self.rrclient.create(data_producer)
        self.addCleanup(self.rrclient.delete, data_producer_id)
        assoc,_ = self.rrclient.create_association(subject=data_product_id, object=data_producer_id, predicate=PRED.hasDataProducer)
        self.addCleanup(self.rrclient.delete_association, assoc)

        document_keys = self.damsclient.list_qc_references(data_product_id)

        self.assertEquals(document_keys, ['offset_document'])
        svm = StoredValueManager(self.container)
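        # Seed the lookup document; 'calibrated' is expected to come out as
        # temp + offset_a (20 + 2 = 22), as asserted below.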
        svm.stored_value_cas('offset_document', {'offset_a':2.0})
        self.dpsc_cli.activate_data_product_persistence(data_product_id)
        dataset_ids, _ = self.rrclient.find_objects(subject=data_product_id, predicate=PRED.hasDataset, id_only=True)
        dataset_id = dataset_ids[0]

        dataset_monitor = DatasetMonitor(dataset_id)
        self.addCleanup(dataset_monitor.stop)

        rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
        rdt['time'] = [0]
        rdt['temp'] = [20.]
        granule = rdt.to_granule()

        stream_ids, _ = self.rrclient.find_objects(subject=data_product_id, predicate=PRED.hasStream, id_only=True)
        stream_id = stream_ids[0]
        route = self.pubsubcli.read_stream_route(stream_id=stream_id)

        publisher = StandaloneStreamPublisher(stream_id, route)
        publisher.publish(granule)

        self.assertTrue(dataset_monitor.event.wait(10))

        granule = self.data_retriever.retrieve(dataset_id)
        rdt2 = RecordDictionaryTool.load_from_granule(granule)
        np.testing.assert_array_equal(rdt['temp'], rdt2['temp'])
        np.testing.assert_array_almost_equal(rdt2['calibrated'], np.array([22.0]))


        svm.stored_value_cas('updated_document', {'offset_a':3.0})
        dataset_monitor = DatasetMonitor(dataset_id)
        self.addCleanup(dataset_monitor.stop)
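        # Announcing the updated reference document should make the ingestion
        # worker reload its lookup values, so the next granule is calibrated
        # with the new offset (20 + 3 = 23).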
        ep = EventPublisher(event_type=OT.ExternalReferencesUpdatedEvent)
        ep.publish_event(origin=data_product_id, reference_keys=['updated_document'])

        rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
        rdt['time'] = [1]
        rdt['temp'] = [20.]
        granule = rdt.to_granule()
        gevent.sleep(2) # Yield so that the event goes through
        publisher.publish(granule)
        self.assertTrue(dataset_monitor.event.wait(10))

        granule = self.data_retriever.retrieve(dataset_id)
        rdt2 = RecordDictionaryTool.load_from_granule(granule)
        np.testing.assert_array_equal(rdt2['temp'],np.array([20.,20.]))
        np.testing.assert_array_almost_equal(rdt2['calibrated'], np.array([22.0,23.0]))
Example #47
    def setUp(self):
        DMTestCase.setUp(self)
        self.ph = ParameterHelper(self.dataset_management, self.addCleanup)
        self.pdict_id = self.ph.create_simple_qc_pdict()
        self.svm = StoredValueManager(self.container)