def test_qc_attachment(self):
        instrument_device = InstrumentDevice(name='whatever')
        instrument_device_id,_ = self.rrclient.create(instrument_device)
        self.addCleanup(self.rrclient.delete, instrument_device_id)
        self.client.register_instrument(instrument_device_id)
        self.addCleanup(self.client.unregister_instrument, instrument_device_id)
        dp = DataProduct(name='instrument output')

        dp_id,_ = self.rrclient.create(dp)
        self.addCleanup(self.rrclient.delete, dp_id)

        parser_id = self.make_grt_parser()
        attachment = Attachment(name='qc ref', attachment_type=AttachmentType.REFERENCE,content=global_range_test_document, context=ReferenceAttachmentContext(parser_id=parser_id))
        att_id = self.rrclient.create_attachment(dp_id, attachment)
        self.addCleanup(self.rrclient.delete_attachment, att_id)

        attachment2 = Attachment(name='qc ref2', attachment_type=AttachmentType.REFERENCE, content=global_range_test_document2, context=ReferenceAttachmentContext(parser_id=parser_id))
        att2_id = self.rrclient.create_attachment(dp_id, attachment2)
        self.addCleanup(self.rrclient.delete_attachment, att2_id)

        self.client.assign_data_product(instrument_device_id, dp_id)
        self.addCleanup(self.client.unassign_data_product, instrument_device_id, dp_id)
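        # At this point the attached reference documents should have been parsed and
        # persisted as stored values keyed by reference designator and parameter name.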
        svm = StoredValueManager(self.container)
        doc = svm.read_value('grt_TEST_TEMPWAT_TEMPWAT')
        np.testing.assert_array_almost_equal(doc['grt_min_value'], 10.)

        doc = svm.read_value('grt_TEST_PRACSAL_PRACSAL')
        np.testing.assert_array_almost_equal(doc['grt_min_value'], 0.)
Example #2
class TestParsers(IonIntegrationTestCase):
    def setUp(self):
        self._start_container()
        self.svm = StoredValueManager(self.container)

    @classmethod
    def parse_document(cls, container, parser, document_path):
        svm = StoredValueManager(container)
        document = ''
        with open(document_path,'r') as f:
            document = f.read()
        for k,v in parser(document):
            svm.stored_value_cas(k,v)
        return


    def test_grt_parser(self):
        self.parse_document(self.container, grt_parser, qc_paths['grt'])

        ret_doc = self.svm.read_value('grt_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'CONDWAT'))
        np.testing.assert_almost_equal(ret_doc['grt_min_value'], 0.)
        np.testing.assert_almost_equal(ret_doc['grt_max_value'], 66000.)

    def test_spike_parser(self):
        self.parse_document(self.container, spike_parser, qc_paths['spike'])

        ret_doc = self.svm.read_value('spike_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'PRACSAL'))
        np.testing.assert_almost_equal(ret_doc['acc'], 0.005)
        np.testing.assert_almost_equal(ret_doc['spike_n'], 11.)
        np.testing.assert_almost_equal(ret_doc['spike_l'], 15.)

        self.assertRaises(NotFound, self.svm.read_value,'spike_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'CLOWN'))


    def test_stuck_value_parser(self):
        self.parse_document(self.container, stuck_value_test_parser, qc_paths['stuck'])

        ret_doc = self.svm.read_value('svt_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'PRACSAL'))
        np.testing.assert_almost_equal(ret_doc['svt_resolution'], 1e-9)
        np.testing.assert_almost_equal(ret_doc['svt_n'], 1000000)

    def test_trend_value_parser(self):
        self.parse_document(self.container, trend_parser, qc_paths['trend'])

        ret_doc = self.svm.read_value('trend_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'PRACSAL'))
        np.testing.assert_almost_equal(ret_doc['time_interval'], 365.0)
        np.testing.assert_almost_equal(ret_doc['polynomial_order'], 1.0)
        np.testing.assert_almost_equal(ret_doc['standard_deviation'], 0.0)

    
    def test_lrt_parser(self):
        self.parse_document(self.container, lrt_parser, qc_paths['lrt'])

        ret_doc = self.svm.read_value('lrt_%s_%s' %('GP02HYPM-SP001-04-CTDPF0999', 'PRACSAL'))

        np.testing.assert_array_equal(ret_doc['datlim'][0], np.array([32.289, 32.927]))
        self.assertEquals(ret_doc['dims'], ['pressure', 'month'])

    def test_global_range_test_parser(self):
        svm = StoredValueManager(self.container)
        for key,doc in grt_parser(grt_sample_doc):
            svm.stored_value_cas(key,doc)
            self.addCleanup(svm.delete_stored_value,key)

        doc = svm.read_value('grt_sbe37-abc123_TEMPWAT_UHHH')
        self.assertEquals(doc['grt_min_value'], 0.)
        self.assertEquals(doc['array'],'array 1')
        doc = svm.read_value('grt_sbe37-abc123_PRESWAT_flagged')
        self.assertEquals(doc['grt_max_value'], 689.47)

    def test_stuck_value_test(self):
        svm = StoredValueManager(self.container)
        for key,doc in stuck_value_test_parser(stuck_value_test_sample_doc):
            svm.stored_value_cas(key,doc)
            self.addCleanup(svm.delete_stored_value,key)

        doc = svm.read_value('svt_ssxbt-ssn719_PRESSURE')
        self.assertEquals(doc['svt_resolution'], 1.2)
        self.assertEquals(doc['svt_n'], 10.)

        doc = svm.read_value('svt_ssxbt-ssn719_COND')
        self.assertEquals(doc['svt_resolution'], 0.2)
        self.assertEquals(doc['units'], 'S m-1')
Example #5
    def test_trend_value_test(self):
        svm = StoredValueManager(self.container)
        for key, doc in trend_parser(trend_value_test_sample_doc):
            svm.stored_value_cas(key, doc)
            self.addCleanup(svm.delete_stored_value, key)

        doc = svm.read_value("trend_ssxbt-ssn719_PRESSURE")
        self.assertEquals(doc["time_interval"], 1.0)
        self.assertEquals(doc["standard_deviation"], 0.0)

        doc = svm.read_value("trend_ssxbt-ssn719_COND")
        self.assertEquals(doc["time_interval"], 3.0)
        self.assertEquals(doc["polynomial_order"], "third")
    def test_gradient_test(self):
        svm = StoredValueManager(self.container)
        for key,doc in gradient_test_parser(gradient_test_sample_doc):
            svm.stored_value_cas(key,doc)
            self.addCleanup(svm.delete_stored_value,key)

        doc = svm.read_value('grad_CHEL-C-32-12_PRESSURE_DENSITY')
        self.assertEquals(doc['units_dat'], 'dbar')
        self.assertEquals(doc['d_dat_dx'], 2.781)

        doc = svm.read_value('grad_CHEW-BA-C-A12_PRESSURE_DENSITY')
        self.assertEquals(doc['units_x'], 'kg m-3')
        self.assertEquals(doc['tol_dat'], 12.)

    def test_basic_stored_docs(self):

        doc = {'key':'value'}
        doc_key = 'doc_id'

        svm = StoredValueManager(self.container)
        doc_id, rev = svm.stored_value_cas(doc_key, doc)
        self.addCleanup(svm.delete_stored_value, doc_key)

        doc = svm.read_value(doc_key)
        self.assertTrue('key' in doc and doc['key']=='value')
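        # stored_value_cas merges new keys into the existing document rather than replacing it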
        svm.stored_value_cas(doc_key,{'key2':'value2'})
        doc = svm.read_value(doc_key)
        self.assertTrue('key' in doc and doc['key']=='value')
        self.assertTrue('key2' in doc and doc['key2']=='value2')
Example #8
class TestParsers(IonIntegrationTestCase):
    def setUp(self):
        self._start_container()
        self.svm = StoredValueManager(self.container)

    @classmethod
    def parse_document(cls, container, parser, document_path):
        svm = StoredValueManager(container)
        document = ''
        with open(document_path,'r') as f:
            document = f.read()
        for k,v in parser(document):
            svm.stored_value_cas(k,v)
        return


    def test_grt_parser(self):
        self.parse_document(self.container, grt_parser, qc_paths['grt'])

        ret_doc = self.svm.read_value('grt_%s_%s' % ('GA01SUMO-RI003-03-CTDMOQ999', 'CONDWAT'))
        np.testing.assert_almost_equal(ret_doc['grt_min_value'], 0.)
        np.testing.assert_almost_equal(ret_doc['grt_max_value'], 66000.)

    def test_spike_parser(self):
        self.parse_document(self.container, spike_parser, qc_paths['spike'])

        ret_doc = self.svm.read_value('spike_%s_%s' % ('GA01SUMO-RI003-03-CTDMOQ999', 'PRACSAL'))
        np.testing.assert_almost_equal(ret_doc['acc'], 0.005)
        np.testing.assert_almost_equal(ret_doc['spike_n'], 11.)
        np.testing.assert_almost_equal(ret_doc['spike_l'], 15.)

        self.assertRaises(NotFound, self.svm.read_value,'spike_%s_%s' % ('GA01SUMO-RI003-03-CTDMOQ999', 'DENSITY'))


    def test_stuck_value_parser(self):
        self.parse_document(self.container, stuck_value_test_parser, qc_paths['stuck'])

        ret_doc = self.svm.read_value('svt_%s_%s' % ('GA01SUMO-RI003-03-CTDMOQ999', 'PRACSAL'))
        np.testing.assert_almost_equal(ret_doc['svt_resolution'], 0.0001)
        np.testing.assert_almost_equal(ret_doc['svt_n'], 20.)

    def test_trend_value_parser(self):
        self.parse_document(self.container, trend_parser, qc_paths['trend'])

        ret_doc = self.svm.read_value('trend_%s_%s' % ('RS01SBVM-LJ01A-05-HPIESA101', 'IESPRES'))
        np.testing.assert_almost_equal(ret_doc['time_interval'], 90.)
        np.testing.assert_almost_equal(ret_doc['polynomial_order'], 1.0)
        np.testing.assert_almost_equal(ret_doc['standard_deviation'], 5.0)

    def test_qc_attachment(self):
        instrument_device = InstrumentDevice(name='whatever')
        instrument_device_id,_ = self.rrclient.create(instrument_device)
        self.addCleanup(self.rrclient.delete, instrument_device_id)
        self.client.register_instrument(instrument_device_id)
        self.addCleanup(self.client.unregister_instrument, instrument_device_id)
        dp = DataProduct(name='instrument output')

        dp_id,_ = self.rrclient.create(dp)
        self.addCleanup(self.rrclient.delete, dp_id)

        parser_id = self.make_grt_parser()
        attachment = Attachment(name='qc ref', attachment_type=AttachmentType.REFERENCE,content=global_range_test_document, context=ReferenceAttachmentContext(parser_id=parser_id))
        att_id = self.rrclient.create_attachment(dp_id, attachment)
        self.addCleanup(self.rrclient.delete_attachment, att_id)

        attachment2 = Attachment(name='qc ref2', attachment_type=AttachmentType.REFERENCE, content=global_range_test_document2, context=ReferenceAttachmentContext(parser_id=parser_id))
        att2_id = self.rrclient.create_attachment(dp_id, attachment2)
        self.addCleanup(self.rrclient.delete_attachment, att2_id)

        self.client.assign_data_product(instrument_device_id, dp_id)
        self.addCleanup(self.client.unassign_data_product, instrument_device_id, dp_id)
        svm = StoredValueManager(self.container)
        doc = svm.read_value('grt_CE01ISSM-MF005-01-CTDBPC999_TEMPWAT')
        np.testing.assert_array_almost_equal(doc['grt_min_value'], -2.)
Example #10
    def fetch_lookup_values(self):
        for lv in self._lookup_values():
            context = self.context(lv)
            if context.document_key:
                svm = StoredValueManager(Container.instance)
                try:
                    doc = svm.read_value(context.document_key)
                except NotFound:
                    continue
                if context.lookup_value in doc:
                    self[lv] = doc[context.lookup_value]

    def fetch_lookup_values(self):
        for lv in self._lookup_values():
            context = self.context(lv)
            if context.document_key:
                document_key = context.document_key
                # Substitute the stream's reference designator into templated document keys
                if '$designator' in context.document_key and 'reference_designator' in self._stream_config:
                    document_key = document_key.replace('$designator', self._stream_config['reference_designator'])
                svm = StoredValueManager(Container.instance)
                try:
                    doc = svm.read_value(document_key)
                except NotFound:
                    log.debug('Reference Document for %s not found', document_key)
                    continue
                if context.lookup_value in doc:
                    self[lv] = [doc[context.lookup_value]] * self._shp[0] if self._shp else doc[context.lookup_value]
Example #12
    def fetch_lookup_values(self):
        for lv in self._lookup_values():
            context = self.context(lv)
            if context.document_key:
                document_key = context.document_key
                if '$designator' in context.document_key and 'reference_designator' in self._stream_config:
                    document_key = document_key.replace(
                        '$designator',
                        self._stream_config['reference_designator'])
                svm = StoredValueManager(Container.instance)
                try:
                    doc = svm.read_value(document_key)
                except NotFound:
                    log.debug('Reference Document for %s not found',
                              document_key)
                    continue
                if context.lookup_value in doc:
                    self[lv] = ([doc[context.lookup_value]] * self._shp[0]
                                if self._shp else doc[context.lookup_value])
class ScienceGranuleIngestionWorker(TransformStreamListener):
    CACHE_LIMIT=CFG.get_safe('container.ingestion_cache',5)

    def __init__(self, *args,**kwargs):
        super(ScienceGranuleIngestionWorker, self).__init__(*args, **kwargs)
        #--------------------------------------------------------------------------------
        # Ingestion Cache
        # - Datasets
        # - Coverage instances
        #--------------------------------------------------------------------------------
        self._datasets  = collections.OrderedDict()
        self._coverages = collections.OrderedDict()

        self._bad_coverages = {}

        self.time_stats = Accumulator(format='%3f')
        # unique ID to identify this worker in log msgs
        self._id = uuid.uuid1()

    def on_start(self):  # pragma: no cover
        super(ScienceGranuleIngestionWorker,self).on_start()
        self.event_publisher = EventPublisher(OT.DatasetModified)
        self.stored_value_manager = StoredValueManager(self.container)

        self.lookup_docs = self.CFG.get_safe('process.lookup_docs',[])
        self.input_product = self.CFG.get_safe('process.input_product','')
        self.qc_enabled = self.CFG.get_safe('process.qc_enabled', True)
        self.new_lookups = Queue()
        self.lookup_monitor = EventSubscriber(event_type=OT.ExternalReferencesUpdatedEvent, callback=self._add_lookups, auto_delete=True)
        self.lookup_monitor.start()
        self.qc_publisher = EventPublisher(event_type=OT.ParameterQCEvent)
        self.connection_id = ''
        self.connection_index = None


    def on_quit(self):  # pragma: no cover
        super(ScienceGranuleIngestionWorker, self).on_quit()
        for stream, coverage in self._coverages.iteritems():
            try:
                coverage.close(timeout=5)
            except:
                log.exception('Problems closing the coverage')
    
    def _add_lookups(self, event, *args, **kwargs):
        if event.origin == self.input_product:
            if isinstance(event.reference_keys, list):
                self.new_lookups.put(event.reference_keys)

    def _new_dataset(self, stream_id):
        '''
        Adds a new dataset to the internal cache of the ingestion worker
        '''
        rr_client = ResourceRegistryServiceClient()
        datasets, _ = rr_client.find_subjects(subject_type=RT.Dataset,predicate=PRED.hasStream,object=stream_id,id_only=True)
        if datasets:
            return datasets[0]
        return None
    
    def get_dataset(self,stream_id):
        '''
        Memoization (LRU) of _new_dataset
        '''
        try:
            result = self._datasets.pop(stream_id)
        except KeyError:
            result = self._new_dataset(stream_id)
            if result is None:
                return None
            if len(self._datasets) >= self.CACHE_LIMIT:
                self._datasets.popitem(0)
        self._datasets[stream_id] = result
        return result

    def get_coverage(self, stream_id):
        '''
        Memoization (LRU) of _get_coverage
        '''
        try:
            result = self._coverages.pop(stream_id)
        except KeyError:
            dataset_id = self.get_dataset(stream_id)
            if dataset_id is None:
                return None
            result = DatasetManagementService._get_simplex_coverage(dataset_id, mode='a')
            if result is None:
                return None
            if len(self._coverages) >= self.CACHE_LIMIT:
                k, coverage = self._coverages.popitem(0)
                coverage.close(timeout=5)
        self._coverages[stream_id] = result
        return result

    def gap_coverage(self,stream_id):
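        # Swap in a fresh simplex coverage for this stream so data arriving after a
        # detected gap is persisted separately and spliced back in later.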
        try:
            old_cov = self._coverages.pop(stream_id)
            dataset_id = self.get_dataset(stream_id)
            sdom, tdom = time_series_domain()
            new_cov = DatasetManagementService._create_simplex_coverage(dataset_id, old_cov.parameter_dictionary, sdom, tdom, old_cov._persistence_layer.inline_data_writes)
            old_cov.close()
            result = new_cov
        except KeyError:
            result = self.get_coverage(stream_id)
        self._coverages[stream_id] = result
        return result


    def dataset_changed(self, dataset_id, extents, window):
        self.event_publisher.publish_event(origin=dataset_id, author=self.id, extents=extents, window=window)

    def evaluate_qc(self, rdt, dataset_id):
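        # Scan the *_qc parameters and publish a ParameterQCEvent whenever a granule
        # contains a failed QC flag.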
        if self.qc_enabled:
            for field in rdt.fields:
                if not field.endswith('_qc'):
                    continue
                try:
                    values = rdt[field]
                    if values is not None:
                        if not all(values):
                            topology = np.nonzero(values)
                            first_occurrence = topology[0][0]
                            ts = rdt[rdt.temporal_parameter][first_occurrence]
                            self.flag_qc_parameter(dataset_id, field, ts, {})
                except:
                    continue
    def flag_qc_parameter(self, dataset_id, parameter, temporal_value, configuration):
        self.qc_publisher.publish_event(origin=dataset_id, qc_parameter=parameter, temporal_value=temporal_value, configuration=configuration)

    def update_connection_index(self, connection_id, connection_index):
        self.connection_id = connection_id
        try:
            connection_index = int(connection_index)
            self.connection_index = connection_index
        except ValueError:
            pass

    def has_gap(self, connection_id, connection_index):
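        # A gap is reported when the connection id changes or the connection index
        # does not follow the previously recorded index by exactly one.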
        if connection_id:
            if not self.connection_id:
                self.update_connection_index(connection_id, connection_index)
                return False
            else:
                if connection_id != self.connection_id:
                    return True
        if connection_index:
            if self.connection_index is None:
                self.update_connection_index(connection_id, connection_index)
                return False
            try:
                connection_index = int(connection_index)
                if connection_index != self.connection_index+1:
                    return True
            except ValueError:
                pass

        return False

    def splice_coverage(self, dataset_id, coverage):
        log.info('Splicing new coverage')
        DatasetManagementService._splice_coverage(dataset_id, coverage)

    @handle_stream_exception()
    def recv_packet(self, msg, stream_route, stream_id):
        ''' receive packet for ingestion '''
        log.trace('received granule for stream %s', stream_id)

        if msg == {}:
            log.error('Received empty message from stream: %s', stream_id)
            return
        # Message validation
        if not isinstance(msg, Granule):
            log.error('Ingestion received a message that is not a granule: %s', msg)
            return


        rdt = RecordDictionaryTool.load_from_granule(msg)
        if rdt is None:
            log.error('Invalid granule (no RDT) for stream %s', stream_id)
            return
        if not len(rdt):
            log.debug('Empty granule for stream %s', stream_id)
            return

        self.persist_or_timeout(stream_id, rdt)

    def persist_or_timeout(self, stream_id, rdt):
        """ retry writing coverage multiple times and eventually time out """
        done = False
        timeout = 2
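        # initial retry delay in seconds; doubled after each failure and capped at five minutes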
        start = time.time()
        while not done:
            try:
                self.add_granule(stream_id, rdt)
                done = True
            except:
                log.exception('An issue with coverage, retrying after a bit')
                if (time.time() - start) > MAX_RETRY_TIME: # After an hour just give up
                    dataset_id = self.get_dataset(stream_id)
                    log.error("We're giving up, the coverage needs to be inspected %s", DatasetManagementService._get_coverage_path(dataset_id))
                    raise

                if stream_id in self._coverages:
                    log.info('Popping coverage for stream %s', stream_id)
                    self._coverages.pop(stream_id)

                gevent.sleep(timeout)
                if timeout > (60 * 5):
                    timeout = 60 * 5
                else:
                    timeout *= 2


    def expand_coverage(self, coverage, elements, stream_id):
        try:
            coverage.insert_timesteps(elements, oob=False)
        except IOError as e:
            log.error("Couldn't insert time steps for coverage: %s",
                      coverage.persistence_dir, exc_info=True)
            try:
                coverage.close()
            finally:
                self._bad_coverages[stream_id] = 1
                raise CorruptionError(e.message)
    
    def get_stored_values(self, lookup_value):
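        # Fold in any newly announced lookup documents, then search the known
        # documents in order for the requested lookup value.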
        if not self.new_lookups.empty():
            new_values = self.new_lookups.get()
            self.lookup_docs = new_values + self.lookup_docs
        lookup_value_document_keys = self.lookup_docs
        for key in lookup_value_document_keys:
            try:
                document = self.stored_value_manager.read_value(key)
                if lookup_value in document:
                    return document[lookup_value] 
            except NotFound:
                log.warning('Specified lookup document does not exist')
        return None


    def fill_lookup_values(self, rdt):
        rdt.fetch_lookup_values()
        for field in rdt.lookup_values():
            value = self.get_stored_values(rdt.context(field).lookup_value)
            if value:
                rdt[field] = value

    def insert_sparse_values(self, coverage, rdt, stream_id):

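        # Sparse constant parameters are written before the coverage gains new
        # timesteps; all other parameters are written later in insert_values().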
        self.fill_lookup_values(rdt)
        for field in rdt._lookup_values():
            if rdt[field] is None:
                continue
            if not isinstance(rdt.context(field).param_type, SparseConstantType):
                # We only set sparse values before insert
                continue 
            value = rdt[field]
            try:
                coverage.set_parameter_values(param_name=field, value=value)
            except IOError as e:
                log.error("Couldn't insert values for coverage: %s",
                          coverage.persistence_dir, exc_info=True)
                try:
                    coverage.close()
                finally:
                    self._bad_coverages[stream_id] = 1
                    raise CorruptionError(e.message)

    def insert_values(self, coverage, rdt, stream_id):
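        # Write each non-sparse parameter into the timesteps that were just appended to the coverage.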
        elements = len(rdt)

        start_index = coverage.num_timesteps - elements

        for k,v in rdt.iteritems():
            if isinstance(v, SparseConstantValue):
                continue
            slice_ = slice(start_index, None)
            try:
                coverage.set_parameter_values(param_name=k, tdoa=slice_, value=v)
            except IOError as e:
                log.error("Couldn't insert values for coverage: %s",
                          coverage.persistence_dir, exc_info=True)
                try:
                    coverage.close()
                finally:
                    self._bad_coverages[stream_id] = 1
                    raise CorruptionError(e.message)
    
        if 'ingestion_timestamp' in coverage.list_parameters():
            t_now = time.time()
            ntp_time = TimeUtils.ts_to_units(coverage.get_parameter_context('ingestion_timestamp').uom, t_now)
            coverage.set_parameter_values(param_name='ingestion_timestamp', tdoa=slice_, value=ntp_time)
    
    def add_granule(self,stream_id, rdt):
        ''' Appends the granule's data to the coverage and persists it. '''
        debugging = log.isEnabledFor(DEBUG)
        timer = Timer() if debugging else None
        if stream_id in self._bad_coverages:
            log.info('Message attempting to be inserted into bad coverage: %s',
                     DatasetManagementService._get_coverage_path(self.get_dataset(stream_id)))
            
        #--------------------------------------------------------------------------------
        # Gap Analysis
        #--------------------------------------------------------------------------------
        gap_found = self.has_gap(rdt.connection_id, rdt.connection_index)
        if gap_found:
            log.error('Gap Found!   New connection: (%s,%s)\tOld Connection: (%s,%s)', rdt.connection_id, rdt.connection_index, self.connection_id, self.connection_index)
            self.gap_coverage(stream_id)



        #--------------------------------------------------------------------------------
        # Coverage determination and appending
        #--------------------------------------------------------------------------------
        dataset_id = self.get_dataset(stream_id)
        if not dataset_id:
            log.error('No dataset could be determined on this stream: %s', stream_id)
            return

        try:
            coverage = self.get_coverage(stream_id)
        except IOError as e:
            log.error("Couldn't open coverage: %s",
                      DatasetManagementService._get_coverage_path(self.get_dataset(stream_id)))
            raise CorruptionError(e.message)

        if debugging:
            path = DatasetManagementService._get_coverage_path(dataset_id)
            log.debug('%s: add_granule stream %s dataset %s coverage %r file %s',
                      self._id, stream_id, dataset_id, coverage, path)

        if not coverage:
            log.error('Could not persist coverage from granule, coverage is None')
            return
        #--------------------------------------------------------------------------------
        # Actual persistence
        #--------------------------------------------------------------------------------

        elements = len(rdt)

        self.insert_sparse_values(coverage,rdt,stream_id)
        
        if debugging:
            timer.complete_step('checks') # lightweight ops, should be zero
        
        self.expand_coverage(coverage, elements, stream_id)
        
        if debugging:
            timer.complete_step('insert')

        self.insert_values(coverage, rdt, stream_id)
        
        if debugging:
            timer.complete_step('keys')
        
        DatasetManagementService._save_coverage(coverage)
        
        if debugging:
            timer.complete_step('save')
        
        start_index = coverage.num_timesteps - elements
        self.dataset_changed(dataset_id,coverage.num_timesteps,(start_index,start_index+elements))

        if gap_found:
            self.splice_coverage(dataset_id, coverage)

        self.evaluate_qc(rdt, dataset_id)
        
        if debugging:
            timer.complete_step('notify')
            self._add_timing_stats(timer)

        self.update_connection_index(rdt.connection_id, rdt.connection_index)

    def _add_timing_stats(self, timer):
        """ add stats from latest coverage operation to Accumulator and periodically log results """
        self.time_stats.add(timer)
        if self.time_stats.get_count() % REPORT_FREQUENCY>0:
            return

        if log.isEnabledFor(TRACE):
            # report per step
            for step in 'checks', 'insert', 'keys', 'save', 'notify':
                log.debug('%s step %s times: %s', self._id, step, self.time_stats.to_string(step))
        # report totals
        log.debug('%s total times: %s', self._id, self.time_stats)
Example #14
class TestParsers(IonIntegrationTestCase):
    def setUp(self):
        self._start_container()
        self.svm = StoredValueManager(self.container)

    @classmethod
    def parse_document(cls, container, parser, document_path):
        svm = StoredValueManager(container)
        document = ''
        with open(document_path, 'r') as f:
            document = f.read()
        for k, v in parser(document):
            svm.stored_value_cas(k, v)
        return

    def test_grt_parser(self):
        self.parse_document(self.container, grt_parser, qc_paths['grt'])

        ret_doc = self.svm.read_value(
            'grt_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'CONDWAT'))
        np.testing.assert_almost_equal(ret_doc['grt_min_value'], 0.)
        np.testing.assert_almost_equal(ret_doc['grt_max_value'], 66000.)

    def test_spike_parser(self):
        self.parse_document(self.container, spike_parser, qc_paths['spike'])

        ret_doc = self.svm.read_value(
            'spike_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'PRACSAL'))
        np.testing.assert_almost_equal(ret_doc['acc'], 0.005)
        np.testing.assert_almost_equal(ret_doc['spike_n'], 11.)
        np.testing.assert_almost_equal(ret_doc['spike_l'], 15.)

        self.assertRaises(
            NotFound, self.svm.read_value,
            'spike_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'CLOWN'))

    def test_stuck_value_parser(self):
        self.parse_document(self.container, stuck_value_test_parser,
                            qc_paths['stuck'])

        ret_doc = self.svm.read_value(
            'svt_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'PRACSAL'))
        np.testing.assert_almost_equal(ret_doc['svt_resolution'], 1e-9)
        np.testing.assert_almost_equal(ret_doc['svt_n'], 1000000)

    def test_trend_value_parser(self):
        self.parse_document(self.container, trend_parser, qc_paths['trend'])

        ret_doc = self.svm.read_value(
            'trend_%s_%s' % ('CE04OSBP-LJ01C-06-CTDBPO108', 'PRACSAL'))
        np.testing.assert_almost_equal(ret_doc['time_interval'], 365.0)
        np.testing.assert_almost_equal(ret_doc['polynomial_order'], 1.0)
        np.testing.assert_almost_equal(ret_doc['standard_deviation'], 0.0)

    def test_lrt_parser(self):
        self.parse_document(self.container, lrt_parser, qc_paths['lrt'])

        ret_doc = self.svm.read_value(
            'lrt_%s_%s' % ('GP02HYPM-SP001-04-CTDPF0999', 'PRACSAL'))

        np.testing.assert_array_equal(ret_doc['datlim'][0],
                                      np.array([32.289, 32.927]))
        self.assertEquals(ret_doc['dims'], ['pressure', 'month'])


class ScienceGranuleIngestionWorker(TransformStreamListener,
                                    BaseIngestionWorker):
    CACHE_LIMIT = CFG.get_safe('container.ingestion_cache', 5)

    def __init__(self, *args, **kwargs):
        TransformStreamListener.__init__(self, *args, **kwargs)
        BaseIngestionWorker.__init__(self, *args, **kwargs)

        #--------------------------------------------------------------------------------
        # Ingestion Cache
        # - Datasets
        # - Coverage instances
        #--------------------------------------------------------------------------------
        self._datasets = collections.OrderedDict()
        self._coverages = collections.OrderedDict()

        self._bad_coverages = {}

        self.time_stats = Accumulator(format='%3f')
        # unique ID to identify this worker in log msgs
        self._id = uuid.uuid1()

    def on_start(self):  # pragma: no cover
        #--------------------------------------------------------------------------------
        # Explicit on_start
        #--------------------------------------------------------------------------------

        # Skip TransformStreamListener and go to StreamProcess to avoid the subscriber being created
        # We want explicit management of the thread and subscriber object for ingestion

        TransformStreamProcess.on_start(self)

        self.queue_name = self.CFG.get_safe('process.queue_name', self.id)
        self.subscriber = StreamSubscriber(process=self,
                                           exchange_name=self.queue_name,
                                           callback=self.receive_callback)
        self.thread_lock = RLock()

        #--------------------------------------------------------------------------------
        # Normal on_start after this point
        #--------------------------------------------------------------------------------

        BaseIngestionWorker.on_start(self)
        self._rpc_server = self.container.proc_manager._create_listening_endpoint(
            from_name=self.id, process=self)
        self.add_endpoint(self._rpc_server)

        self.event_publisher = EventPublisher(OT.DatasetModified)
        self.stored_value_manager = StoredValueManager(self.container)

        self.lookup_docs = self.CFG.get_safe('process.lookup_docs', [])
        self.input_product = self.CFG.get_safe('process.input_product', '')
        self.qc_enabled = self.CFG.get_safe('process.qc_enabled', True)
        self.ignore_gaps = self.CFG.get_safe('service.ingestion.ignore_gaps',
                                             False)
        self.new_lookups = Queue()
        self.lookup_monitor = EventSubscriber(
            event_type=OT.ExternalReferencesUpdatedEvent,
            callback=self._add_lookups,
            auto_delete=True)
        self.add_endpoint(self.lookup_monitor)
        self.qc_publisher = EventPublisher(event_type=OT.ParameterQCEvent)
        self.connection_id = ''
        self.connection_index = None

        self.start_listener()

    def on_quit(self):  # pragma: no cover
        self.event_publisher.close()
        self.qc_publisher.close()
        if self.subscriber_thread:
            self.stop_listener()
        for stream, coverage in self._coverages.iteritems():
            try:
                coverage.close(timeout=5)
            except:
                log.exception('Problems closing the coverage')
        self._coverages.clear()
        TransformStreamListener.on_quit(self)
        BaseIngestionWorker.on_quit(self)

    def start_listener(self):
        # We use a lock here to prevent possible race conditions from starting multiple listeners and coverage clobbering
        with self.thread_lock:
            self.subscriber_thread = self._process.thread_manager.spawn(
                self.subscriber.listen, thread_name='%s-subscriber' % self.id)

    def stop_listener(self):
        # Avoid race conditions with coverage operations (Don't start a listener at the same time as closing one)
        with self.thread_lock:
            self.subscriber.close()
            self.subscriber_thread.join(timeout=10)
            for stream, coverage in self._coverages.iteritems():
                try:
                    coverage.close(timeout=5)
                except:
                    log.exception('Problems closing the coverage')
            self._coverages.clear()
            self.subscriber_thread = None

    def pause(self):
        if self.subscriber_thread is not None:
            self.stop_listener()

    def resume(self):
        if self.subscriber_thread is None:
            self.start_listener()

    def _add_lookups(self, event, *args, **kwargs):
        if event.origin == self.input_product:
            if isinstance(event.reference_keys, list):
                self.new_lookups.put(event.reference_keys)

    def _new_dataset(self, stream_id):
        '''
        Adds a new dataset to the internal cache of the ingestion worker
        '''
        rr_client = ResourceRegistryServiceClient()
        datasets, _ = rr_client.find_subjects(subject_type=RT.Dataset,
                                              predicate=PRED.hasStream,
                                              object=stream_id,
                                              id_only=True)
        if datasets:
            return datasets[0]
        return None

    def get_dataset(self, stream_id):
        '''
        Memoization (LRU) of _new_dataset
        '''
        try:
            result = self._datasets.pop(stream_id)
        except KeyError:
            result = self._new_dataset(stream_id)
            if result is None:
                return None
            if len(self._datasets) >= self.CACHE_LIMIT:
                self._datasets.popitem(0)
        self._datasets[stream_id] = result
        return result

    def get_coverage(self, stream_id):
        '''
        Memoization (LRU) of _get_coverage
        '''
        try:
            result = self._coverages.pop(stream_id)
        except KeyError:
            dataset_id = self.get_dataset(stream_id)
            if dataset_id is None:
                return None
            result = DatasetManagementService._get_simplex_coverage(dataset_id,
                                                                    mode='a')
            if result is None:
                return None
            if len(self._coverages) >= self.CACHE_LIMIT:
                k, coverage = self._coverages.popitem(0)
                coverage.close(timeout=5)
        self._coverages[stream_id] = result
        return result

    def gap_coverage(self, stream_id):
        try:
            old_cov = self._coverages.pop(stream_id)
            dataset_id = self.get_dataset(stream_id)
            sdom, tdom = time_series_domain()
            new_cov = DatasetManagementService._create_simplex_coverage(
                dataset_id, old_cov.parameter_dictionary, sdom, tdom,
                old_cov._persistence_layer.inline_data_writes)
            old_cov.close()
            result = new_cov
        except KeyError:
            result = self.get_coverage(stream_id)
        self._coverages[stream_id] = result
        return result

    def dataset_changed(self, dataset_id, extents, window):
        self.event_publisher.publish_event(origin=dataset_id,
                                           author=self.id,
                                           extents=extents,
                                           window=window)

    def evaluate_qc(self, rdt, dataset_id):
        if self.qc_enabled:
            for field in rdt.fields:
                if not (field.endswith('glblrng_qc')
                        or field.endswith('loclrng_qc')):
                    continue
                try:
                    values = rdt[field]
                    if values is not None:
                        if not all(values):
                            topology = np.where(values == 0)
                            timestamps = rdt[rdt.temporal_parameter][
                                topology[0]]
                            self.flag_qc_parameter(dataset_id, field,
                                                   timestamps.tolist(), {})
                except:
                    continue

    def flag_qc_parameter(self, dataset_id, parameter, temporal_values,
                          configuration):
        data_product_ids, _ = self.container.resource_registry.find_subjects(
            object=dataset_id,
            predicate=PRED.hasDataset,
            subject_type=RT.DataProduct,
            id_only=True)
        for data_product_id in data_product_ids:
            description = 'Automated Quality Control Alerted on %s' % parameter
            self.qc_publisher.publish_event(origin=data_product_id,
                                            qc_parameter=parameter,
                                            temporal_values=temporal_values,
                                            configuration=configuration,
                                            description=description)

    def update_connection_index(self, connection_id, connection_index):
        self.connection_id = connection_id
        try:
            connection_index = int(connection_index)
            self.connection_index = connection_index
        except ValueError:
            pass

    def has_gap(self, connection_id, connection_index):
        if connection_id:
            if not self.connection_id:
                self.update_connection_index(connection_id, connection_index)
                return False
            else:
                if connection_id != self.connection_id:
                    return True
        if connection_index:
            if self.connection_index is None:
                self.update_connection_index(connection_id, connection_index)
                return False
            try:
                connection_index = int(connection_index)
                if connection_index != self.connection_index + 1:
                    return True
            except ValueError:
                pass

        return False

    def splice_coverage(self, dataset_id, coverage):
        log.info('Splicing new coverage')
        DatasetManagementService._splice_coverage(dataset_id, coverage)

    @handle_stream_exception()
    def recv_packet(self, msg, stream_route, stream_id):
        ''' receive packet for ingestion '''
        log.trace('received granule for stream %s', stream_id)

        if msg == {}:
            log.error('Received empty message from stream: %s', stream_id)
            return
        # Message validation
        if not isinstance(msg, Granule):
            log.error('Ingestion received a message that is not a granule: %s',
                      msg)
            return

        rdt = RecordDictionaryTool.load_from_granule(msg)
        if rdt is None:
            log.error('Invalid granule (no RDT) for stream %s', stream_id)
            return
        if not len(rdt):
            log.debug('Empty granule for stream %s', stream_id)
            return

        self.persist_or_timeout(stream_id, rdt)

    def persist_or_timeout(self, stream_id, rdt):
        """ retry writing coverage multiple times and eventually time out """
        done = False
        timeout = 2
        start = time.time()
        while not done:
            try:
                self.add_granule(stream_id, rdt)
                done = True
            except:
                log.exception('An issue with coverage, retrying after a bit')
                if (time.time() -
                        start) > MAX_RETRY_TIME:  # After an hour just give up
                    dataset_id = self.get_dataset(stream_id)
                    log.error(
                        "We're giving up, the coverage needs to be inspected %s",
                        DatasetManagementService._get_coverage_path(
                            dataset_id))
                    raise

                if stream_id in self._coverages:
                    log.info('Popping coverage for stream %s', stream_id)
                    self._coverages.pop(stream_id)

                gevent.sleep(timeout)
                if timeout > (60 * 5):
                    timeout = 60 * 5
                else:
                    timeout *= 2

    def expand_coverage(self, coverage, elements, stream_id):
        try:
            coverage.insert_timesteps(elements, oob=False)
        except IOError as e:
            log.error("Couldn't insert time steps for coverage: %s",
                      coverage.persistence_dir,
                      exc_info=True)
            try:
                coverage.close()
            finally:
                self._bad_coverages[stream_id] = 1
                raise CorruptionError(e.message)

    def get_stored_values(self, lookup_value):
        if not self.new_lookups.empty():
            new_values = self.new_lookups.get()
            self.lookup_docs = new_values + self.lookup_docs
        lookup_value_document_keys = self.lookup_docs
        for key in lookup_value_document_keys:
            try:
                document = self.stored_value_manager.read_value(key)
                if lookup_value in document:
                    return document[lookup_value]
            except NotFound:
                log.warning('Specified lookup document does not exist')
        return None

    def fill_lookup_values(self, rdt):
        rdt.fetch_lookup_values()
        for field in rdt.lookup_values():
            value = self.get_stored_values(rdt.context(field).lookup_value)
            if value:
                rdt[field] = value

    def insert_sparse_values(self, coverage, rdt, stream_id):

        self.fill_lookup_values(rdt)
        for field in rdt.fields:
            if rdt[field] is None:
                continue
            if not isinstance(
                    rdt.context(field).param_type, SparseConstantType):
                # We only set sparse values before insert
                continue
            value = rdt[field]
            try:
                coverage.set_parameter_values(param_name=field, value=value)
            except ValueError as e:
                if "'lower_bound' cannot be >= 'upper_bound'" in e.message:
                    continue
                else:
                    raise
            except IOError as e:
                log.error("Couldn't insert values for coverage: %s",
                          coverage.persistence_dir,
                          exc_info=True)
                try:
                    coverage.close()
                finally:
                    self._bad_coverages[stream_id] = 1
                    raise CorruptionError(e.message)

    def insert_values(self, coverage, rdt, stream_id):
        elements = len(rdt)

        start_index = coverage.num_timesteps - elements

        for k, v in rdt.iteritems():
            if isinstance(v, SparseConstantValue):
                continue
            slice_ = slice(start_index, None)
            try:
                coverage.set_parameter_values(param_name=k,
                                              tdoa=slice_,
                                              value=v)
            except IOError as e:
                log.error("Couldn't insert values for coverage: %s",
                          coverage.persistence_dir,
                          exc_info=True)
                try:
                    coverage.close()
                finally:
                    self._bad_coverages[stream_id] = 1
                    raise CorruptionError(e.message)

        if 'ingestion_timestamp' in coverage.list_parameters():
            t_now = time.time()
            ntp_time = TimeUtils.ts_to_units(
                coverage.get_parameter_context('ingestion_timestamp').uom,
                t_now)
            coverage.set_parameter_values(param_name='ingestion_timestamp',
                                          tdoa=slice_,
                                          value=ntp_time)

    def add_granule(self, stream_id, rdt):
        ''' Appends the granule's data to the coverage and persists it. '''
        debugging = log.isEnabledFor(DEBUG)
        timer = Timer() if debugging else None
        if stream_id in self._bad_coverages:
            log.info(
                'Message attempting to be inserted into bad coverage: %s',
                DatasetManagementService._get_coverage_path(
                    self.get_dataset(stream_id)))

        #--------------------------------------------------------------------------------
        # Gap Analysis
        #--------------------------------------------------------------------------------
        if not self.ignore_gaps:
            gap_found = self.has_gap(rdt.connection_id, rdt.connection_index)
            if gap_found:
                log.error(
                    'Gap Found!   New connection: (%s,%s)\tOld Connection: (%s,%s)',
                    rdt.connection_id, rdt.connection_index,
                    self.connection_id, self.connection_index)
                self.gap_coverage(stream_id)

        #--------------------------------------------------------------------------------
        # Coverage determination and appending
        #--------------------------------------------------------------------------------
        dataset_id = self.get_dataset(stream_id)
        if not dataset_id:
            log.error('No dataset could be determined on this stream: %s',
                      stream_id)
            return

        try:
            coverage = self.get_coverage(stream_id)
        except IOError as e:
            log.error(
                "Couldn't open coverage: %s",
                DatasetManagementService._get_coverage_path(
                    self.get_dataset(stream_id)))
            raise CorruptionError(e.message)

        if debugging:
            path = DatasetManagementService._get_coverage_path(dataset_id)
            log.debug(
                '%s: add_granule stream %s dataset %s coverage %r file %s',
                self._id, stream_id, dataset_id, coverage, path)

        if not coverage:
            log.error(
                'Could not persist coverage from granule, coverage is None')
            return
        #--------------------------------------------------------------------------------
        # Actual persistence
        #--------------------------------------------------------------------------------

        elements = len(rdt)
        if rdt[rdt.temporal_parameter] is None:
            elements = 0

        self.insert_sparse_values(coverage, rdt, stream_id)

        if debugging:
            timer.complete_step('checks')  # lightweight ops, should be zero

        self.expand_coverage(coverage, elements, stream_id)

        if debugging:
            timer.complete_step('insert')

        self.insert_values(coverage, rdt, stream_id)

        if debugging:
            timer.complete_step('keys')

        DatasetManagementService._save_coverage(coverage)

        if debugging:
            timer.complete_step('save')

        start_index = coverage.num_timesteps - elements
        self.dataset_changed(dataset_id, coverage.num_timesteps,
                             (start_index, start_index + elements))

        if not self.ignore_gaps and gap_found:
            self.splice_coverage(dataset_id, coverage)

        self.evaluate_qc(rdt, dataset_id)

        if debugging:
            timer.complete_step('notify')
            self._add_timing_stats(timer)

        self.update_connection_index(rdt.connection_id, rdt.connection_index)

    def _add_timing_stats(self, timer):
        """ add stats from latest coverage operation to Accumulator and periodically log results """
        self.time_stats.add(timer)
        if self.time_stats.get_count() % REPORT_FREQUENCY > 0:
            return

        if log.isEnabledFor(TRACE):
            # report per step
            for step in 'checks', 'insert', 'keys', 'save', 'notify':
                log.debug('%s step %s times: %s', self._id, step,
                          self.time_stats.to_string(step))
        # report totals
        log.debug('%s total times: %s', self._id, self.time_stats)
Example #16
class TransformPrime(TransformDataProcess):
    binding=['output']
    '''
    Transforms which have an incoming stream and an outgoing stream.

    Parameters:
      process.stream_id      Outgoing stream identifier.
      process.exchange_point Route's exchange point.
      process.routing_key    Route's routing key.
      process.queue_name     Name of the queue to listen on.
      process.routes         Mapping of {(stream_input_id, stream_output_id): actor} for each route.

    Either the stream_id or both the exchange_point and routing_key need to be provided.
    '''
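    # A minimal configuration sketch (key names taken from the docstring above and from
    # on_start/recv_packet below; the values here are illustrative placeholders only):
    #
    #   process.queue_name  = 'transform_prime'
    #   process.routes      = {('stream_in_id', 'stream_out_id'): None}  # None routes through _execute_transform
    #   process.lookup_docs = ['grt_reference_doc']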
    def on_start(self):
        TransformDataProcess.on_start(self)
        self.pubsub_management = PubsubManagementServiceProcessClient(process=self)
        self.stored_values = StoredValueManager(self.container)
        self.input_data_product_ids = self.CFG.get_safe('process.input_products', [])
        self.output_data_product_ids = self.CFG.get_safe('process.output_products', [])
        self.lookup_docs = self.CFG.get_safe('process.lookup_docs',[])
        self.new_lookups = Queue()
        self.lookup_monitor = EventSubscriber(event_type=OT.ExternalReferencesUpdatedEvent,callback=self._add_lookups, auto_delete=True)
        self.lookup_monitor.start()

    def on_quit(self):
        self.lookup_monitor.stop()
        TransformDataProcess.on_quit(self)

    def _add_lookups(self, event, *args, **kwargs):
        if event.origin in self.input_data_product_ids + self.output_data_product_ids:
            if isinstance(event.reference_keys, list):
                self.new_lookups.put(event.reference_keys)


    @memoize_lru(100)
    def read_stream_def(self,stream_id):
        return self.pubsub_management.read_stream_definition(stream_id=stream_id)

    
    def recv_packet(self, msg, stream_route, stream_id):
        process_routes = self.CFG.get_safe('process.routes', {})
        for stream_in_id,routes in process_routes.iteritems():
            if stream_id == stream_in_id:
                for stream_out_id, actor in routes.iteritems():
                    if actor is None:
                        rdt_out = self._execute_transform(msg, (stream_in_id, stream_out_id))
                        self.publish(rdt_out.to_granule(), stream_out_id)
                    else:
                        outgoing = self._execute_actor(msg, actor, (stream_in_id, stream_out_id))
                        self.publish(outgoing, stream_out_id)

    def publish(self, msg, stream_out_id):
        publisher = getattr(self, stream_out_id)
        publisher.publish(msg)

    def _load_actor(self, actor):
        '''
        Returns callable execute method if it exists, otherwise it raises a BadRequest
        '''
        try:
            module = __import__(actor['module'], fromlist=[''])
        except ImportError:
            log.exception('Actor could not be loaded')
            raise
        try:
            cls = getattr(module, actor['class'])
        except AttributeError:
            log.exception('Module %s does not have class %s', repr(module), actor['class'])
            raise
        try:
            execute = getattr(cls,'execute')
        except AttributeError:
            log.exception('Actor class does not contain execute method')
            raise
        return execute

   
    def _execute_actor(self, msg, actor, streams):
        stream_in_id,stream_out_id = streams
        stream_def_out = self.read_stream_def(stream_out_id)
        params = self.CFG.get_safe('process.params', {})
        config = self.CFG.get_safe('process')
        #do the stuff with the actor
        params['stream_def'] = stream_def_out._id
        executor = self._load_actor(actor)
        try:
            rdt_out = executor(msg, None, config, params, None)
        except:
            log.exception('Error running actor for %s', self.id)
            raise
        return rdt_out

    def _merge_pdicts(self, pdict1, pdict2):
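        # Union of the two parameter dictionaries; when a name appears in both, the
        # context from the incoming (first) dictionary is kept.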
        incoming_pdict = ParameterDictionary.load(pdict1)
        outgoing_pdict = ParameterDictionary.load(pdict2)
        
        merged_pdict = ParameterDictionary()
        for k,v in incoming_pdict.iteritems():
            ordinal, v = v
            if k not in merged_pdict:
                merged_pdict.add_context(v)
        for k,v in outgoing_pdict.iteritems():
            ordinal, v = v
            if k not in merged_pdict:
                merged_pdict.add_context(v)
        return merged_pdict

    def _merge_rdt(self, stream_def_in, stream_def_out):
        incoming_pdict_dump = stream_def_in.parameter_dictionary
        outgoing_pdict_dump = stream_def_out.parameter_dictionary

        merged_pdict = self._merge_pdicts(incoming_pdict_dump, outgoing_pdict_dump)
        rdt_temp = RecordDictionaryTool(param_dictionary=merged_pdict)
        return rdt_temp


    def _get_lookup_value(self, lookup_value):
        if not self.new_lookups.empty():
            new_values = self.new_lookups.get()
            self.lookup_docs = new_values + self.lookup_docs

        lookup_value_document_keys = self.lookup_docs
        for key in lookup_value_document_keys:
            try:
                document = self.stored_values.read_value(key)
                if lookup_value in document:
                    return document[lookup_value]
            except NotFound:
                log.warning('Specified lookup document does not exist')

        return None
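
    # Lookup resolution order (as implemented above): reference keys delivered via
    # ExternalReferencesUpdatedEvent are prepended to self.lookup_docs, so newer
    # documents take precedence, and the first stored-value document containing the
    # requested lookup_value wins.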

    def _execute_transform(self, msg, streams):
        stream_in_id,stream_out_id = streams
        stream_def_in = self.read_stream_def(stream_in_id)
        stream_def_out = self.read_stream_def(stream_out_id)

        rdt_temp = self._merge_rdt(stream_def_in, stream_def_out)
        
        rdt_in = RecordDictionaryTool.load_from_granule(msg)
        for field in rdt_temp.fields:
            if not isinstance(rdt_temp._pdict.get_context(field).param_type, ParameterFunctionType):
                try:
                    rdt_temp[field] = rdt_in[field]
                except KeyError:
                    pass

        rdt_temp.fetch_lookup_values()

        for lookup_field in rdt_temp.lookup_values():
            s = lookup_field
            stored_value = self._get_lookup_value(rdt_temp.context(s).lookup_value)
            if stored_value is not None:
                rdt_temp[s] = stored_value
        
        for field in rdt_temp.fields:
            if isinstance(rdt_temp._pdict.get_context(field).param_type, ParameterFunctionType):
                rdt_temp[field] = rdt_temp[field]

        
        rdt_out = RecordDictionaryTool(stream_definition_id=stream_def_out._id)

        for field in rdt_out.fields:
            rdt_out[field] = rdt_temp[field]
        
        return rdt_out


class ScienceGranuleIngestionWorker(TransformStreamListener, BaseIngestionWorker):
    CACHE_LIMIT=CFG.get_safe('container.ingestion_cache',5)

    def __init__(self, *args,**kwargs):
        TransformStreamListener.__init__(self, *args, **kwargs)
        BaseIngestionWorker.__init__(self, *args, **kwargs)

        #--------------------------------------------------------------------------------
        # Ingestion Cache
        # - Datasets
        # - Coverage instances
        #--------------------------------------------------------------------------------
        self._datasets  = collections.OrderedDict()
        self._coverages = collections.OrderedDict()

        self._bad_coverages = {}

        self.time_stats = Accumulator(format='%3f')
        # unique ID to identify this worker in log msgs
        self._id = uuid.uuid1()



    def on_start(self): #pragma no cover
        #--------------------------------------------------------------------------------
        # Explicit on_start
        #--------------------------------------------------------------------------------

        # Skip TransformStreamListener and go to StreamProcess to avoid the subscriber being created
        # We want explicit management of the thread and subscriber object for ingestion

        TransformStreamProcess.on_start(self)
        
        self.queue_name = self.CFG.get_safe('process.queue_name',self.id)
        self.subscriber = StreamSubscriber(process=self, exchange_name=self.queue_name, callback=self.receive_callback)
        self.thread_lock = RLock()
        
        #--------------------------------------------------------------------------------
        # Normal on_start after this point
        #--------------------------------------------------------------------------------

        BaseIngestionWorker.on_start(self)
        self._rpc_server = self.container.proc_manager._create_listening_endpoint(from_name=self.id, process=self)
        self.add_endpoint(self._rpc_server)

        self.event_publisher = EventPublisher(OT.DatasetModified)
        self.stored_value_manager = StoredValueManager(self.container)

        self.lookup_docs = self.CFG.get_safe('process.lookup_docs',[])
        self.input_product = self.CFG.get_safe('process.input_product','')
        self.qc_enabled = self.CFG.get_safe('process.qc_enabled', True)
        self.ignore_gaps = self.CFG.get_safe('service.ingestion.ignore_gaps', True)
        if not self.ignore_gaps:
            log.warning("Gap handling is not supported in release 2")
        self.ignore_gaps = True
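        # Hypothetical example of the spawn configuration keys read above:
        #   {'process': {'queue_name': 'science_granule_ingestion',
        #                'lookup_docs': ['grt_<site>_<param>_<param>'],
        #                'input_product': '<data_product_id>',
        #                'qc_enabled': True}}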
        self.new_lookups = Queue()
        self.lookup_monitor = EventSubscriber(event_type=OT.ExternalReferencesUpdatedEvent, callback=self._add_lookups, auto_delete=True)
        self.add_endpoint(self.lookup_monitor)
        self.qc_publisher = EventPublisher(event_type=OT.ParameterQCEvent)
        self.connection_id = ''
        self.connection_index = None
        
        self.start_listener()

    def on_quit(self): #pragma no cover
        self.event_publisher.close()
        self.qc_publisher.close()
        if self.subscriber_thread:
            self.stop_listener()
        for stream, coverage in self._coverages.iteritems():
            try:
                coverage.close(timeout=5)
            except:
                log.exception('Problems closing the coverage')
        self._coverages.clear()
        TransformStreamListener.on_quit(self)
        BaseIngestionWorker.on_quit(self)

    
    def start_listener(self):
        # We use a lock here to prevent possible race conditions from starting multiple listeners and coverage clobbering
        with self.thread_lock:
            self.subscriber_thread = self._process.thread_manager.spawn(self.subscriber.listen, thread_name='%s-subscriber' % self.id)

    def stop_listener(self):
        # Avoid race conditions with coverage operations (Don't start a listener at the same time as closing one)
        with self.thread_lock:
            self.subscriber.close()
            self.subscriber_thread.join(timeout=10)
            for stream, coverage in self._coverages.iteritems():
                try:
                    coverage.close(timeout=5)
                except:
                    log.exception('Problems closing the coverage')
            self._coverages.clear()
            self.subscriber_thread = None

    def pause(self):
        if self.subscriber_thread is not None:
            self.stop_listener()


    def resume(self):
        if self.subscriber_thread is None:
            self.start_listener()


    def _add_lookups(self, event, *args, **kwargs):
        if event.origin == self.input_product:
            if isinstance(event.reference_keys, list):
                self.new_lookups.put(event.reference_keys)

    def _new_dataset(self, stream_id):
        '''
        Looks up the dataset associated with the given stream; get_dataset caches the result
        '''
        rr_client = self.container.resource_registry
        datasets, _ = rr_client.find_subjects(subject_type=RT.Dataset,predicate=PRED.hasStream,object=stream_id,id_only=True)
        if datasets:
            return datasets[0]
        return None

    def _get_data_products(self, dataset_id):
        rr_client = self.container.resource_registry
        data_products, _ = rr_client.find_subjects(object=dataset_id, predicate=PRED.hasDataset, subject_type=RT.DataProduct, id_only=False)
        return data_products


    def initialize_metadata(self, dataset_id, rdt):
        '''
        Initializes a metadata document in the object store. The document
        contains information about the bounds and extents of the dataset as
        well as other metadata to improve performance.
        '''

        object_store = self.container.object_store
        key = dataset_id
        bounds = {}
        extents = {}
        last_values = {}
        rough_size = 0
        for k,v in rdt.iteritems():
            v = v[:].flatten()
            if v.dtype.char not in ('S', 'O', 'U', 'V'):
                bounds[k] = (np.min(v), np.max(v))
                last_values[k] = v[-1]
            extents[k] = len(rdt)
            rough_size += len(rdt) * 4

        doc = {'bounds':bounds, 'extents':extents, 'last_values':last_values, 'size': rough_size}
        doc = numpy_walk(doc)
        object_store.create_doc(doc, object_id=key)
        return 
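
    # Illustrative (assumed) shape of the document created above, for a dataset with a
    # single numeric parameter 'temp' and 10 records:
    #   {'bounds': {'temp': (9.8, 12.4)}, 'extents': {'temp': 10},
    #    'last_values': {'temp': 12.4}, 'size': 40}
    # (numpy_walk sanitizes numpy values before the document is stored.)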

    def update_metadata(self, dataset_id, rdt):
        '''
        Updates the metadata document with the latest information available
        '''

        self.update_data_product_metadata(dataset_id, rdt)

        # Grab the document
        object_store = self.container.object_store
        key = dataset_id
        try:
            doc = object_store.read_doc(key)
        except NotFound:
            return self.initialize_metadata(dataset_id, rdt)
        # These are the fields we're interested in
        bounds = doc['bounds']
        extents = doc['extents']
        last_values = doc['last_values']
        rough_size = doc['size']
        for k,v in rdt.iteritems():
            if k not in bounds:
                continue

            v = v[:].flatten() # Get the numpy representation (dense array).
            if v.dtype.char not in ('S', 'O', 'U', 'V'):
                l_min = np.min(v)
                l_max = np.max(v)
                o_min, o_max = bounds[k]
                bounds[k] = (min(l_min, o_min), max(l_max, o_max))
                last_values[k] = v[-1]
            # Update the bounds
            # Increase the extents
            extents[k] = extents[k] + len(rdt)
            # How about the last value?

            rough_size += len(rdt) * 4
            doc['size'] = rough_size
        # Sanitize it
        doc = numpy_walk(doc)
        object_store.update_doc(doc)

    def update_data_product_metadata(self, dataset_id, rdt):
        data_products = self._get_data_products(dataset_id)
        for data_product in data_products:
            self.update_time(data_product, rdt[rdt.temporal_parameter][:])
            self.update_geo(data_product, rdt)
            self.container.resource_registry.update(data_product)

    def update_time(self, data_product, t):
        #TODO: Account for non NTP-based timestamps
        # 2208988800 seconds is the offset between the NTP epoch (1900-01-01)
        # and the Unix epoch (1970-01-01)
        t_min = np.min(t)
        t_min -= 2208988800
        t_max = np.max(t)
        t_max -= 2208988800

        if not data_product.nominal_datetime.start_datetime:
            data_product.nominal_datetime.start_datetime = t_min
        data_product.nominal_datetime.end_datetime = t_max
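
    # Worked example: an NTP timestamp of 3618345600 becomes
    # 3618345600 - 2208988800 = 1409356800 seconds since the Unix epoch (2014-08-30 UTC).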

    def update_geo(self, data_product, rdt):
        lat = None
        lon = None
        for p in rdt:
            if rdt._rd[p] is None:
                continue
            # TODO: Not an all encompassing list of acceptable names for lat and lon
            if p.lower() in ('lat', 'latitude', 'y_axis'):
                lat = np.asscalar(rdt[p][-1])
            elif p.lower() in ('lon', 'longitude', 'x_axis'):
                lon = np.asscalar(rdt[p][-1])
            if lat and lon:
                break

        if lat and lon:
            data_product.geospatial_bounds.geospatial_latitude_limit_north = lat
            data_product.geospatial_bounds.geospatial_latitude_limit_south = lat
            data_product.geospatial_bounds.geospatial_longitude_limit_east = lon
            data_product.geospatial_bounds.geospatial_longitude_limit_west = lon


    
    def get_dataset(self,stream_id):
        '''
        Memoization (LRU) of _new_dataset
        '''
        try:
            result = self._datasets.pop(stream_id)
        except KeyError:
            result = self._new_dataset(stream_id)
            if result is None:
                return None
            if len(self._datasets) >= self.CACHE_LIMIT:
                self._datasets.popitem(0)
        self._datasets[stream_id] = result
        return result
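
    # The OrderedDict acts as a small LRU cache: pop-and-reinsert marks an entry as
    # most recently used, and popitem(0) (i.e. last=False) evicts the least recently
    # used entry once CACHE_LIMIT is reached. get_coverage below uses the same pattern
    # and closes any coverage it evicts.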

    def get_coverage(self, stream_id):
        '''
        Memoization (LRU) of the coverage associated with a stream
        '''
        try:
            result = self._coverages.pop(stream_id)
        except KeyError:
            dataset_id = self.get_dataset(stream_id)
            if dataset_id is None:
                return None
            result = DatasetManagementService._get_simplex_coverage(dataset_id, mode='a')
            if result is None:
                return None
            if len(self._coverages) >= self.CACHE_LIMIT:
                k, coverage = self._coverages.popitem(0)
                coverage.close(timeout=5)
        self._coverages[stream_id] = result
        return result

    def gap_coverage(self,stream_id):
        try:
            old_cov = self._coverages.pop(stream_id)
            dataset_id = self.get_dataset(stream_id)
            sdom, tdom = time_series_domain()
            new_cov = DatasetManagementService._create_simplex_coverage(dataset_id, old_cov.parameter_dictionary, sdom, tdom, old_cov._persistence_layer.inline_data_writes)
            old_cov.close()
            result = new_cov
        except KeyError:
            result = self.get_coverage(stream_id)
        self._coverages[stream_id] = result
        return result


    def dataset_changed(self, dataset_id, extents, window):
        self.event_publisher.publish_event(origin=dataset_id, author=self.id, extents=extents, window=window)

    def evaluate_qc(self, rdt, dataset_id):
        if self.qc_enabled:
            for field in rdt.fields:
                if not (field.endswith('glblrng_qc') or field.endswith('loclrng_qc')):
                    continue
                try:
                    values = rdt[field]
                    if values is not None:
                        if not all(values):
                            topology = np.where(values==0)
                            timestamps = rdt[rdt.temporal_parameter][topology[0]]
                            self.flag_qc_parameter(dataset_id, field, timestamps.tolist(), {})
                except:
                    # A failure evaluating one QC field should not halt ingestion
                    continue

    def flag_qc_parameter(self, dataset_id, parameter, temporal_values, configuration):
        data_product_ids, _ = self.container.resource_registry.find_subjects(object=dataset_id, predicate=PRED.hasDataset, subject_type=RT.DataProduct, id_only=True)
        for data_product_id in data_product_ids:
            description = 'Automated Quality Control Alerted on %s' % parameter
            self.qc_publisher.publish_event(origin=data_product_id, qc_parameter=parameter, temporal_values=temporal_values, configuration=configuration, description=description)

    def update_connection_index(self, connection_id, connection_index):
        self.connection_id = connection_id
        try:
            connection_index = int(connection_index)
            self.connection_index = connection_index
        except ValueError:
            pass

    def has_gap(self, connection_id, connection_index):
        if connection_id:
            if not self.connection_id:
                self.update_connection_index(connection_id, connection_index)
                return False
            else:
                if connection_id != self.connection_id:
                    return True
        if connection_index:
            if self.connection_index is None:
                self.update_connection_index(connection_id, connection_index)
                return False
            try:
                connection_index = int(connection_index)
                if connection_index != self.connection_index+1:
                    return True
            except ValueError:
                pass

        return False
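
    # Example of the semantics above: if the last granule seen was tagged
    # (connection_id='conn_a', connection_index=5), then ('conn_a', 6) is contiguous,
    # while ('conn_a', 8) or ('conn_b', 0) is reported as a gap (identifiers are
    # hypothetical).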

    def splice_coverage(self, dataset_id, coverage):
        log.info('Splicing new coverage')
        DatasetManagementService._splice_coverage(dataset_id, coverage)

    @handle_stream_exception()
    def recv_packet(self, msg, stream_route, stream_id):
        ''' receive packet for ingestion '''
        log.trace('received granule for stream %s', stream_id)

        if msg == {}:
            log.error('Received empty message from stream: %s', stream_id)
            return
        # Message validation
        if not isinstance(msg, Granule):
            log.error('Ingestion received a message that is not a granule: %s', msg)
            return


        rdt = RecordDictionaryTool.load_from_granule(msg)
        if rdt is None:
            log.error('Invalid granule (no RDT) for stream %s', stream_id)
            return
        if not len(rdt):
            log.debug('Empty granule for stream %s', stream_id)
            return

        self.persist_or_timeout(stream_id, rdt)

    def persist_or_timeout(self, stream_id, rdt):
        """ retry writing coverage multiple times and eventually time out """
        done = False
        timeout = 2
        start = time.time()
        while not done:
            try:
                self.add_granule(stream_id, rdt)
                done = True
            except:
                log.exception('An issue with coverage, retrying after a bit')
                if (time.time() - start) > MAX_RETRY_TIME: # After an hour just give up
                    dataset_id = self.get_dataset(stream_id)
                    log.error("We're giving up, the coverage needs to be inspected %s", DatasetManagementService._get_coverage_path(dataset_id))
                    raise

                if stream_id in self._coverages:
                    log.info('Popping coverage for stream %s', stream_id)
                    self._coverages.pop(stream_id)

                gevent.sleep(timeout)
                if timeout > (60 * 5):
                    timeout = 60 * 5
                else:
                    timeout *= 2
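
        # Retry behavior: sleeps start at 2 seconds and roughly double on each failure,
        # leveling off at five minutes, until add_granule succeeds or MAX_RETRY_TIME
        # (about an hour, per the comment above) is exceeded.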


    def expand_coverage(self, coverage, elements, stream_id):
        try:
            coverage.insert_timesteps(elements, oob=False)
        except IOError as e:
            log.error("Couldn't insert time steps for coverage: %s",
                      coverage.persistence_dir, exc_info=True)
            try:
                coverage.close()
            finally:
                self._bad_coverages[stream_id] = 1
                raise CorruptionError(e.message)
    
    def get_stored_values(self, lookup_value):
        if not self.new_lookups.empty():
            new_values = self.new_lookups.get()
            self.lookup_docs = new_values + self.lookup_docs
        lookup_value_document_keys = self.lookup_docs
        for key in lookup_value_document_keys:
            try:
                document = self.stored_value_manager.read_value(key)
                if lookup_value in document:
                    return document[lookup_value] 
            except NotFound:
                log.warning('Specified lookup document %s does not exist', key)
        return None


    def fill_lookup_values(self, rdt):
        rdt.fetch_lookup_values()
        for field in rdt.lookup_values():
            value = self.get_stored_values(rdt.context(field).lookup_value)
            if value:
                rdt[field] = value

    def insert_sparse_values(self, coverage, rdt, stream_id):

        self.fill_lookup_values(rdt)
        for field in rdt.fields:
            if rdt._rd[field] is None:
                continue
            if not isinstance(rdt.context(field).param_type, SparseConstantType):
                # We only set sparse values before insert
                continue 
            value = rdt[field]
            try:
                coverage.set_parameter_values(param_name=field, value=value)
            except ValueError as e:
                if "'lower_bound' cannot be >= 'upper_bound'" in e.message:
                    continue
                else:
                    raise
            except IOError as e:
                log.error("Couldn't insert values for coverage: %s",
                          coverage.persistence_dir, exc_info=True)
                try:
                    coverage.close()
                finally:
                    self._bad_coverages[stream_id] = 1
                    raise CorruptionError(e.message)

    def insert_values(self, coverage, rdt, stream_id):
        elements = len(rdt)

        start_index = coverage.num_timesteps - elements
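        # add_granule expands the coverage before calling this method, so the newly
        # appended data occupies the tail of the coverage: tdoa=slice(start_index, None)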

        for k,v in rdt.iteritems():
            if isinstance(v, SparseConstantValue):
                continue
            slice_ = slice(start_index, None)
            try:
                coverage.set_parameter_values(param_name=k, tdoa=slice_, value=v)
            except IOError as e:
                log.error("Couldn't insert values for coverage: %s",
                          coverage.persistence_dir, exc_info=True)
                try:
                    coverage.close()
                finally:
                    self._bad_coverages[stream_id] = 1
                    raise CorruptionError(e.message)
            except IndexError as e:
                log.error("Value set: %s", v[:])
                data_products, _ = self.container.resource_registry.find_subjects(object=stream_id, predicate=PRED.hasStream, subject_type=RT.DataProduct)
                for data_product in data_products:
                    log.exception("Index exception with %s, trying to insert %s into coverage with shape %s", 
                                  data_product.name,
                                  k,
                                  v.shape)

    
        if 'ingestion_timestamp' in coverage.list_parameters():
            t_now = time.time()
            ntp_time = TimeUtils.ts_to_units(coverage.get_parameter_context('ingestion_timestamp').uom, t_now)
            coverage.set_parameter_values(param_name='ingestion_timestamp', tdoa=slice_, value=ntp_time)
    
    def add_granule(self,stream_id, rdt):
        ''' Appends the granule's data to the coverage and persists it. '''
        debugging = log.isEnabledFor(DEBUG)
        timer = Timer() if debugging else None
        if stream_id in self._bad_coverages:
            log.info('Message attempting to be inserted into bad coverage: %s',
                     DatasetManagementService._get_coverage_path(self.get_dataset(stream_id)))
            
        #--------------------------------------------------------------------------------
        # Gap Analysis
        #--------------------------------------------------------------------------------
        if not self.ignore_gaps:
            gap_found = self.has_gap(rdt.connection_id, rdt.connection_index)
            if gap_found:
                log.warning('Gap Found!   New connection: (%s,%s)\tOld Connection: (%s,%s)', rdt.connection_id, rdt.connection_index, self.connection_id, self.connection_index)
                self.gap_coverage(stream_id)



        #--------------------------------------------------------------------------------
        # Coverage determination and appending
        #--------------------------------------------------------------------------------
        dataset_id = self.get_dataset(stream_id)
        if not dataset_id:
            log.error('No dataset could be determined on this stream: %s', stream_id)
            return

        try:
            coverage = self.get_coverage(stream_id)
        except IOError as e:
            log.error("Couldn't open coverage: %s",
                      DatasetManagementService._get_coverage_path(self.get_dataset(stream_id)))
            raise CorruptionError(e.message)

        if debugging:
            path = DatasetManagementService._get_coverage_path(dataset_id)
            log.debug('%s: add_granule stream %s dataset %s coverage %r file %s',
                      self._id, stream_id, dataset_id, coverage, path)

        if not coverage:
            log.error('Could not persist coverage from granule, coverage is None')
            return
        #--------------------------------------------------------------------------------
        # Actual persistence
        #--------------------------------------------------------------------------------

        elements = len(rdt)
        if rdt[rdt.temporal_parameter] is None:
            elements = 0 

        self.insert_sparse_values(coverage,rdt,stream_id)
        
        if debugging:
            timer.complete_step('checks') # lightweight ops, should be zero
        
        self.expand_coverage(coverage, elements, stream_id)
        
        if debugging:
            timer.complete_step('insert')

        self.insert_values(coverage, rdt, stream_id)
        
        if debugging:
            timer.complete_step('keys')
        
        DatasetManagementService._save_coverage(coverage)
        
        if debugging:
            timer.complete_step('save')
        
        start_index = coverage.num_timesteps - elements

        if not self.ignore_gaps and gap_found:
            self.splice_coverage(dataset_id, coverage)

        self.evaluate_qc(rdt, dataset_id)
        
        if debugging:
            timer.complete_step('notify')
            self._add_timing_stats(timer)

        self.update_connection_index(rdt.connection_id, rdt.connection_index)

        self.update_metadata(dataset_id, rdt)
        self.dataset_changed(dataset_id,coverage.num_timesteps,(start_index,start_index+elements))

    def _add_timing_stats(self, timer):
        """ add stats from latest coverage operation to Accumulator and periodically log results """
        self.time_stats.add(timer)
        if self.time_stats.get_count() % REPORT_FREQUENCY > 0:
            return

        if log.isEnabledFor(TRACE):
            # report per step
            for step in 'checks', 'insert', 'keys', 'save', 'notify':
                log.debug('%s step %s times: %s', self._id, step, self.time_stats.to_string(step))
        # report totals
        log.debug('%s total times: %s', self._id, self.time_stats)
Example #18
0
class TransformPrime(TransformDataProcess):
    binding = ['output']
    '''
    Transforms which have an incoming stream and an outgoing stream.

    Parameters:
      process.stream_id      Outgoing stream identifier.
      process.exchange_point Route's exchange point.
      process.routing_key    Route's routing key.
      process.queue_name     Name of the queue to listen on.
      process.routes         Actor for each route: {stream_input_id: {stream_output_id: actor}}
    Either the stream_id or both the exchange_point and routing_key need to be provided.
    '''
    def on_start(self):
        TransformDataProcess.on_start(self)
        self.pubsub_management = PubsubManagementServiceProcessClient(
            process=self)
        self.stored_values = StoredValueManager(self.container)
        self.input_data_product_ids = self.CFG.get_safe(
            'process.input_products', [])
        self.output_data_product_ids = self.CFG.get_safe(
            'process.output_products', [])
        self.lookup_docs = self.CFG.get_safe('process.lookup_docs', [])
        self.new_lookups = Queue()
        self.lookup_monitor = EventSubscriber(
            event_type=OT.ExternalReferencesUpdatedEvent,
            callback=self._add_lookups,
            auto_delete=True)
        self.lookup_monitor.start()

    def on_quit(self):
        self.lookup_monitor.stop()
        TransformDataProcess.on_quit(self)

    def _add_lookups(self, event, *args, **kwargs):
        if event.origin in self.input_data_product_ids + self.output_data_product_ids:
            if isinstance(event.reference_keys, list):
                self.new_lookups.put(event.reference_keys)

    @memoize_lru(100)
    def read_stream_def(self, stream_id):
        return self.pubsub_management.read_stream_definition(
            stream_id=stream_id)

    def recv_packet(self, msg, stream_route, stream_id):
        process_routes = self.CFG.get_safe('process.routes', {})
        for stream_in_id, routes in process_routes.iteritems():
            if stream_id == stream_in_id:
                for stream_out_id, actor in routes.iteritems():
                    if actor is None:
                        rdt_out = self._execute_transform(
                            msg, (stream_in_id, stream_out_id))
                        self.publish(rdt_out.to_granule(), stream_out_id)
                    else:
                        outgoing = self._execute_actor(
                            msg, actor, (stream_in_id, stream_out_id))
                        self.publish(outgoing, stream_out_id)

    def publish(self, msg, stream_out_id):
        publisher = getattr(self, stream_out_id)
        publisher.publish(msg)

    def _load_actor(self, actor):
        '''
        Returns the actor class's callable execute method if it exists; otherwise the
        ImportError or AttributeError is logged and re-raised
        '''
        try:
            module = __import__(actor['module'], fromlist=[''])
        except ImportError:
            log.exception('Actor could not be loaded')
            raise
        try:
            cls = getattr(module, actor['class'])
        except AttributeError:
            log.exception('Module %s does not have class %s', repr(module),
                          actor['class'])
            raise
        try:
            execute = getattr(cls, 'execute')
        except AttributeError:
            log.exception('Actor class does not contain execute method')
            raise
        return execute

    def _execute_actor(self, msg, actor, streams):
        stream_in_id, stream_out_id = streams
        stream_def_out = self.read_stream_def(stream_out_id)
        params = self.CFG.get_safe('process.params', {})
        config = self.CFG.get_safe('process')
        # Configure the outgoing stream definition and run the actor's execute method
        params['stream_def'] = stream_def_out._id
        executor = self._load_actor(actor)
        try:
            rdt_out = executor(msg, None, config, params, None)
        except:
            log.exception('Error running actor for %s', self.id)
            raise
        return rdt_out

    def _merge_pdicts(self, pdict1, pdict2):
        incoming_pdict = ParameterDictionary.load(pdict1)
        outgoing_pdict = ParameterDictionary.load(pdict2)

        merged_pdict = ParameterDictionary()
        for k, v in incoming_pdict.iteritems():
            ordinal, v = v
            if k not in merged_pdict:
                merged_pdict.add_context(v)
        for k, v in outgoing_pdict.iteritems():
            ordinal, v = v
            if k not in merged_pdict:
                merged_pdict.add_context(v)
        return merged_pdict

    def _merge_rdt(self, stream_def_in, stream_def_out):
        incoming_pdict_dump = stream_def_in.parameter_dictionary
        outgoing_pdict_dump = stream_def_out.parameter_dictionary

        merged_pdict = self._merge_pdicts(incoming_pdict_dump,
                                          outgoing_pdict_dump)
        rdt_temp = RecordDictionaryTool(param_dictionary=merged_pdict)
        return rdt_temp

    def _get_lookup_value(self, lookup_value):
        if not self.new_lookups.empty():
            new_values = self.new_lookups.get()
            self.lookup_docs = new_values + self.lookup_docs

        lookup_value_document_keys = self.lookup_docs
        for key in lookup_value_document_keys:
            try:
                document = self.stored_values.read_value(key)
                if lookup_value in document:
                    return document[lookup_value]
            except NotFound:
                log.warning('Specified lookup document %s does not exist', key)

        return None

    def _execute_transform(self, msg, streams):
        stream_in_id, stream_out_id = streams
        stream_def_in = self.read_stream_def(stream_in_id)
        stream_def_out = self.read_stream_def(stream_out_id)

        rdt_temp = self._merge_rdt(stream_def_in, stream_def_out)

        rdt_in = RecordDictionaryTool.load_from_granule(msg)
        for field in rdt_temp.fields:
            if not isinstance(
                    rdt_temp._pdict.get_context(field).param_type,
                    ParameterFunctionType):
                try:
                    rdt_temp[field] = rdt_in[field]
                except KeyError:
                    pass

        rdt_temp.fetch_lookup_values()

        for lookup_field in rdt_temp.lookup_values():
            s = lookup_field
            stored_value = self._get_lookup_value(
                rdt_temp.context(s).lookup_value)
            if stored_value is not None:
                rdt_temp[s] = stored_value

        for field in rdt_temp.fields:
            if isinstance(
                    rdt_temp._pdict.get_context(field).param_type,
                    ParameterFunctionType):
                rdt_temp[field] = rdt_temp[field]

        rdt_out = RecordDictionaryTool(stream_definition_id=stream_def_out._id)

        for field in rdt_out.fields:
            rdt_out[field] = rdt_temp[field]

        return rdt_out
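

# A minimal, hypothetical spawn configuration for TransformPrime (the stream identifiers
# and actor module/class below are placeholders, not real resources):
#
#   config = {
#       'process': {
#           'queue_name': 'transform_prime_queue',
#           'routes': {
#               '<stream_in_id>': {
#                   '<stream_out_id>': None,  # None routes through _execute_transform
#               },
#           },
#           'params': {},
#       },
#   }
#
# Supplying {'module': 'mypkg.transforms', 'class': 'MyActor'} in place of None routes
# the granule through _execute_actor instead.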