def test_ingestion_pause(self):
    ctd_stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
    ingestion_config_id = self.get_ingestion_config()
    self.start_ingestion(ctd_stream_id, dataset_id)
    self.addCleanup(self.stop_ingestion, ctd_stream_id)

    rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
    rdt['time'] = np.arange(10)
    publisher = StandaloneStreamPublisher(ctd_stream_id, route)
    monitor = DatasetMonitor(dataset_id)
    self.addCleanup(monitor.stop)

    # Publish the first granule and wait for it to be ingested.
    publisher.publish(rdt.to_granule())
    self.assertTrue(monitor.wait())
    self.data_retriever.retrieve(dataset_id)

    # Pause the stream: data published now should NOT be ingested.
    self.ingestion_management.pause_data_stream(ctd_stream_id, ingestion_config_id)
    monitor.event.clear()
    rdt['time'] = np.arange(10, 20)
    publisher.publish(rdt.to_granule())
    self.assertFalse(monitor.event.wait(1))

    # Resume the stream: the second batch should now arrive.
    self.ingestion_management.resume_data_stream(ctd_stream_id, ingestion_config_id)
    self.assertTrue(monitor.wait())

    granule = self.data_retriever.retrieve(dataset_id)
    rdt2 = RecordDictionaryTool.load_from_granule(granule)
    np.testing.assert_array_almost_equal(rdt2['time'], np.arange(20))
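# The tests in this file lean on a DatasetMonitor helper (monitor.wait(),
# monitor.event, monitor.stop).  A minimal sketch of such a helper, assuming a
# 'DatasetModified' event published against the dataset id -- the real utility
# class and its event type may differ:
class DatasetMonitorSketch(object):
    def __init__(self, dataset_id):
        self.event = Event()  # gevent.event.Event
        self.es = EventSubscriber(event_type='DatasetModified',  # assumed event type
                                  origin=dataset_id,
                                  callback=self._callback)
        self.es.start()

    def _callback(self, event, *args, **kwargs):
        # Any modification event for this dataset releases waiters.
        self.event.set()

    def wait(self, timeout=30):
        return self.event.wait(timeout)

    def stop(self):
        self.es.stop()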
def test_qc_events(self):
    ph = ParameterHelper(self.dataset_management, self.addCleanup)
    pdict_id = ph.create_qc_pdict()
    stream_def_id = self.pubsub_management.create_stream_definition('qc stream def', parameter_dictionary_id=pdict_id)
    self.addCleanup(self.pubsub_management.delete_stream_definition, stream_def_id)

    stream_id, route = self.pubsub_management.create_stream('qc stream', exchange_point=self.exchange_point_name, stream_definition_id=stream_def_id)
    self.addCleanup(self.pubsub_management.delete_stream, stream_id)

    ingestion_config_id = self.get_ingestion_config()
    dataset_id = self.create_dataset(pdict_id)
    config = DotDict()

    self.ingestion_management.persist_data_stream(stream_id=stream_id, ingestion_configuration_id=ingestion_config_id, dataset_id=dataset_id, config=config)
    self.addCleanup(self.ingestion_management.unpersist_data_stream, stream_id, ingestion_config_id)

    publisher = StandaloneStreamPublisher(stream_id, route)
    rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
    rdt['time'] = np.arange(10)
    rdt['temp'] = np.arange(10) * 3

    verified = Event()
    def verification(event, *args, **kwargs):
        self.assertEquals(event.qc_parameter, 'temp_qc')
        self.assertEquals(event.temporal_value, 7)
        verified.set()

    es = EventSubscriber(event_type=OT.ParameterQCEvent, origin=dataset_id, callback=verification, auto_delete=True)
    es.start()
    self.addCleanup(es.stop)

    publisher.publish(rdt.to_granule())
    self.assertTrue(verified.wait(10))
def create_test_granules(self, buffer_data=False):
    """
    Generate test granules from particles.  If buffer_data is True, particles
    are buffered into as few granules as possible; a particle with the
    new-sequence flag set always starts a new granule.  This method emulates
    the agent_stream_publisher module.
    :return: list of granules generated.
    """
    base_timestamp = 3583861263.0
    connection_index = 0

    particles = []
    particles.append(self.get_particle(base_timestamp,     10.5914, 161.06, 4.1870, 2693.0))
    particles.append(self.get_particle(base_timestamp + 1, 10.5915, 161.07, 4.1871, 2693.1))
    particles.append(self.get_particle(base_timestamp + 2, 10.5916, 161.08, 4.1872, 2693.2))
    particles.append(self.get_particle(base_timestamp + 3, 10.5917, 161.09, 4.1873, 2693.3, True))
    particles.append(self.get_particle(base_timestamp + 4, 10.5918, 161.10, 4.1874, 2693.4))

    data_groups = []
    result_granules = []
    data_groups_index = 0

    for particle in particles:
        # Start a new group when the particle begins a new sequence or when we
        # are not buffering -- but only if the current group is non-empty.
        if (particle.get('new_sequence', False) or not buffer_data) and \
           (len(data_groups) > 0 and len(data_groups[data_groups_index]) > 0):
            data_groups_index += 1

        if len(data_groups) <= data_groups_index:
            data_groups.append([])

        data_groups[data_groups_index].append(particle)

    log.debug("Granules to create: %s", len(data_groups))

    for data in data_groups:
        connection_id = uuid.uuid4()
        connection_index += 1
        rdt = RecordDictionaryTool(param_dictionary=self.get_param_dict())
        rdt = populate_rdt(rdt, data)
        g = rdt.to_granule(data_producer_id='agent_res_id',
                           connection_id=connection_id.hex,
                           connection_index=str(connection_index))
        result_granules.append(g)

    return result_granules
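# Hedged usage sketch for the helper above: with buffer_data=True the five
# particles should collapse into two granules, since only the new-sequence
# particle forces a break; with buffer_data=False every particle gets its own
# granule.  The expected counts follow directly from the grouping loop.
def example_buffered_granule_counts(self):
    self.assertEquals(len(self.create_test_granules(buffer_data=True)), 2)
    self.assertEquals(len(self.create_test_granules(buffer_data=False)), 5)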
def create_granule(self, stream_name, param_dict_name, particle_list):
    pd_id = self.dataset_management.read_parameter_dictionary_by_name(param_dict_name, id_only=True)
    stream_def_id = self.pubsub_client.create_stream_definition(name=stream_name, parameter_dictionary_id=pd_id)
    stream_def = self.pubsub_client.read_stream_definition(stream_def_id)
    rdt = RecordDictionaryTool(stream_definition=stream_def)
    rdt = populate_rdt(rdt, particle_list)
    log.trace("RDT: %s", str(rdt))
    return rdt.to_granule(data_producer_id='fake_agent_id')
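# Hedged usage sketch: populate_rdt() consumes driver particles like those
# shown in the _publish_stream_buffer docstrings below -- a dict with top-level
# timestamps plus a 'values' list of {'value_id': ..., 'value': ...} entries.
# The field values here are illustrative, not real readings.
def example_create_granule(self):
    particle = {
        'driver_timestamp': 3564867788.06,
        'preferred_timestamp': 'driver_timestamp',
        'quality_flag': 'ok',
        'values': [{'value_id': 'temp', 'value': 32.39},
                   {'value_id': 'conductivity', 'value': 91.09},
                   {'value_id': 'pressure', 'value': 784.85}],
    }
    return self.create_granule('parsed', 'ctd_parsed_param_dict', [particle])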
def publish_hifi(self, stream_id, stream_route, offset=0):
    '''
    Publish deterministic data
    '''
    pub = StandaloneStreamPublisher(stream_id, stream_route)
    stream_def = self.pubsub_management.read_stream_definition(stream_id=stream_id)
    stream_def_id = stream_def._id
    rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
    rdt['time'] = np.arange(10) + (offset * 10)
    rdt['temp'] = np.arange(10) + (offset * 10)
    pub.publish(rdt.to_granule())
def test_retrieve_and_transform(self):
    # Stream definition for the CTD data
    pdict_id = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict', id_only=True)
    stream_def_id = self.pubsub_management.create_stream_definition('ctd data', parameter_dictionary_id=pdict_id)
    ctd_stream_id, route = self.pubsub_management.create_stream('ctd stream', 'xp1', stream_definition_id=stream_def_id)

    # Stream definition for the salinity data
    salinity_pdict_id = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict', id_only=True)
    sal_stream_def_id = self.pubsub_management.create_stream_definition('sal data', parameter_dictionary_id=salinity_pdict_id)

    ingest_config_id = self.get_ingestion_config()
    dataset_id = self.create_dataset(pdict_id)

    # Ensure the dataset's datastore exists before persisting the stream
    # (works around a recurring setup issue).
    self.get_datastore(dataset_id)

    self.ingestion_management.persist_data_stream(stream_id=ctd_stream_id, ingestion_configuration_id=ingest_config_id, dataset_id=dataset_id)

    rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
    rdt['time'] = np.arange(10)
    rdt['temp'] = np.random.randn(10) * 10 + 30
    rdt['conductivity'] = np.random.randn(10) * 2 + 10

    publisher = StandaloneStreamPublisher(ctd_stream_id, route)
    publisher.publish(rdt.to_granule())

    rdt['time'] = np.arange(10, 20)
    publisher.publish(rdt.to_granule())

    self.wait_until_we_have_enough_granules(dataset_id, 2)

    granule = self.data_retriever.retrieve(dataset_id,
                                           None,
                                           None,
                                           'ion.processes.data.transforms.ctd.ctd_L2_salinity',
                                           'CTDL2SalinityTransformAlgorithm',
                                           kwargs=dict(params=sal_stream_def_id))
    rdt = RecordDictionaryTool.load_from_granule(granule)
    for i in rdt['salinity']:
        self.assertNotEquals(i, 0)
def _publish_stream_buffer(self, stream_name):
    """
    Flush the buffer for stream_name and publish its contents as one granule.

    Example field lists for the raw and parsed streams:
    ['quality_flag', 'preferred_timestamp', 'port_timestamp', 'lon', 'raw', 'internal_timestamp', 'time', 'lat', 'driver_timestamp']
    ['quality_flag', 'preferred_timestamp', 'temp', 'density', 'port_timestamp', 'lon', 'salinity', 'pressure', 'internal_timestamp', 'time', 'lat', 'driver_timestamp', 'conductivity']

    Example driver particles:
    {"driver_timestamp": 3564867147.743795, "pkt_format_id": "JSON_Data", "pkt_version": 1, "preferred_timestamp": "driver_timestamp", "quality_flag": "ok", "stream_name": "raw", "values": [{"binary": true, "value": "MzIuMzkxOSw5MS4wOTUxMiwgNzg0Ljg1MywgICA2LjE5OTQsIDE1MDUuMTc5LCAxOSBEZWMgMjAxMiwgMDA6NTI6Mjc=", "value_id": "raw"}]}
    {"driver_timestamp": 3564867147.743795, "pkt_format_id": "JSON_Data", "pkt_version": 1, "preferred_timestamp": "driver_timestamp", "quality_flag": "ok", "stream_name": "parsed", "values": [{"value": 32.3919, "value_id": "temp"}, {"value": 91.09512, "value_id": "conductivity"}, {"value": 784.853, "value_id": "pressure"}]}

    Example buffered value dictionaries:
    {'quality_flag': [u'ok'], 'preferred_timestamp': [u'driver_timestamp'], 'port_timestamp': [None], 'lon': [None], 'raw': ['-4.9733,16.02390, 539.527, 34.2719, 1506.862, 19 Dec 2012, 01:03:07'], 'internal_timestamp': [None], 'time': [3564867788.0627117], 'lat': [None], 'driver_timestamp': [3564867788.0627117]}
    {'quality_flag': [u'ok'], 'preferred_timestamp': [u'driver_timestamp'], 'temp': [-4.9733], 'density': [None], 'port_timestamp': [None], 'lon': [None], 'salinity': [None], 'pressure': [539.527], 'internal_timestamp': [None], 'time': [3564867788.0627117], 'lat': [None], 'driver_timestamp': [3564867788.0627117], 'conductivity': [16.0239]}
    """
    try:
        buf_len = len(self._stream_buffers[stream_name])
        if buf_len == 0:
            return

        stream_def = self._stream_defs[stream_name]
        if isinstance(stream_def, str):
            rdt = RecordDictionaryTool(stream_definition_id=stream_def)
        else:
            rdt = RecordDictionaryTool(stream_definition=stream_def)
        publisher = self._publishers[stream_name]

        vals = []
        for x in xrange(buf_len):
            vals.append(self._stream_buffers[stream_name].pop())

        rdt = populate_rdt(rdt, vals)
        log.info('Outgoing granule: %s', ['%s: %s' % (k, v) for k, v in rdt.iteritems()])
        g = rdt.to_granule(data_producer_id=self._agent.resource_id,
                           connection_id=self._connection_ID.hex,
                           connection_index=str(self._connection_index[stream_name]))

        publisher.publish(g)
        log.info('Instrument agent %s published data granule on stream %s.',
                 self._agent._proc_name, stream_name)
        log.info('Connection id: %s, connection index: %i.',
                 self._connection_ID.hex, self._connection_index[stream_name])
        self._connection_index[stream_name] += 1

    except Exception:
        log.exception('Instrument agent %s could not publish data on stream %s.',
                      self._agent._proc_name, stream_name)
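# For reference, a hedged sketch of what populate_rdt() has to do, inferred
# from the manual _publish_stream_buffer variant further below: map each
# particle's 'values' entries onto the RDT fields, b64-decode binary values,
# and fall back to top-level particle keys (the manual variant also mirrors
# driver_timestamp into 'time').  Names here are illustrative only.
def populate_rdt_sketch(rdt, particles):
    arrays = dict((field, [None] * len(particles)) for field in rdt.fields)
    for i, particle in enumerate(particles):
        for key, value in particle.iteritems():
            if key == 'values':
                for val in value:
                    if val['value_id'] in rdt:
                        v = val['value']
                        if val.get('binary', None):
                            v = base64.b64decode(v)
                        arrays[val['value_id']][i] = v
            elif key in rdt:
                arrays[key][i] = value
    for field, values in arrays.iteritems():
        rdt[field] = numpy.array(values)
    return rdt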
def publish_loop(self):
    t_i = 0
    while not self.finished.is_set():
        rdt = RecordDictionaryTool(stream_definition_id=self.stream_def._id)
        rdt['time'] = numpy.arange(10) + t_i * 10
        # Uniform samples in [0, 30): (b - a) * random_sample() + a
        rdt['temp'] = numpy.random.random_sample(10) * (30 - 0) + 0
        self.publish(rdt.to_granule())
        gevent.sleep(self.interval)
        t_i += 1
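# Hedged usage sketch: publish_loop() is designed to run in a greenlet and be
# stopped by setting self.finished.  The attribute names (finished, interval,
# stream_def) are taken from the loop above; the timings are illustrative.
def example_run_publish_loop(self):
    self.finished = Event()
    self.interval = 1.0
    glet = gevent.spawn(self.publish_loop)
    gevent.sleep(5)          # let a few granules go out
    self.finished.set()      # ask the loop to exit
    glet.join(timeout=10)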
def test_retrieve_and_transform(self):
    # Make a simple dataset and start ingestion, pretty standard stuff.
    ctd_stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
    self.start_ingestion(ctd_stream_id, dataset_id)

    # Stream definition for the salinity data
    salinity_pdict_id = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict', id_only=True)
    sal_stream_def_id = self.pubsub_management.create_stream_definition('sal data', parameter_dictionary_id=salinity_pdict_id)

    rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
    rdt['time'] = np.arange(10)
    rdt['temp'] = np.random.randn(10) * 10 + 30
    rdt['conductivity'] = np.random.randn(10) * 2 + 10
    rdt['pressure'] = np.random.randn(10) * 1 + 12

    publisher = StandaloneStreamPublisher(ctd_stream_id, route)
    publisher.publish(rdt.to_granule())

    rdt['time'] = np.arange(10, 20)
    publisher.publish(rdt.to_granule())

    self.wait_until_we_have_enough_granules(dataset_id, 20)

    granule = self.data_retriever.retrieve(dataset_id,
                                           None,
                                           None,
                                           'ion.processes.data.transforms.ctd.ctd_L2_salinity',
                                           'CTDL2SalinityTransformAlgorithm',
                                           kwargs=dict(params=sal_stream_def_id))
    rdt = RecordDictionaryTool.load_from_granule(granule)
    for i in rdt['salinity']:
        self.assertNotEquals(i, 0)

    self.streams.append(ctd_stream_id)
    self.stop_ingestion(ctd_stream_id)
def publish_loop(self):
    while not self.finished.is_set():
        length = 10

        # Explicitly make these numpy arrays.
        c = numpy.array([random.uniform(0.0, 75.0) for i in xrange(length)])
        t = numpy.array([random.uniform(-1.7, 21.0) for i in xrange(length)])
        p = numpy.array([random.lognormvariate(1, 2) for i in xrange(length)])
        lat = numpy.array([random.uniform(-90.0, 90.0) for i in xrange(length)])
        lon = numpy.array([random.uniform(0.0, 360.0) for i in xrange(length)])
        tvar = numpy.array([self.last_time + i for i in xrange(1, length + 1)])
        self.last_time = max(tvar)

        parameter_dictionary = self._create_parameter()
        rdt = RecordDictionaryTool(param_dictionary=parameter_dictionary)

        # This is an example of using groups; it is not a normative statement
        # about how to use groups.
        rdt['temp'] = t
        rdt['conductivity'] = c
        rdt['pressure'] = p

        # A value sequence of fixed-length (32-byte) raw strings.
        with open('/dev/urandom', 'r') as rand:
            rdt['raw_fixed'] = numpy.array([rand.read(32) for i in xrange(length)], dtype='a32')

        # A value sequence of variable-length raw strings; object dtype keeps
        # each byte string intact.
        with open('/dev/urandom', 'r') as rand:
            rdt['raw_blob'] = numpy.array([rand.read(random.randint(1, 40)) for i in xrange(length)], dtype=object)

        rdt['time'] = tvar
        rdt['lat'] = lat
        rdt['lon'] = lon

        g = rdt.to_granule()
        log.info('Sending %d values!', length)
        if isinstance(g, Granule):
            self.publish(g)

        gevent.sleep(self.interval)
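# Note on the two raw-bytes dtypes above: 'a32' stores fixed-width 32-byte
# strings (shorter values are null-padded to the itemsize), while dtype=object
# preserves each variable-length string as-is.  A standalone illustration:
def example_raw_dtypes():
    fixed = numpy.array(['abc'], dtype='a32')        # itemsize is always 32
    blob = numpy.array(['abc', 'de'], dtype=object)  # lengths preserved
    return fixed.itemsize, [len(x) for x in blob]    # -> (32, [3, 2])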
def test_ingestion_gap_analysis(self):
    stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
    self.start_ingestion(stream_id, dataset_id)
    self.addCleanup(self.stop_ingestion, stream_id)

    connection1 = uuid4().hex
    connection2 = uuid4().hex

    rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
    rdt['time'] = [0]
    rdt['temp'] = [0]
    self.publish_and_wait(dataset_id, rdt.to_granule(connection_id=connection1, connection_index='0'))
    rdt['time'] = [1]
    rdt['temp'] = [1]
    self.publish_and_wait(dataset_id, rdt.to_granule(connection_id=connection1, connection_index='1'))
    rdt['time'] = [2]
    rdt['temp'] = [2]
    self.publish_and_wait(dataset_id, rdt.to_granule(connection_id=connection1, connection_index='3'))  # Gap: missed message
    rdt['time'] = [3]
    rdt['temp'] = [3]
    self.publish_and_wait(dataset_id, rdt.to_granule(connection_id=connection2, connection_index='3'))  # Gap: new connection
    rdt['time'] = [4]
    rdt['temp'] = [4]
    self.publish_and_wait(dataset_id, rdt.to_granule(connection_id=connection2, connection_index='4'))
    rdt['time'] = [5]
    rdt['temp'] = [5]
    self.publish_and_wait(dataset_id, rdt.to_granule(connection_id=connection2, connection_index='5'))

    granule = self.data_retriever.retrieve(dataset_id)
    rdt = RecordDictionaryTool.load_from_granule(granule)
    np.testing.assert_array_equal(rdt['time'], np.arange(6))
    np.testing.assert_array_equal(rdt['temp'], np.arange(6))

    return dataset_id
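# The test above relies on a publish_and_wait helper.  A hedged sketch,
# reusing the DatasetMonitor pattern from the other tests in this file; the
# real helper may hold one long-lived monitor and publisher instead, and
# self.publisher here is assumed to be bound to the stream under test.
def publish_and_wait_sketch(self, dataset_id, granule):
    monitor = DatasetMonitor(dataset_id)
    try:
        self.publisher.publish(granule)
        self.assertTrue(monitor.wait())
    finally:
        monitor.stop()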
def publish_loop(self):
    t_i = 0
    while not self.finished.is_set():
        rdt = RecordDictionaryTool(stream_definition_id=self.stream_def._id)
        rdt['time'] = numpy.arange(10) + t_i * 10
        rdt['temp'] = numpy.random.random(10) * 10
        rdt['lat'] = numpy.array([0] * 10)
        rdt['lon'] = numpy.array([0] * 10)
        rdt['conductivity'] = numpy.random.random(10) * 10
        rdt['binary'] = numpy.array(['hi'] * 10, dtype='object')
        self.publish(rdt.to_granule())
        gevent.sleep(self.interval)
        t_i += 1
def test_lookup_values_ingest_replay(self):
    ph = ParameterHelper(self.dataset_management, self.addCleanup)
    pdict_id = ph.create_lookups()
    stream_def_id = self.pubsub_management.create_stream_definition('lookups', parameter_dictionary_id=pdict_id)
    self.addCleanup(self.pubsub_management.delete_stream_definition, stream_def_id)

    stream_id, route = self.pubsub_management.create_stream('example', exchange_point=self.exchange_point_name, stream_definition_id=stream_def_id)
    self.addCleanup(self.pubsub_management.delete_stream, stream_id)

    ingestion_config_id = self.get_ingestion_config()
    dataset_id = self.create_dataset(pdict_id)

    config = DotDict()
    config.process.lookup_docs = ['test1', 'test2']
    self.ingestion_management.persist_data_stream(stream_id=stream_id, ingestion_configuration_id=ingestion_config_id, dataset_id=dataset_id, config=config)
    self.addCleanup(self.ingestion_management.unpersist_data_stream, stream_id, ingestion_config_id)

    stored_value_manager = StoredValueManager(self.container)
    stored_value_manager.stored_value_cas('test1', {'offset_a': 10.0, 'offset_b': 13.1})

    publisher = StandaloneStreamPublisher(stream_id, route)
    rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
    rdt['time'] = np.arange(20)
    rdt['temp'] = [20.0] * 20
    granule = rdt.to_granule()

    dataset_monitor = DatasetMonitor(dataset_id)
    self.addCleanup(dataset_monitor.stop)

    publisher.publish(granule)
    self.assertTrue(dataset_monitor.event.wait(30))

    # 'offset_b' has no backing document yet, so it replays as the fill value.
    replay_granule = self.data_retriever.retrieve(dataset_id)
    rdt_out = RecordDictionaryTool.load_from_granule(replay_granule)
    np.testing.assert_array_almost_equal(rdt_out['time'], np.arange(20))
    np.testing.assert_array_almost_equal(rdt_out['temp'], np.array([20.] * 20))
    np.testing.assert_array_almost_equal(rdt_out['calibrated'], np.array([30.] * 20))
    np.testing.assert_array_equal(rdt_out['offset_b'], np.array([rdt_out.fill_value('offset_b')] * 20))

    rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
    rdt['time'] = np.arange(20, 40)
    rdt['temp'] = [20.0] * 20
    granule = rdt.to_granule()

    dataset_monitor.event.clear()

    # Update the lookup documents, then publish the second batch.
    stored_value_manager.stored_value_cas('test1', {'offset_a': 20.0})
    stored_value_manager.stored_value_cas('coefficient_document', {'offset_b': 10.0})
    gevent.sleep(2)

    publisher.publish(granule)
    self.assertTrue(dataset_monitor.event.wait(30))

    replay_granule = self.data_retriever.retrieve(dataset_id)
    rdt_out = RecordDictionaryTool.load_from_granule(replay_granule)
    np.testing.assert_array_almost_equal(rdt_out['time'], np.arange(40))
    np.testing.assert_array_almost_equal(rdt_out['temp'], np.array([20.] * 40))
    np.testing.assert_array_equal(rdt_out['offset_b'], np.array([10.] * 40))
    np.testing.assert_array_almost_equal(rdt_out['calibrated'], np.array([30.] * 20 + [40.] * 20))
    np.testing.assert_array_almost_equal(rdt_out['calibrated_b'], np.array([40.] * 20 + [50.] * 20))
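# The expected arrays in the assertions above follow from the lookup
# arithmetic -- calibrated = temp + offset_a and calibrated_b = calibrated +
# offset_b, judging by the asserted values.  A standalone check of that
# arithmetic:
def example_lookup_arithmetic():
    temp = np.array([20.0] * 40)
    offset_a = np.array([10.0] * 20 + [20.0] * 20)  # before/after the CAS update
    offset_b = np.array([10.0] * 40)
    calibrated = temp + offset_a                    # -> 30.0 then 40.0
    calibrated_b = calibrated + offset_b            # -> 40.0 then 50.0
    return calibrated, calibrated_b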
def _publish_stream_buffer(self, stream_name):
    """
    Flush the buffer for stream_name and publish its contents as one granule,
    optionally flushing agent state first.  See the first
    _publish_stream_buffer variant above for example particles and value
    dictionaries.
    """
    try:
        # Flush the agent state to the object store.  This was added for the
        # dataset agent publishers, which store their driver state in the
        # object store.  We considered flushing after publication -- grab the
        # current state here, do our work, then save that state -- but
        # flush_state() doesn't accept parameters, so that is more complex
        # than simply flushing here.  The slight downside: if publishing
        # fails, the stored state will be slightly out of sync.
        if self._flush_on_publish:
            log.debug("ASP Flush Agent State")
            self._agent._flush_state()

        buf_len = len(self._stream_buffers[stream_name])
        if buf_len == 0:
            return

        stream_def = self._stream_defs[stream_name]
        if isinstance(stream_def, str):
            rdt = RecordDictionaryTool(stream_definition_id=stream_def)
        else:
            rdt = RecordDictionaryTool(stream_definition=stream_def)
        publisher = self._publishers[stream_name]

        vals = []
        for x in xrange(buf_len):
            vals.append(self._stream_buffers[stream_name].pop())

        rdt = populate_rdt(rdt, vals)
        g = rdt.to_granule(data_producer_id=self._agent.resource_id,
                           connection_id=self._connection_ID.hex,
                           connection_index=str(self._connection_index[stream_name]))
        publisher.publish(g)
        self._connection_index[stream_name] += 1

    except Exception:
        log.exception('Instrument agent %s could not publish data on stream %s.',
                      self._agent._proc_name, stream_name)
def _publish_stream_buffer(self, stream_name):
    """
    Flush the buffer for stream_name and publish its contents as one granule.
    Earlier variant that maps particles onto the RDT by hand; see the first
    _publish_stream_buffer variant above for example particles and value
    dictionaries.
    """
    try:
        buf_len = len(self._stream_buffers[stream_name])
        if buf_len == 0:
            return

        stream_def = self._stream_defs[stream_name]
        rdt = RecordDictionaryTool(stream_definition_id=stream_def)
        publisher = self._publishers[stream_name]

        vals = []
        for x in range(buf_len):
            vals.append(self._stream_buffers[stream_name].pop())

        data_arrays = {}
        for x in rdt.fields:
            data_arrays[x] = [None for y in range(buf_len)]

        for i in range(buf_len):
            tomato = vals[i]
            for (tk, tv) in tomato.iteritems():
                if tk == 'values':
                    for tval_dict in tv:
                        tval_id = tval_dict['value_id']
                        if tval_id in rdt:
                            tval_val = tval_dict['value']
                            if tval_dict.get('binary', None):
                                tval_val = base64.b64decode(tval_val)
                            data_arrays[tval_id][i] = tval_val
                elif tk in rdt:
                    data_arrays[tk][i] = tv
                    if tk == 'driver_timestamp':
                        data_arrays['time'][i] = tv

        for (k, v) in data_arrays.iteritems():
            rdt[k] = numpy.array(v)

        log.info('Outgoing granule: %s', ['%s: %s' % (k, v) for k, v in rdt.iteritems()])
        g = rdt.to_granule(data_producer_id=self._agent.resource_id)
        g.connection_id = self._connection_ID.hex
        g.connection_index = self._connection_index[stream_name]
        publisher.publish(g)
        log.info('Instrument agent %s published data granule on stream %s.',
                 self._agent._proc_name, stream_name)
        log.info('Connection id: %s, connection index: %i.',
                 self._connection_ID.hex, self._connection_index[stream_name])

    except Exception:
        log.exception('Instrument agent %s could not publish data on stream %s.',
                      self._agent._proc_name, stream_name)
    else:
        # Only advance the connection index after a successful publish.
        self._connection_index[stream_name] += 1