def test_replay_integration(self):
    '''
    test_replay_integration

    Round-trip test: publish one SBE37 granule through an active ingestion
    configuration, replay the resulting dataset, and verify the replayed
    granule matches the original (SHA1 checksum, record count, and
    per-field data vectors).
    '''
    import numpy as np # Keep the import it's used in the vector comparison below even though pycharm says its unused.
    cc = self.container
    XP = self.XP
    assertions = self.assertTrue
    ### Every thing below here can be run as a script:
    log.debug('Got it')

    # Service clients for pubsub, ingestion, dataset management and replay.
    pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
    ingestion_management_service = IngestionManagementServiceClient(node=cc.node)
    dataset_management_service = DatasetManagementServiceClient(node=cc.node)
    data_retriever_service = DataRetrieverServiceClient(node=cc.node)

    datastore_name = 'dm_test_replay_integration'

    producer = Publisher(name=(XP,'stream producer'))

    # One ingestion worker, archiving to a SCIDATA-profile Couch datastore
    # plus HDF storage for the binary payloads.
    ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
        exchange_point_id=XP,
        couch_storage=CouchStorage(datastore_name=datastore_name,datastore_profile='SCIDATA'),
        hdf_storage=HdfStorage(),
        number_of_workers=1
    )

    ingestion_management_service.activate_ingestion_configuration(
        ingestion_configuration_id=ingestion_configuration_id
    )

    # Canned SBE37 stream definition; pull out the identifiable ids used by
    # the replay-side assertions below.
    definition = SBE37_CDM_stream_definition()
    data_stream_id = definition.data_stream_id
    encoding_id = definition.identifiables[data_stream_id].encoding_id
    element_count_id = definition.identifiables[data_stream_id].element_count_id

    stream_def_id = pubsub_management_service.create_stream_definition(container=definition)
    stream_id = pubsub_management_service.create_stream(stream_definition_id=stream_def_id)

    # Dataset over the stream, archiving both data and metadata so the
    # replay has something to read back.
    dataset_id = dataset_management_service.create_dataset(
        stream_id=stream_id,
        datastore_name=datastore_name,
        view_name='datasets/dataset_by_id'
    )
    ingestion_management_service.create_dataset_configuration(
        dataset_id=dataset_id,
        archive_data=True,
        archive_metadata=True,
        ingestion_configuration_id = ingestion_configuration_id
    )
    definition.stream_resource_id = stream_id

    # Build the test granule and stash its HDF payload in a temp file so
    # the input vectors can be read back for comparison later.
    packet = _create_packet(definition)
    input_file = FileSystem.mktemp()
    input_file.write(packet.identifiables[data_stream_id].values)
    input_file_path = input_file.name
    input_file.close()

    fields=[
        'conductivity',
        'height',
        'latitude',
        'longitude',
        'pressure',
        'temperature',
        'time'
    ]

    input_vectors = acquire_data([input_file_path],fields , 2).next()

    producer.publish(msg=packet, to_name=(XP,'%s.data' % stream_id))

    replay_id, replay_stream_id = data_retriever_service.define_replay(dataset_id)

    ar = gevent.event.AsyncResult()
    def sub_listen(msg, headers):
        # Runs in the subscriber greenlet: failures surface through the
        # assertions, success is signalled via ar.set(True).
        assertions(isinstance(msg,StreamGranuleContainer),'replayed message is not a granule.')
        hdf_string = msg.identifiables[data_stream_id].values
        sha1 = hashlib.sha1(hdf_string).hexdigest().upper()
        assertions(sha1 == msg.identifiables[encoding_id].sha1,'Checksum failed.')
        assertions(msg.identifiables[element_count_id].value==1, 'record replay count is incorrect %d.' % msg.identifiables[element_count_id].value)
        # Dump the replayed payload to a temp file and compare field by
        # field against the input vectors (elementwise == then .all(),
        # which is why numpy must stay imported above).
        output_file = FileSystem.mktemp()
        output_file.write(msg.identifiables[data_stream_id].values)
        output_file_path = output_file.name
        output_file.close()
        output_vectors = acquire_data([output_file_path],fields,2).next()
        for field in fields:
            comparison = (input_vectors[field]['values']==output_vectors[field]['values'])
            assertions(comparison.all(), 'vector mismatch: %s vs %s' % (input_vectors[field]['values'],output_vectors[field]['values']))
        FileSystem.unlink(output_file_path)
        ar.set(True)

    subscriber = Subscriber(name=(XP,'replay listener'),callback=sub_listen)

    # Listen on the replay stream's data binding in a separate greenlet so
    # start_replay below can proceed.
    g = gevent.Greenlet(subscriber.listen, binding='%s.data' % replay_stream_id)
    g.start()

    data_retriever_service.start_replay(replay_id)

    # Block until sub_listen has completed its checks (10 s timeout).
    ar.get(timeout=10)

    FileSystem.unlink(input_file_path)
def test_usgs_integration(self):
    '''
    test_usgs_integration
    Test full DM Services Integration using usgs

    Launches simulated USGS publishers into an ingestion configuration,
    captures some data, then wires a replay of one dataset into a capture
    transform and queries the dataset bounds.
    '''
    cc = self.container
    assertions = self.assertTrue

    #-----------------------------
    # Copy below here
    #-----------------------------
    # Service clients used throughout the scenario.
    pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
    ingestion_management_service = IngestionManagementServiceClient(node=cc.node)
    dataset_management_service = DatasetManagementServiceClient(node=cc.node)
    data_retriever_service = DataRetrieverServiceClient(node=cc.node)
    transform_management_service = TransformManagementServiceClient(node=cc.node)
    process_dispatcher = ProcessDispatcherServiceClient(node=cc.node)

    process_list = []
    datasets = []

    datastore_name = 'test_usgs_integration'

    #---------------------------
    # Set up ingestion
    #---------------------------
    # Configure ingestion using eight workers, ingesting to test_dm_integration datastore with the SCIDATA profile
    log.debug('Calling create_ingestion_configuration')
    ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
        exchange_point_id='science_data',
        couch_storage=CouchStorage(datastore_name=datastore_name,datastore_profile='SCIDATA'),
        number_of_workers=8
    )
    #
    ingestion_management_service.activate_ingestion_configuration(
        ingestion_configuration_id=ingestion_configuration_id)

    usgs_stream_def = USGS_stream_definition()

    stream_def_id = pubsub_management_service.create_stream_definition(container=usgs_stream_def, name='Junk definition')

    #---------------------------
    # Set up the producers (CTD Simulators)
    #---------------------------
    # Launch five simulated CTD producers
    # NOTE(review): the comment above says five, but the loop launches two
    # (xrange(2)) — confirm which is intended.
    for iteration in xrange(2):
        # Make a stream to output on
        stream_id = pubsub_management_service.create_stream(stream_definition_id=stream_def_id)

        #---------------------------
        # Set up the datasets
        #---------------------------
        dataset_id = dataset_management_service.create_dataset(
            stream_id=stream_id,
            datastore_name=datastore_name,
            view_name='datasets/stream_join_granule'
        )
        # Keep track of the datasets
        datasets.append(dataset_id)

        # Archive both data and metadata for this dataset.
        stream_policy_id = ingestion_management_service.create_dataset_configuration(
            dataset_id = dataset_id,
            archive_data = True,
            archive_metadata = True,
            ingestion_configuration_id = ingestion_configuration_id
        )

        # Define and schedule the USGS publisher process for this stream.
        producer_definition = ProcessDefinition()
        producer_definition.executable = {
            'module':'ion.agents.eoi.handler.usgs_stream_publisher',
            'class':'UsgsPublisher'
        }
        configuration = {
            'process':{
                'stream_id':stream_id,
            }
        }
        procdef_id = process_dispatcher.create_process_definition(process_definition=producer_definition)
        log.debug('LUKE_DEBUG: procdef_id: %s', procdef_id)
        pid = process_dispatcher.schedule_process(process_definition_id=procdef_id, configuration=configuration)

        # Keep track, we'll kill 'em later.
        process_list.append(pid)

    # Get about 4 seconds of data
    time.sleep(4)

    #---------------------------
    # Stop producing data
    #---------------------------
    for process in process_list:
        process_dispatcher.cancel_process(process)

    #----------------------------------------------
    # The replay and the transform, a love story.
    #----------------------------------------------
    # Happy Valentines to the clever coder who catches the above!
    transform_definition = ProcessDefinition()
    transform_definition.executable = {
        'module':'ion.processes.data.transforms.transform_example',
        'class':'TransformCapture'
    }
    transform_definition_id = process_dispatcher.create_process_definition(process_definition=transform_definition)

    dataset_id = datasets.pop() # Just need one for now
    replay_id, stream_id = data_retriever_service.define_replay(dataset_id=dataset_id)

    #--------------------------------------------
    # I'm Selling magazine subscriptions here!
    #--------------------------------------------
    # Subscribe the capture transform's queue to the replay stream.
    subscription = pubsub_management_service.create_subscription(query=StreamQuery(stream_ids=[stream_id]),
        exchange_name='transform_capture_point')

    #--------------------------------------------
    # Start the transform (capture)
    #--------------------------------------------
    transform_id = transform_management_service.create_transform(
        name='capture_transform',
        in_subscription_id=subscription,
        process_definition_id=transform_definition_id
    )

    transform_management_service.activate_transform(transform_id=transform_id)

    #--------------------------------------------
    # BEGIN REPLAY!
    #--------------------------------------------
    data_retriever_service.start_replay(replay_id=replay_id)

    #--------------------------------------------
    # Lets get some boundaries
    #--------------------------------------------
    bounds = dataset_management_service.get_dataset_bounds(dataset_id=dataset_id)
class DataRetrieverServiceIntTest(IonIntegrationTestCase):
    '''
    Integration tests for the Data Retriever service: defining, starting
    and cancelling replays against datasets stored in Couch.
    '''
    def setUp(self):
        super(DataRetrieverServiceIntTest,self).setUp()
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2dm.yml')

        # Datastore used to seed the documents the replays read back.
        self.couch = self.container.datastore_manager.get_datastore('test_data_retriever', profile=DataStore.DS_PROFILE.EXAMPLES)
        self.datastore_name = 'test_data_retriever'

        # Service clients: data retriever, dataset management, resource
        # registry and pubsub.
        self.dr_cli = DataRetrieverServiceClient(node=self.container.node)
        self.dsm_cli = DatasetManagementServiceClient(node=self.container.node)
        self.rr_cli = ResourceRegistryServiceClient(node=self.container.node)
        self.ps_cli = PubsubManagementServiceClient(node=self.container.node)

    def tearDown(self):
        super(DataRetrieverServiceIntTest,self).tearDown()

    @unittest.skipIf(os.getenv('CEI_LAUNCH_TEST', False), 'Skip test while in CEI LAUNCH mode')
    def test_define_replay(self):
        # define_replay should create a replay resource backed by a running
        # container process.
        dataset_id = self.dsm_cli.create_dataset(
            stream_id='12345',
            datastore_name=self.datastore_name,
            view_name='posts/posts_join_comments',
            name='test define replay'
        )
        replay_id, stream_id = self.dr_cli.define_replay(dataset_id=dataset_id)
        replay = self.rr_cli.read(replay_id)

        # Assert that the process was created
        self.assertTrue(self.container.proc_manager.procs[replay.process_id])

        # Clean up the replay process.
        self.dr_cli.cancel_replay(replay_id)

    @unittest.skipIf(os.getenv('CEI_LAUNCH_TEST', False), 'Skip test while in CEI LAUNCH mode')
    def test_cancel_replay(self):
        # cancel_replay should stop the process and delete the resource.
        dataset_id = self.dsm_cli.create_dataset(
            stream_id='12345',
            datastore_name=self.datastore_name,
            view_name='posts/posts_join_comments',
            name='test define replay'
        )
        replay_id, stream_id = self.dr_cli.define_replay(dataset_id=dataset_id)
        replay = self.rr_cli.read(replay_id)

        # Assert that the process was created
        self.assertTrue(self.container.proc_manager.procs[replay.process_id])

        self.dr_cli.cancel_replay(replay_id)

        # assert that the process is no more
        self.assertFalse(replay.process_id in self.container.proc_manager.procs)

        # assert that the resource no longer exists
        with self.assertRaises(NotFound):
            self.rr_cli.read(replay_id)

    @unittest.skipIf(os.getenv('CEI_LAUNCH_TEST', False), 'Skip test while in CEI LAUNCH mode')
    def test_start_replay(self):
        # Seed one blog post, replay the dataset, and verify the post comes
        # back on the replay stream.
        post = BlogPost(title='test blog post', post_id='12345', author=BlogAuthor(name='Jon Doe'), content='this is a blog post', updated=time.strftime("%Y-%m-%dT%H:%M%S-05"))

        dataset_id = self.dsm_cli.create_dataset(
            stream_id='12345',
            datastore_name=self.datastore_name,
            view_name='posts/posts_join_comments',
            name='blog posts test'
        )

        self.couch.create(post)

        replay_id, stream_id = self.dr_cli.define_replay(dataset_id)
        replay = self.rr_cli.read(replay_id)

        # assert that the process was created
        self.assertTrue(self.container.proc_manager.procs[replay.process_id])

        # pattern from Tim G
        ar = gevent.event.AsyncResult()
        def consume(message, headers):
            # First replayed message completes the wait below.
            ar.set(message)

        stream_subscriber = StreamSubscriberRegistrar(process=self.container, node=self.container.node)
        subscriber = stream_subscriber.create_subscriber(exchange_name='test_queue', callback=consume)
        subscriber.start()

        query = StreamQuery(stream_ids=[stream_id])
        subscription_id = self.ps_cli.create_subscription(query=query,exchange_name='test_queue')
        self.ps_cli.activate_subscription(subscription_id)

        self.dr_cli.start_replay(replay_id)
        self.assertEqual(ar.get(timeout=10).post_id,post.post_id)

        subscriber.stop()

    @unittest.skipIf(os.getenv('CEI_LAUNCH_TEST', False), 'Skip test while in CEI LAUNCH mode')
    def test_chop_chop(self):
        # Replay with delivery_format={'chop':True}: a single granule should
        # come back split across multiple messages.
        # Override couch
        self.couch = self.container.datastore_manager.get_datastore(
            ds_name='chopping_block',
            profile=DataStore.DS_PROFILE.SCIDATA
        )
        self.datastore_name = 'chopping_block'
        granule = ctd_stream_packet(
            stream_id='this_is_only_a_test',
            time='12345', #Same combo on my luggage
            create_hdf=False
        )
        self.couch.create(granule)
        log.debug("Granule: %s", granule)

        dataset_id = self.dsm_cli.create_dataset(
            stream_id='this_is_only_a_test',
            datastore_name=self.datastore_name,
            view_name='datasets/dataset_by_id',
            name='sci_data_granule_chop'
        )

        replay_id, stream_id = self.dr_cli.define_replay(
            dataset_id=dataset_id,
            delivery_format={'chop':True}
        )
        replay = self.rr_cli.read(replay_id)
        self.assertTrue(self.container.proc_manager.procs[replay.process_id])

        async_result = gevent.event.AsyncResult()
        def consume(message, headers):
            async_result.set(message)

        stream_subscriber = StreamSubscriberRegistrar(process=self.container, node=self.container.node)
        subscriber = stream_subscriber.create_subscriber(exchange_name = 'chopping_block', callback=consume)
        subscriber.start()

        query = StreamQuery(stream_ids=[stream_id])
        subscription_id = self.ps_cli.create_subscription(query=query, exchange_name='chopping_block')
        self.ps_cli.activate_subscription(subscription_id=subscription_id)

        self.dr_cli.start_replay(replay_id)

        # Expect four chopped messages — presumably matching how the test
        # granule splits; confirm against the replay process's chop logic.
        for fields in xrange(4):
            self.assertTrue(async_result.get(timeout=10))

        subscriber.stop()
        self.dr_cli.cancel_replay(replay_id=replay_id)
def test_blog_ingestion_replay(self):
    '''
    Scrape several blogs into ingestion, then replay the captured posts and
    verify the replayed content and comments match what was ingested.
    Timing-sensitive: relies on fixed sleeps for data to arrive.
    '''
    #-----------------------------------------------------------------------------------------------------
    # Do this statement just once in your script
    #-----------------------------------------------------------------------------------------------------
    cc = self.container

    #-------------------------------------------------------------------------------------------------------
    # Make a registrar object - this is work usually done for you by the container in a transform or data stream process
    #-------------------------------------------------------------------------------------------------------
    subscriber_registrar = StreamSubscriberRegistrar(process=cc, node=cc.node)

    #-----------------------------------------------------------------------------------------------------
    # Service clients
    #-----------------------------------------------------------------------------------------------------
    ingestion_cli = IngestionManagementServiceClient(node=cc.node)
    dr_cli = DataRetrieverServiceClient(node=cc.node)
    dsm_cli = DatasetManagementServiceClient(node=cc.node)
    pubsub_cli = PubsubManagementServiceClient(node=cc.node)

    #-------------------------------------------------------------------------------------------------------
    # Create and activate ingestion configuration
    #-------------------------------------------------------------------------------------------------------
    ingestion_configuration_id = ingestion_cli.create_ingestion_configuration(
        exchange_point_id='science_data',
        couch_storage=CouchStorage(datastore_name='dm_datastore',datastore_profile='EXAMPLES'),
        hdf_storage=HdfStorage(),
        number_of_workers=6,
    )
    # activates the transforms... so bindings will be created in this step
    ingestion_cli.activate_ingestion_configuration(ingestion_configuration_id)

    #------------------------------------------------------------------------------------------------------
    # Create subscriber to listen to the messages published to the ingestion
    #------------------------------------------------------------------------------------------------------
    # Define the query we want
    query = ExchangeQuery()

    # Create the stateful listener to hold the captured data for comparison with replay
    captured_input = BlogListener()

    # Make a subscription to the input stream to ingestion
    subscription_id = pubsub_cli.create_subscription(query = query, exchange_name='input_capture_queue' ,name = 'input_capture_queue')

    # It is not required or even generally a good idea to use the subscription resource name as the queue name, but it makes things simple here
    # Normally the container creates and starts subscribers for you when a transform process is spawned
    subscriber = subscriber_registrar.create_subscriber(exchange_name='input_capture_queue', callback=captured_input.blog_store)
    subscriber.start()
    captured_input.subscriber = subscriber

    pubsub_cli.activate_subscription(subscription_id)

    #-------------------------------------------------------------------------------------------------------
    # Launching blog scraper
    #-------------------------------------------------------------------------------------------------------
    blogs = [
        'saintsandspinners',
        'strobist',
        'voodoofunk'
    ]

    log.debug('before spawning blog scraper')

    # One FeedStreamer process per blog feeds data into the exchange.
    for blog in blogs:
        config = {'process':{'type':'stream_process','blog':blog}}
        cc.spawn_process(name=blog,
            module='ion.services.dm.ingestion.example.blog_scraper',
            cls='FeedStreamer',
            config=config)

    # wait ten seconds for some data to come in...
    log.warn('Sleeping for 10 seconds to wait for some input')
    time.sleep(10)

    #------------------------------------------------------------------------------------------------------
    # For 3 posts captured, make 3 replays and verify we get back what came in
    #------------------------------------------------------------------------------------------------------

    # Cute list comprehension method does not give enough control
    #self.assertTrue(len(captured_input.blogs)>3)
    #post_ids = [id for idx, id in enumerate(captured_input.blogs.iterkeys()) if idx < 3]

    # Pick posts that have at least 3 comments, stopping after 4 post ids.
    post_ids = []
    for post_id, blog in captured_input.blogs.iteritems(): # Use items not iter items - I copy of fixed length
        log.info('Captured Input: %s' % post_id)
        if len(blog.get('comments',[])) > 2:
            post_ids.append(post_id)
        if len(post_ids) >3:
            break

    ###=======================================================
    ### This section is not scriptable
    ###=======================================================

    if len(post_ids) < 3:
        self.fail('Not enough comments returned by the blog scrappers in 30 seconds')

    if len(captured_input.blogs) < 1:
        self.fail('No data returned in ten seconds by the blog scrappers!')

    ###=======================================================
    ### End non-scriptable
    ###=======================================================

    #------------------------------------------------------------------------------------------------------
    # Create subscriber to listen to the replays
    #------------------------------------------------------------------------------------------------------

    captured_replays = {}

    for idx, post_id in enumerate(post_ids):
        # Create the stateful listener to hold the captured data for comparison with replay
        dataset_id = dsm_cli.create_dataset(
            stream_id=post_id,
            datastore_name='dm_datastore',
            view_name='posts/posts_join_comments')

        replay_id, stream_id =dr_cli.define_replay(dataset_id)

        query = StreamQuery(stream_ids=[stream_id])

        captured_replay = BlogListener()

        #------------------------------------------------------------------------------------------------------
        # Create subscriber to listen to the messages published to the ingestion
        #------------------------------------------------------------------------------------------------------

        # Make a subscription to the input stream to ingestion
        subscription_name = 'replay_capture_queue_%d' % idx
        subscription_id = pubsub_cli.create_subscription(query = query, exchange_name=subscription_name ,name = subscription_name)

        # It is not required or even generally a good idea to use the subscription resource name as the queue name, but it makes things simple here
        # Normally the container creates and starts subscribers for you when a transform process is spawned
        subscriber = subscriber_registrar.create_subscriber(exchange_name=subscription_name, callback=captured_replay.blog_store)
        subscriber.start()
        captured_replay.subscriber = subscriber

        pubsub_cli.activate_subscription(subscription_id)

        #------------------------------------------------------------------------------------------------------
        # Start the replay and listen to the results!
        #------------------------------------------------------------------------------------------------------
        dr_cli.start_replay(replay_id)

        captured_replays[post_id] = captured_replay

    ###=======================================================
    ### The rest is not scriptable
    ###=======================================================

    # wait five seconds for some data to come in...
    log.warn('Sleeping for 5 seconds to wait for some output')
    time.sleep(5)

    # Compare each replayed blog against the originally captured input.
    matched_comments={}
    for post_id, captured_replay in captured_replays.iteritems():

        # There should be only one blog in here!
        self.assertEqual(len(captured_replay.blogs),1)

        replayed_blog = captured_replay.blogs[post_id]

        input_blog = captured_input.blogs[post_id]

        self.assertEqual(replayed_blog['post'].content, input_blog['post'].content)

        # can't deterministically assert that the number of comments is the same...
        matched_comments[post_id] = 0

        # Every replayed comment must exist in the input; count the matches.
        for updated, comment in replayed_blog.get('comments',{}).iteritems():
            self.assertIn(updated, input_blog['comments'])
            matched_comments[post_id] += 1

    # Assert that we got some comments back!
    self.assertTrue(sum(matched_comments.values()) > 0)

    log.info('Matched comments on the following blogs: %s' % matched_comments)
class DMCollaborationIntTest(IonIntegrationTestCase):
    '''
    Integration test wiring an example data producer through ingestion and
    verifying the archived data can be replayed to a subscriber.
    '''
    def setUp(self):
        # Start a container with the ingestion and replay process
        # implementations overridden to the worker_a / process_a variants.
        self._start_container()
        config = DotDict()
        config.bootstrap.processes.ingestion.module = 'ion.processes.data.ingestion.ingestion_worker_a'
        config.bootstrap.processes.replay.module = 'ion.processes.data.replay.replay_process_a'
        self.container.start_rel_from_url('res/deploy/r2dm.yml', config)

        self.datastore_name = 'test_datasets'

        # Service clients used by the test.
        self.pubsub_management = PubsubManagementServiceClient()
        self.ingestion_management = IngestionManagementServiceClient()
        self.dataset_management = DatasetManagementServiceClient()
        self.process_dispatcher = ProcessDispatcherServiceClient()
        self.data_retriever = DataRetrieverServiceClient()

    def subscriber_action(self, msg, header):
        '''
        Subscriber callback: count replayed messages and fire async_done
        once at least two have arrived. State is initialized lazily so the
        callback is safe even if the test did not set it up first.
        '''
        if not hasattr(self, 'received'):
            self.received = 0
        if not hasattr(self, 'async_done'):
            self.async_done = AsyncResult()
        self.received += 1
        if self.received >= 2:
            self.async_done.set(True)

    def test_ingest_to_replay(self):
        self.async_done = AsyncResult()
        sysname = get_sys_name()
        datastore = self.container.datastore_manager.get_datastore(self.datastore_name, 'SCIDATA')

        # Process definition for the example producer that publishes data.
        producer_definition = ProcessDefinition(name='Example Data Producer')
        producer_definition.executable = {
            'module':'ion.processes.data.example_data_producer',
            'class' :'ExampleDataProducer'
        }

        process_definition_id = self.process_dispatcher.create_process_definition(process_definition=producer_definition)

        # Single ingestion worker archiving to the SCIDATA datastore.
        ingestion_configuration_id = self.ingestion_management.create_ingestion_configuration(
            exchange_point_id = 'science_data',
            couch_storage=CouchStorage(datastore_name=self.datastore_name,datastore_profile='SCIDATA'),
            number_of_workers=1
        )
        self.ingestion_management.activate_ingestion_configuration(
            ingestion_configuration_id=ingestion_configuration_id)

        stream_id = self.pubsub_management.create_stream(name='data stream')
        dataset_id = self.dataset_management.create_dataset(
            stream_id = stream_id,
            datastore_name = self.datastore_name,
        )
        self.ingestion_management.create_dataset_configuration(
            dataset_id = dataset_id,
            archive_data = True,
            archive_metadata = True,
            ingestion_configuration_id = ingestion_configuration_id
        )

        # Launch the producer on the stream.
        configuration = {
            'process': {
                'stream_id' : stream_id
            }
        }
        self.process_dispatcher.schedule_process(process_definition_id, configuration=configuration)

        # NOTE: define_replay rebinds stream_id to the *replay* stream.
        replay_id, stream_id = self.data_retriever.define_replay(dataset_id = dataset_id)

        subscriber = Subscriber(name=('%s.science_data' % sysname, 'test_queue'), callback=self.subscriber_action, binding='%s.data' % stream_id)
        gevent.spawn(subscriber.listen)

        # Poll until at least two granules have been archived. Sleep between
        # polls so other greenlets (ingestion workers, the subscriber) get
        # scheduled and the datastore is not hammered by a busy-wait.
        done = False
        while not done:
            results = datastore.query_view('manifest/by_dataset')
            if len(results) >= 2:
                done = True
            else:
                gevent.sleep(0.5)

        self.data_retriever.start_replay(replay_id)
        # subscriber_action sets async_done after two replayed messages.
        self.async_done.get(timeout=10)
def test_replay_integration(self):
    '''
    test_replay_integration

    Publish one SBE37 granule into ingestion, replay the dataset, and check
    the replayed granule against the original: SHA1 checksum, record count,
    and per-field value vectors.
    '''
    import numpy as np # Keep the import it's used in the vector comparison below even though pycharm says its unused.
    cc = self.container
    XP = self.XP
    assertions = self.assertTrue
    ### Every thing below here can be run as a script:
    log.debug('Got it')

    # Service clients.
    pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
    ingestion_management_service = IngestionManagementServiceClient(node=cc.node)
    dataset_management_service = DatasetManagementServiceClient(node=cc.node)
    data_retriever_service = DataRetrieverServiceClient(node=cc.node)

    datastore_name = 'dm_test_replay_integration'

    producer = Publisher(name=(XP, 'stream producer'))

    # One worker, Couch (SCIDATA profile) plus HDF storage.
    ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
        exchange_point_id=XP,
        couch_storage=CouchStorage(datastore_name=datastore_name, datastore_profile='SCIDATA'),
        hdf_storage=HdfStorage(),
        number_of_workers=1)

    ingestion_management_service.activate_ingestion_configuration(
        ingestion_configuration_id=ingestion_configuration_id)

    # Identifiable ids needed by the replay-side assertions.
    definition = SBE37_CDM_stream_definition()
    data_stream_id = definition.data_stream_id
    encoding_id = definition.identifiables[data_stream_id].encoding_id
    element_count_id = definition.identifiables[data_stream_id].element_count_id

    stream_def_id = pubsub_management_service.create_stream_definition(
        container=definition)
    stream_id = pubsub_management_service.create_stream(
        stream_definition_id=stream_def_id)

    dataset_id = dataset_management_service.create_dataset(
        stream_id=stream_id,
        datastore_name=datastore_name,
        view_name='datasets/dataset_by_id')
    ingestion_management_service.create_dataset_configuration(
        dataset_id=dataset_id,
        archive_data=True,
        archive_metadata=True,
        ingestion_configuration_id=ingestion_configuration_id)
    definition.stream_resource_id = stream_id

    # Write the granule's HDF payload to a temp file so its vectors can be
    # read back for the comparison below.
    packet = _create_packet(definition)
    input_file = FileSystem.mktemp()
    input_file.write(packet.identifiables[data_stream_id].values)
    input_file_path = input_file.name
    input_file.close()

    fields = [
        'conductivity',
        'height',
        'latitude',
        'longitude',
        'pressure',
        'temperature',
        'time'
    ]

    input_vectors = acquire_data([input_file_path], fields, 2).next()

    producer.publish(msg=packet, to_name=(XP, '%s.data' % stream_id))

    replay_id, replay_stream_id = data_retriever_service.define_replay(dataset_id)

    ar = gevent.event.AsyncResult()

    def sub_listen(msg, headers):
        # Subscriber callback: validate the replayed granule, then signal
        # completion through the AsyncResult.
        assertions(isinstance(msg, StreamGranuleContainer), 'replayed message is not a granule.')
        hdf_string = msg.identifiables[data_stream_id].values
        sha1 = hashlib.sha1(hdf_string).hexdigest().upper()
        assertions(sha1 == msg.identifiables[encoding_id].sha1, 'Checksum failed.')
        assertions(
            msg.identifiables[element_count_id].value == 1,
            'record replay count is incorrect %d.' % msg.identifiables[element_count_id].value)
        # Field-by-field comparison via elementwise == and .all() — this is
        # why numpy must stay imported at the top.
        output_file = FileSystem.mktemp()
        output_file.write(msg.identifiables[data_stream_id].values)
        output_file_path = output_file.name
        output_file.close()
        output_vectors = acquire_data([output_file_path], fields, 2).next()
        for field in fields:
            comparison = (input_vectors[field]['values'] == output_vectors[field]['values'])
            assertions(
                comparison.all(),
                'vector mismatch: %s vs %s' % (input_vectors[field]['values'], output_vectors[field]['values']))
        FileSystem.unlink(output_file_path)
        ar.set(True)

    subscriber = Subscriber(name=(XP, 'replay listener'), callback=sub_listen)

    # Listen on the replay stream's binding in its own greenlet.
    g = gevent.Greenlet(subscriber.listen, binding='%s.data' % replay_stream_id)
    g.start()

    data_retriever_service.start_replay(replay_id)

    # Wait (10 s) for sub_listen to complete its checks.
    ar.get(timeout=10)

    FileSystem.unlink(input_file_path)
def test_usgs_integration(self):
    '''
    test_usgs_integration
    Test full DM Services Integration using usgs

    Launches simulated USGS publishers into ingestion, captures data, then
    replays one dataset through a capture transform and reads its bounds.
    '''
    cc = self.container
    assertions = self.assertTrue

    #-----------------------------
    # Copy below here
    #-----------------------------
    # Service clients used throughout the scenario.
    pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
    ingestion_management_service = IngestionManagementServiceClient(node=cc.node)
    dataset_management_service = DatasetManagementServiceClient(node=cc.node)
    data_retriever_service = DataRetrieverServiceClient(node=cc.node)
    transform_management_service = TransformManagementServiceClient(node=cc.node)
    process_dispatcher = ProcessDispatcherServiceClient(node=cc.node)

    process_list = []
    datasets = []

    datastore_name = 'test_usgs_integration'

    #---------------------------
    # Set up ingestion
    #---------------------------
    # Configure ingestion using eight workers, ingesting to test_dm_integration datastore with the SCIDATA profile
    log.debug('Calling create_ingestion_configuration')
    ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
        exchange_point_id='science_data',
        couch_storage=CouchStorage(datastore_name=datastore_name,datastore_profile='SCIDATA'),
        number_of_workers=8
    )
    #
    ingestion_management_service.activate_ingestion_configuration(
        ingestion_configuration_id=ingestion_configuration_id)

    usgs_stream_def = USGS_stream_definition()

    stream_def_id = pubsub_management_service.create_stream_definition(container=usgs_stream_def, name='Junk definition')

    #---------------------------
    # Set up the producers (CTD Simulators)
    #---------------------------
    # Launch five simulated CTD producers
    # NOTE(review): the comment above says five, but the loop launches two
    # (xrange(2)) — confirm which is intended.
    for iteration in xrange(2):
        # Make a stream to output on
        stream_id = pubsub_management_service.create_stream(stream_definition_id=stream_def_id)

        #---------------------------
        # Set up the datasets
        #---------------------------
        dataset_id = dataset_management_service.create_dataset(
            stream_id=stream_id,
            datastore_name=datastore_name,
            view_name='datasets/stream_join_granule'
        )
        # Keep track of the datasets
        datasets.append(dataset_id)

        # Archive both data and metadata for this dataset.
        stream_policy_id = ingestion_management_service.create_dataset_configuration(
            dataset_id = dataset_id,
            archive_data = True,
            archive_metadata = True,
            ingestion_configuration_id = ingestion_configuration_id
        )

        # Define and schedule the USGS publisher process for this stream.
        # NOTE(review): this copy uses module 'eoi.agent.handler...' while
        # the sibling copy of this test uses 'ion.agents.eoi.handler...' —
        # confirm which module path is current.
        producer_definition = ProcessDefinition()
        producer_definition.executable = {
            'module':'eoi.agent.handler.usgs_stream_publisher',
            'class':'UsgsPublisher'
        }
        configuration = {
            'process':{
                'stream_id':stream_id,
            }
        }
        procdef_id = process_dispatcher.create_process_definition(process_definition=producer_definition)
        log.debug('LUKE_DEBUG: procdef_id: %s', procdef_id)
        pid = process_dispatcher.schedule_process(process_definition_id=procdef_id, configuration=configuration)

        # Keep track, we'll kill 'em later.
        process_list.append(pid)

    # Get about 4 seconds of data
    time.sleep(4)

    #---------------------------
    # Stop producing data
    #---------------------------
    for process in process_list:
        process_dispatcher.cancel_process(process)

    #----------------------------------------------
    # The replay and the transform, a love story.
    #----------------------------------------------
    # Happy Valentines to the clever coder who catches the above!
    transform_definition = ProcessDefinition()
    transform_definition.executable = {
        'module':'ion.processes.data.transforms.transform_example',
        'class':'TransformCapture'
    }
    transform_definition_id = process_dispatcher.create_process_definition(process_definition=transform_definition)

    dataset_id = datasets.pop() # Just need one for now
    replay_id, stream_id = data_retriever_service.define_replay(dataset_id=dataset_id)

    #--------------------------------------------
    # I'm Selling magazine subscriptions here!
    #--------------------------------------------
    # Subscribe the capture transform's queue to the replay stream.
    subscription = pubsub_management_service.create_subscription(query=StreamQuery(stream_ids=[stream_id]),
        exchange_name='transform_capture_point')

    #--------------------------------------------
    # Start the transform (capture)
    #--------------------------------------------
    transform_id = transform_management_service.create_transform(
        name='capture_transform',
        in_subscription_id=subscription,
        process_definition_id=transform_definition_id
    )

    transform_management_service.activate_transform(transform_id=transform_id)

    #--------------------------------------------
    # BEGIN REPLAY!
    #--------------------------------------------
    data_retriever_service.start_replay(replay_id=replay_id)

    #--------------------------------------------
    # Lets get some boundaries
    #--------------------------------------------
    bounds = dataset_management_service.get_dataset_bounds(dataset_id=dataset_id)
def test_replay_integration(self):
    '''
    Test full DM Services Integration
    '''
    cc = self.container

    ### Every thing below here can be run as a script:

    # Service clients
    pubsub_cli = PubsubManagementServiceClient(node=cc.node)
    ingestion_cli = IngestionManagementServiceClient(node=cc.node)
    dataset_cli = DatasetManagementServiceClient(node=cc.node)
    retriever_cli = DataRetrieverServiceClient(node=cc.node)
    registry_cli = ResourceRegistryServiceClient(node=cc.node)

    # Datastore name
    datastore_name = 'test_replay_integration'

    # Spawn a dummy process to own the publisher endpoint
    pid = cc.spawn_process(name='dummy_process_for_test', module='pyon.ion.process', cls='SimpleProcess', config={})
    dummy_process = cc.proc_manager.procs[pid]

    # Normally the user does not see or create the publisher, this is part of
    # the containers business. For the test we need to set it up explicitly.
    publisher_registrar = StreamPublisherRegistrar(process=dummy_process, node=cc.node)
    subscriber_registrar = StreamSubscriberRegistrar(process=cc, node=cc.node)

    # Configure ingestion: one worker, couch storage with the SCIDATA profile
    ingestion_configuration_id = ingestion_cli.create_ingestion_configuration(
        exchange_point_id='science_data',
        couch_storage=CouchStorage(datastore_name=datastore_name, datastore_profile='SCIDATA'),
        hdf_storage=HdfStorage(),
        number_of_workers=1,
    )
    ingestion_cli.activate_ingestion_configuration(
        ingestion_configuration_id=ingestion_configuration_id)

    # Grab the transforms acting as ingestion workers
    associations = registry_cli.find_associations(ingestion_configuration_id, PRED.hasTransform)
    ingestion_workers = [registry_cli.read(assoc.o) for assoc in associations]
    worker_proc = cc.proc_manager.procs[ingestion_workers[0].process_id]
    log.info("PROCESS 1: %s", str(worker_proc))

    # Test hook: capture whatever the ingestion worker receives.
    # NOTE(review): `ar` (and `ar2` below) look like module-level AsyncResults
    # shared with `_subscriber_call_back` -- confirm they are defined at module scope.
    def ingestion_worker_received(message, headers):
        ar.set(message)
    worker_proc.ingest_process_test_hook = ingestion_worker_received

    # Set up the producer (CTD simulator) stream
    ctd_definition = ctd_stream_definition()
    stream_def_id = pubsub_cli.create_stream_definition(container=ctd_definition, name='Junk definition')
    stream_id = pubsub_cli.create_stream(stream_definition_id=stream_def_id)

    # Dataset + dataset config so granules on the stream get persisted
    dataset_id = dataset_cli.create_dataset(
        stream_id=stream_id,
        datastore_name=datastore_name,
        view_name='datasets/stream_join_granule'
    )
    dataset_config_id = ingestion_cli.create_dataset_configuration(
        dataset_id=dataset_id,
        archive_data=True,
        archive_metadata=True,
        ingestion_configuration_id=ingestion_configuration_id
    )

    # Launch a ctd publisher, create one packet and publish it
    publisher = publisher_registrar.create_publisher(stream_id=stream_id)
    ctd_packet = _create_packet(stream_id)
    published_hdfstring = ctd_packet.identifiables['ctd_data'].values
    publisher.publish(ctd_packet)

    # Catch what the ingestion worker gets! Assert below that it is the same
    # packet that was published!
    packet = ar.get(timeout=2)

    # Create subscriber to listen to the replays
    replay_id, replay_stream_id = retriever_cli.define_replay(dataset_id)
    replay_query = StreamQuery(stream_ids=[replay_stream_id])
    # It is not required or even generally a good idea to use the subscription
    # resource name as the queue name, but it makes things simple here.
    subscription_id = pubsub_cli.create_subscription(query=replay_query, exchange_name='replay_capture_point', name='replay_capture_point')
    # Normally the container creates and starts subscribers for you when a
    # transform process is spawned; do it by hand for the test.
    subscriber = subscriber_registrar.create_subscriber(exchange_name='replay_capture_point', callback=_subscriber_call_back)
    subscriber.start()
    pubsub_cli.activate_subscription(subscription_id)

    # Start the replay and get the hdf string captured from the replay stream
    retriever_cli.start_replay(replay_id)
    retrieved_hdf_string = ar2.get(timeout=2)

    ### Non scriptable portion of the test

    # The ingested packet must be the one we published, and the replayed
    # bytes must round-trip exactly.
    self.assertEquals(packet.identifiables['stream_encoding'].sha1, ctd_packet.identifiables['stream_encoding'].sha1)
    self.assertEquals(retrieved_hdf_string, published_hdfstring)
class TestDMEnd2End(IonIntegrationTestCase):
    """End-to-end integration tests for the DM services: ingestion and
    persistence of stream granules, and their retrieval/replay through the
    data retriever service."""

    def setUp(self):  # Love the non pep-8 convention
        self._start_container()
        self.container.start_rel_from_url("res/deploy/r2deploy.yml")

        # Service clients used throughout the tests
        self.process_dispatcher = ProcessDispatcherServiceClient()
        self.pubsub_management = PubsubManagementServiceClient()
        self.resource_registry = ResourceRegistryServiceClient()
        self.dataset_management = DatasetManagementServiceClient()
        self.ingestion_management = IngestionManagementServiceClient()
        self.data_retriever = DataRetrieverServiceClient()

        self.pids = []        # producer process ids, cancelled in tearDown
        self.event = Event()  # set by validate_granule_subscription
        self.exchange_space_name = "test_granules"
        self.exchange_point_name = "science_data"

        self.purge_queues()

    def purge_queues(self):
        """Drain the ingestion queue so one test's leftovers can't leak into the next."""
        xn = self.container.ex_manager.create_xn_queue("science_granule_ingestion")
        xn.purge()

    def tearDown(self):
        self.purge_queues()
        for pid in self.pids:
            self.process_dispatcher.cancel_process(pid)
        IngestionManagementIntTest.clean_subscriptions()

    def launch_producer(self, stream_id=""):
        """Spawn an example data-producer process publishing on *stream_id*.

        The pid is recorded in self.pids so tearDown can cancel it.
        """
        # --------------------------------------------------------------------------------
        # Create the process definition for the producer
        # --------------------------------------------------------------------------------
        producer_definition = ProcessDefinition(name="Example Data Producer")
        producer_definition.executable = {
            "module": "ion.processes.data.example_data_producer",
            "class": "BetterDataProducer",
        }

        process_definition_id = self.process_dispatcher.create_process_definition(
            process_definition=producer_definition
        )

        # --------------------------------------------------------------------------------
        # Launch the producer
        # --------------------------------------------------------------------------------
        config = DotDict()
        config.process.stream_id = stream_id
        pid = self.process_dispatcher.schedule_process(
            process_definition_id=process_definition_id, configuration=config
        )
        self.pids.append(pid)

    def get_ingestion_config(self):
        """Return the id of the ingestion configuration created by the bootstrap
        service, which is configured through r2deploy.yml."""
        ingest_configs, _ = self.resource_registry.find_resources(
            restype=RT.IngestionConfiguration, id_only=True
        )
        return ingest_configs[0]

    def publish_hifi(self, stream_id, offset=0):
        """Publish one 10-sample granule; *offset* shifts time/temp values so
        successive calls produce contiguous data."""
        pub = SimpleStreamPublisher.new_publisher(self.container, self.exchange_point_name, stream_id)

        black_box = CoverageCraft()
        black_box.rdt["time"] = np.arange(10) + (offset * 10)
        black_box.rdt["temp"] = (np.arange(10) + (offset * 10)) * 2
        granule = black_box.to_granule()
        pub.publish(granule)

    def publish_fake_data(self, stream_id):
        """Publish four contiguous granules (40 samples total) on the stream."""
        for i in xrange(4):
            self.publish_hifi(stream_id, i)

    def get_datastore(self, dataset_id):
        """Return the dataset's datastore (forcing its creation, which also
        works around a race between the services and the ingestion process)."""
        dataset = self.dataset_management.read_dataset(dataset_id)
        datastore_name = dataset.datastore_name
        datastore = self.container.datastore_manager.get_datastore(
            datastore_name, DataStore.DS_PROFILE.SCIDATA
        )
        return datastore

    def validate_granule_subscription(self, msg, header):
        """Subscriber callback: ignore empty control messages, assert the
        payload is a Granule, then signal the waiting test via self.event."""
        if msg == {}:
            return
        self.assertIsInstance(msg, Granule, "Message is improperly formatted. (%s)" % type(msg))
        self.event.set()

    def wait_until_we_have_enough_granules(self, dataset_id="", granules=4):
        """Poll the dataset's datastore until at least *granules* granules are
        visible, raising Timeout after roughly 10 seconds."""
        datastore = self.get_datastore(dataset_id)
        dataset = self.dataset_management.read_dataset(dataset_id)

        now = time.time()
        timeout = now + 10
        done = False
        while not done:
            if now >= timeout:
                raise Timeout("Granules are not populating in time.")
            if len(datastore.query_view(dataset.view_name)) >= granules:
                done = True
            else:
                # Fix: the original looped with no pause, busy-waiting and
                # hammering the datastore with queries.  Back off briefly
                # between polls (the container monkey-patches time.sleep to
                # yield to other greenlets -- TODO confirm).
                time.sleep(0.2)
            now = time.time()

    def create_dataset(self):
        """Create and return a dataset resource using CoverageCraft's default
        domains and parameter dictionary."""
        craft = CoverageCraft
        sdom, tdom = craft.create_domains()
        sdom = sdom.dump()
        tdom = tdom.dump()
        pdict = craft.create_parameters()
        pdict = pdict.dump()

        dataset_id = self.dataset_management.create_dataset(
            "test_dataset", parameter_dict=pdict, spatial_domain=sdom, temporal_domain=tdom
        )
        return dataset_id

    def test_coverage_ingest(self):
        """Ingest two granules and verify the coverage and the retriever both
        see the full, contiguous time axis."""
        stream_id = self.pubsub_management.create_stream()
        dataset_id = self.create_dataset()
        # I freaking hate this bug
        self.get_datastore(dataset_id)
        ingestion_config_id = self.get_ingestion_config()
        self.ingestion_management.persist_data_stream(
            stream_id=stream_id,
            ingestion_configuration_id=ingestion_config_id,
            dataset_id=dataset_id,
        )

        black_box = CoverageCraft()
        black_box.rdt["time"] = np.arange(20)
        black_box.rdt["temp"] = np.random.random(20) * 10
        black_box.sync_with_granule()
        granule = black_box.to_granule()

        publisher = SimpleStreamPublisher.new_publisher(
            self.container, self.exchange_point_name, stream_id
        )
        publisher.publish(granule)

        self.wait_until_we_have_enough_granules(dataset_id, 1)

        coverage = DatasetManagementService._get_coverage(dataset_id)
        black_box = CoverageCraft(coverage)
        black_box.sync_rdt_with_coverage()
        comp = black_box.rdt["time"] == np.arange(20)
        self.assertTrue(comp.all())

        # Second granule, continuing where the first left off
        black_box = CoverageCraft()
        black_box.rdt["time"] = np.arange(20) + 20
        black_box.rdt["temp"] = np.random.random(20) * 10
        black_box.sync_with_granule()
        granule = black_box.to_granule()
        publisher.publish(granule)

        self.wait_until_we_have_enough_granules(dataset_id, 2)

        coverage = DatasetManagementService._get_coverage(dataset_id)
        black_box = CoverageCraft(coverage)
        black_box.sync_rdt_with_coverage()
        comp = black_box.rdt["time"] == np.arange(40)
        self.assertTrue(comp.all())

        # The retriever must return the same 40 samples
        granule = self.data_retriever.retrieve(dataset_id)
        black_box = CoverageCraft()
        black_box.sync_rdt_with_granule(granule)
        comp = black_box.rdt["time"] == np.arange(40)
        self.assertTrue(comp.all())

    @attr("SMOKE")
    def test_dm_end_2_end(self):
        # --------------------------------------------------------------------------------
        # Set up a stream and have a mock instrument (producer) send data
        # --------------------------------------------------------------------------------
        stream_id = self.pubsub_management.create_stream()
        self.launch_producer(stream_id)

        # --------------------------------------------------------------------------------
        # Start persisting the data on the stream
        # - Get the ingestion configuration from the resource registry
        # - Create the dataset
        # - call persist_data_stream to setup the subscription for the ingestion workers
        #   on the stream that you specify which causes the data to be persisted
        # --------------------------------------------------------------------------------
        ingest_config_id = self.get_ingestion_config()
        dataset_id = self.create_dataset()
        self.ingestion_management.persist_data_stream(
            stream_id=stream_id,
            ingestion_configuration_id=ingest_config_id,
            dataset_id=dataset_id,
        )

        # --------------------------------------------------------------------------------
        # Now the granules are ingesting and persisted
        # --------------------------------------------------------------------------------
        self.wait_until_we_have_enough_granules(dataset_id, 4)

        # --------------------------------------------------------------------------------
        # Now get the data in one chunk using an RPC Call to start_retreive
        # --------------------------------------------------------------------------------
        replay_data = self.data_retriever.retrieve(dataset_id)
        self.assertIsInstance(replay_data, Granule)

        # --------------------------------------------------------------------------------
        # Now to try the streamed approach
        # --------------------------------------------------------------------------------
        replay_id, stream_id = self.data_retriever.define_replay(dataset_id)

        # --------------------------------------------------------------------------------
        # Create the listening endpoint for the the retriever to talk to
        # --------------------------------------------------------------------------------
        xp = self.container.ex_manager.create_xp(self.exchange_point_name)
        xn = self.container.ex_manager.create_xn_queue(self.exchange_space_name)
        xn.bind("%s.data" % stream_id, xp)
        subscriber = SimpleStreamSubscriber.new_subscriber(
            self.container, self.exchange_space_name, self.validate_granule_subscription
        )
        subscriber.start()

        self.data_retriever.start_replay(replay_id)

        # Fix: the original called self.event.wait(10) inside the try, but
        # Event.wait(timeout) returns on timeout rather than raising
        # gevent.Timeout, so the except branch was dead and the test could
        # pass without ever validating a granule.  Wrapping the untimed wait
        # in a gevent.Timeout context makes the handler actually fire.
        fail = False
        try:
            with gevent.Timeout(10):
                self.event.wait()
        except gevent.Timeout:
            fail = True

        subscriber.stop()

        self.assertTrue(not fail, "Failed to validate the data.")

    def test_replay_by_time(self):
        log.info("starting test...")

        # --------------------------------------------------------------------------------
        # Create the necessary configurations for the test
        # --------------------------------------------------------------------------------
        stream_id = self.pubsub_management.create_stream()
        config_id = self.get_ingestion_config()
        dataset_id = self.create_dataset()
        self.ingestion_management.persist_data_stream(
            stream_id=stream_id, ingestion_configuration_id=config_id, dataset_id=dataset_id
        )

        # --------------------------------------------------------------------------------
        # Create the datastore first,
        # --------------------------------------------------------------------------------
        # There is a race condition sometimes between the services and the process for
        # the creation of the datastore and it's instance, this ensures the datastore
        # exists before the process is even subscribing to data.
        self.get_datastore(dataset_id)

        self.publish_fake_data(stream_id)
        self.wait_until_we_have_enough_granules(dataset_id, 2)  # I just need two

        replay_granule = self.data_retriever.retrieve(dataset_id, {"start_time": 0, "end_time": 6})

        rdt = RecordDictionaryTool.load_from_granule(replay_granule)

        comp = rdt["time"] == np.array([0, 1, 2, 3, 4, 5])

        # Best-effort debug logging only.  Fix: the original used a bare
        # `except:`, which also swallows KeyboardInterrupt/SystemExit.
        try:
            log.info("Compared granule: %s", replay_granule.__dict__)
            log.info("Granule tax: %s", replay_granule.taxonomy.__dict__)
        except Exception:
            pass

        self.assertTrue(comp.all())

    def test_last_granule(self):
        # --------------------------------------------------------------------------------
        # Create the necessary configurations for the test
        # --------------------------------------------------------------------------------
        stream_id = self.pubsub_management.create_stream()
        config_id = self.get_ingestion_config()
        dataset_id = self.create_dataset()
        self.ingestion_management.persist_data_stream(
            stream_id=stream_id, ingestion_configuration_id=config_id, dataset_id=dataset_id
        )

        # --------------------------------------------------------------------------------
        # Create the datastore first,
        # --------------------------------------------------------------------------------
        self.get_datastore(dataset_id)

        self.publish_hifi(stream_id, 0)
        self.publish_hifi(stream_id, 1)

        self.wait_until_we_have_enough_granules(dataset_id, 2)  # I just need two

        replay_granule = self.data_retriever.retrieve_last_granule(dataset_id)

        rdt = RecordDictionaryTool.load_from_granule(replay_granule)

        # Only the second granule (times 10..19) should come back
        comp = rdt["time"] == np.arange(10) + 10
        self.assertTrue(comp.all())

    def test_replay_with_parameters(self):
        # --------------------------------------------------------------------------------
        # Create the configurations and the dataset
        # --------------------------------------------------------------------------------
        stream_id = self.pubsub_management.create_stream()
        config_id = self.get_ingestion_config()
        dataset_id = self.create_dataset()
        self.ingestion_management.persist_data_stream(
            stream_id=stream_id, ingestion_configuration_id=config_id, dataset_id=dataset_id
        )

        # --------------------------------------------------------------------------------
        # Coerce the datastore into existence (beats race condition)
        # --------------------------------------------------------------------------------
        self.get_datastore(dataset_id)

        self.launch_producer(stream_id)

        self.wait_until_we_have_enough_granules(dataset_id, 4)

        query = {"start_time": 0, "end_time": 20, "parameters": ["time", "temp"]}
        retrieved_data = self.data_retriever.retrieve(dataset_id=dataset_id, query=query)

        rdt = RecordDictionaryTool.load_from_granule(retrieved_data)
        comp = np.arange(20) == rdt["time"]
        self.assertTrue(comp.all(), "%s" % rdt.pretty_print())
        # Only the requested parameters may appear in the result
        self.assertEquals(set(rdt.iterkeys()), set(["time", "temp"]))

    def test_repersist_data(self):
        """Persist, unpersist, then re-persist a stream and verify all four
        granules survive the round trip."""
        stream_id = self.pubsub_management.create_stream()
        config_id = self.get_ingestion_config()
        dataset_id = self.create_dataset()
        self.ingestion_management.persist_data_stream(
            stream_id=stream_id, ingestion_configuration_id=config_id, dataset_id=dataset_id
        )
        self.get_datastore(dataset_id)
        self.publish_hifi(stream_id, 0)
        self.publish_hifi(stream_id, 1)
        self.wait_until_we_have_enough_granules(dataset_id, 2)

        # Stop persisting, then start again on the same dataset
        self.ingestion_management.unpersist_data_stream(
            stream_id=stream_id, ingestion_configuration_id=config_id
        )
        self.ingestion_management.persist_data_stream(
            stream_id=stream_id, ingestion_configuration_id=config_id, dataset_id=dataset_id
        )
        self.publish_hifi(stream_id, 2)
        self.publish_hifi(stream_id, 3)
        self.wait_until_we_have_enough_granules(dataset_id, 4)

        retrieved_granule = self.data_retriever.retrieve(dataset_id)
        rdt = RecordDictionaryTool.load_from_granule(retrieved_granule)
        comp = rdt["time"] == np.arange(0, 40)
        self.assertTrue(comp.all(), "Uh-oh: %s" % rdt["time"])