Example #1
    def setUp(self):
        # Start container
        #print 'instantiating container'
        self._start_container()

        self.container.start_rel_from_url('res/deploy/r2deploy.yml')

        self.dpsc_cli = DataProductManagementServiceClient()
        self.rrclient = ResourceRegistryServiceClient()
        self.damsclient = DataAcquisitionManagementServiceClient()
        self.pubsubcli = PubsubManagementServiceClient()
        self.ingestclient = IngestionManagementServiceClient()
        self.process_dispatcher = ProcessDispatcherServiceClient()
        self.dataset_management = DatasetManagementServiceClient()
        self.unsc = UserNotificationServiceClient()
        self.data_retriever = DataRetrieverServiceClient()
        self.identcli = IdentityManagementServiceClient()

        #------------------------------------------
        # Create the environment
        #------------------------------------------

        self.stream_def_id = self.pubsubcli.create_stream_definition(
            name='SBE37_CDM')

        self.process_definitions = {}
        ingestion_worker_definition = ProcessDefinition(
            name='ingestion worker')
        ingestion_worker_definition.executable = {
            'module':
            'ion.processes.data.ingestion.science_granule_ingestion_worker',
            'class': 'ScienceGranuleIngestionWorker'
        }
        process_definition_id = self.process_dispatcher.create_process_definition(
            process_definition=ingestion_worker_definition)
        self.process_definitions['ingestion_worker'] = process_definition_id

        self.pids = []
        self.exchange_points = []
        self.exchange_names = []

        #------------------------------------------------------------------------------------------------
        # First launch the ingestors
        #------------------------------------------------------------------------------------------------
        self.exchange_space = 'science_granule_ingestion'
        self.exchange_point = 'science_data'
        config = DotDict()
        config.process.datastore_name = 'datasets'
        config.process.queue_name = self.exchange_space

        self.exchange_names.append(self.exchange_space)
        self.exchange_points.append(self.exchange_point)

        pid = self.process_dispatcher.schedule_process(
            self.process_definitions['ingestion_worker'], configuration=config)
        log.debug("the ingestion worker process id: %s", pid)
        self.pids.append(pid)

        self.addCleanup(self.cleaning_up)
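The setUp above ends by registering a cleaning_up teardown that this example does not show. Below is a minimal sketch of what that teardown might look like, assuming the same self.pids, self.exchange_names and self.exchange_points attributes populated above; Example #28 later in this listing contains the full version from the original test class.

    def cleaning_up(self):
        # Cancel every ingestion worker scheduled in setUp.
        for pid in self.pids:
            try:
                self.process_dispatcher.cancel_process(pid)
            except Exception:
                log.debug("could not terminate the process id: %s", pid)
        # Remove the queues and exchange points created for ingestion.
        for xn in self.exchange_names:
            self.container.ex_manager.create_xn_queue(xn).delete()
        for xp in self.exchange_points:
            self.container.ex_manager.create_xp(xp).delete()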
Example #2
    def setUp(self):
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2deploy.yml')

        self.ingestion_management = IngestionManagementServiceClient()
        self.resource_registry = ResourceRegistryServiceClient()
        self.pubsub_management = PubsubManagementServiceClient()
        self.ingest_name = 'basic'
        self.exchange = 'testdata'
Example #3
    def __init__(self):
        self.ingestion_management = IngestionManagementServiceClient()
        self.resource_registry = ResourceRegistryServiceClient()
        self.data_product_management = DataProductManagementServiceClient()
        self.dataset_management = DatasetManagementServiceClient()
        self._paused_streams = []
        self._w_covs = {}
        self._ro_covs = {}

        self._context_managed = False
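The _paused_streams bookkeeping and the _context_managed flag above suggest this helper is meant to be driven by a with statement. The __enter__/__exit__ methods below are a purely hypothetical sketch of that pattern and are not part of the original class.

    def __enter__(self):
        # Hypothetical: mark the helper as context-managed; real cleanup of
        # self._paused_streams would be deferred to __exit__.
        self._context_managed = True
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Hypothetical: clear the flag and do not suppress exceptions.
        self._context_managed = False
        return False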
Example #4
    def setUp(self):
        import couchdb
        super(DatasetManagementIntTest,self).setUp()
        self._start_container()

        self.container.start_rel_from_url('res/deploy/r2dm.yml')


        self.db = self.container.datastore_manager.get_datastore('scidata', DataStore.DS_PROFILE.SCIDATA)
        self.db_raw = self.db.server

        self.dataset_management_client = DatasetManagementServiceClient(node=self.container.node)
        self.ingestion_client = IngestionManagementServiceClient(node=self.container.node)
Example #5
 def clean_subscriptions():
     ingestion_management = IngestionManagementServiceClient()
     pubsub = PubsubManagementServiceClient()
     rr     = ResourceRegistryServiceClient()
     ingestion_config_ids = ingestion_management.list_ingestion_configurations(id_only=True)
     for ic in ingestion_config_ids:
         subscription_ids, assocs = rr.find_objects(subject=ic, predicate=PRED.hasSubscription, id_only=True)
         for subscription_id, assoc in zip(subscription_ids, assocs):
             rr.delete_association(assoc)
             try:
                 pubsub.deactivate_subscription(subscription_id)
             except:
                 log.exception("Unable to decativate subscription: %s", subscription_id)
             pubsub.delete_subscription(subscription_id)
Example #6
 def clean_subscriptions():
     ingestion_management = IngestionManagementServiceClient()
     pubsub = PubsubManagementServiceClient()
     rr     = ResourceRegistryServiceClient()
     ingestion_config_ids = ingestion_management.list_ingestion_configurations(id_only=True)
     for ic in ingestion_config_ids:
         subscription_ids, assocs = rr.find_objects(subject=ic, predicate=PRED.hasSubscription, id_only=True)
         for subscription_id, assoc in zip(subscription_ids, assocs):
             rr.delete_association(assoc)
             try:
                 pubsub.deactivate_subscription(subscription_id)
             except:
                 log.exception("Unable to decativate subscription: %s", subscription_id)
             pubsub.delete_subscription(subscription_id)
Example #7
    def clean_subscriptions():
        ingestion_management = IngestionManagementServiceClient()
        pubsub = PubsubManagementServiceClient()
        rr = ResourceRegistryServiceClient()
        ingestion_config_ids = ingestion_management.list_ingestion_configurations(id_only=True)
        for ic in ingestion_config_ids:

            assocs = rr.find_associations(subject=ic, predicate=PRED.hasSubscription, id_only=False)
            for assoc in assocs:
                rr.delete_association(assoc)
                try:
                    pubsub.deactivate_subscription(assoc.o)
                except:
                    pass
                pubsub.delete_subscription(assoc.o)
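Both clean_subscriptions variants above take no self argument and are invoked directly on the test class. A hedged wiring sketch follows, mirroring Example #28 later in this listing, where the teardown ends with IngestionManagementIntTest.clean_subscriptions(); treat the addCleanup registration here as an assumption rather than the original author's code.

    def setUp(self):
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2deploy.yml')
        # Assumed wiring: run the subscription cleanup helper after the test,
        # the way Example #28 calls it at the end of cleaning_up().
        self.addCleanup(IngestionManagementIntTest.clean_subscriptions)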
Example #8
    def setUp(self): # Love the non pep-8 convention
        self._start_container()

        self.container.start_rel_from_url('res/deploy/r2deploy.yml')

        self.process_dispatcher   = ProcessDispatcherServiceClient()
        self.pubsub_management    = PubsubManagementServiceClient()
        self.resource_registry    = ResourceRegistryServiceClient()
        self.dataset_management   = DatasetManagementServiceClient()
        self.ingestion_management = IngestionManagementServiceClient()
        self.data_retriever       = DataRetrieverServiceClient()
        self.event                = Event()
        self.exchange_space_name  = 'test_granules'
        self.exchange_point_name  = 'science_data'       
        self.i                    = 0
        self.cci                  = 0
Example #9
    def setUp(self):
        # Start container
        #print 'instantiating container'
        self._start_container()
        #container = Container()
        #print 'starting container'
        #container.start()
        #print 'started container'

        self.container.start_rel_from_url('res/deploy/r2deploy.yml')

        print 'started services'

        # Now create client to DataProductManagementService
        self.rrclient = ResourceRegistryServiceClient(node=self.container.node)
        self.damsclient = DataAcquisitionManagementServiceClient(
            node=self.container.node)
        self.pubsubcli = PubsubManagementServiceClient(
            node=self.container.node)
        self.ingestclient = IngestionManagementServiceClient(
            node=self.container.node)
        self.imsclient = InstrumentManagementServiceClient(
            node=self.container.node)
        self.dpclient = DataProductManagementServiceClient(
            node=self.container.node)
        self.datasetclient = DatasetManagementServiceClient(
            node=self.container.node)

        # set up listener vars
        self._data_greenlets = []
        self._no_samples = None
        self._samples_received = []
Example #10
    def setUp(self):
        # Start container
        #print 'instantiating container'
        self._start_container()

        self.container.start_rel_from_url('res/deploy/r2deploy.yml')

        # Now create client to DataProductManagementService
        self.rrclient = ResourceRegistryServiceClient(node=self.container.node)
        self.damsclient = DataAcquisitionManagementServiceClient(node=self.container.node)
        self.pubsubclient =  PubsubManagementServiceClient(node=self.container.node)
        self.ingestclient = IngestionManagementServiceClient(node=self.container.node)
        self.dpmsclient = DataProductManagementServiceClient(node=self.container.node)
        self.dataprocessclient = DataProcessManagementServiceClient(node=self.container.node)
        self.imsclient = InstrumentManagementServiceClient(node=self.container.node)
        self.omsclient = ObservatoryManagementServiceClient(node=self.container.node)
        self.process_dispatcher   = ProcessDispatcherServiceClient()

        self.dataset_management = DatasetManagementServiceClient()

        # create missing data process definition
        dpd_obj = IonObject(RT.DataProcessDefinition,
                            name=LOGICAL_TRANSFORM_DEFINITION_NAME,
                            description="normally in preload",
                            module='ion.processes.data.transforms.logical_transform',
                            class_name='logical_transform')
        self.dataprocessclient.create_data_process_definition(dpd_obj)

        # deactivate all data processes when tests are complete
        def killAllDataProcesses():
            for proc_id in self.rrclient.find_resources(RT.DataProcess, None, None, True)[0]:
                self.dataprocessclient.deactivate_data_process(proc_id)
                self.dataprocessclient.delete_data_process(proc_id)
        self.addCleanup(killAllDataProcesses)
Example #11
    def setUp(self):
        # Start container
        #print 'instantiating container'
        self._start_container()
        #container = Container()
        #print 'starting container'
        #container.start()
        #print 'started container'

        self.container.start_rel_from_url('res/deploy/r2deploy.yml')

        print 'started services'

        # Now create client to DataProductManagementService
        self.rrclient = ResourceRegistryServiceClient(node=self.container.node)
        self.damsclient = DataAcquisitionManagementServiceClient(
            node=self.container.node)
        self.pubsubclient = PubsubManagementServiceClient(
            node=self.container.node)
        self.ingestclient = IngestionManagementServiceClient(
            node=self.container.node)
        self.imsclient = InstrumentManagementServiceClient(
            node=self.container.node)
        self.dataproductclient = DataProductManagementServiceClient(
            node=self.container.node)
        self.dataprocessclient = DataProcessManagementServiceClient(
            node=self.container.node)
        self.datasetclient = DatasetManagementServiceClient(
            node=self.container.node)
        self.processdispatchclient = ProcessDispatcherServiceClient(
            node=self.container.node)
        self.dataset_management = self.datasetclient
Example #12
    def setUp(self):
        # Start container
        #print 'instantiating container'
        self._start_container()

        self.container.start_rel_from_url('res/deploy/r2deploy.yml')

        # Now create client to DataProductManagementService
        self.rrclient = ResourceRegistryServiceClient(node=self.container.node)
        self.damsclient = DataAcquisitionManagementServiceClient(
            node=self.container.node)
        self.pubsubclient = PubsubManagementServiceClient(
            node=self.container.node)
        self.ingestclient = IngestionManagementServiceClient(
            node=self.container.node)
        self.dpmsclient = DataProductManagementServiceClient(
            node=self.container.node)
        self.dataprocessclient = DataProcessManagementServiceClient(
            node=self.container.node)
        self.imsclient = InstrumentManagementServiceClient(
            node=self.container.node)
        self.omsclient = ObservatoryManagementServiceClient(
            node=self.container.node)
        self.process_dispatcher = ProcessDispatcherServiceClient()

        self.dataset_management = DatasetManagementServiceClient()

        # deactivate all data processes when tests are complete
        def killAllDataProcesses():
            for proc_id in self.rrclient.find_resources(
                    RT.DataProcess, None, None, True)[0]:
                self.dataprocessclient.deactivate_data_process(proc_id)
                self.dataprocessclient.delete_data_process(proc_id)

        self.addCleanup(killAllDataProcesses)
Example #13
    def setUp(self):
        # Start container

        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2deploy.yml')

        # Now create client to DataProductManagementService
        self.rrclient = ResourceRegistryServiceClient(node=self.container.node)
        self.damsclient = DataAcquisitionManagementServiceClient(
            node=self.container.node)
        self.pubsubclient = PubsubManagementServiceClient(
            node=self.container.node)
        self.ingestclient = IngestionManagementServiceClient(
            node=self.container.node)
        self.imsclient = InstrumentManagementServiceClient(
            node=self.container.node)
        self.dataproductclient = DataProductManagementServiceClient(
            node=self.container.node)
        self.dataprocessclient = DataProcessManagementServiceClient(
            node=self.container.node)
        self.datasetclient = DatasetManagementServiceClient(
            node=self.container.node)
        self.workflowclient = WorkflowManagementServiceClient(
            node=self.container.node)
        self.process_dispatcher = ProcessDispatcherServiceClient(
            node=self.container.node)

        self.ctd_stream_def = SBE37_CDM_stream_definition()
Example #14
    def on_start(self):
        super(IngestionLauncher, self).on_start()
        exchange_point = self.CFG.get("process", {}).get("exchange_point", "science_data")
        couch_storage = self.CFG.get("process", {}).get("couch_storage", {})
        couch_storage = CouchStorage(**couch_storage)
        hdf_storage = self.CFG.get("process", {}).get("hdf_storage", {})
        number_of_workers = self.CFG.get("process", {}).get("number_of_workers", 2)

        ingestion_management_service = IngestionManagementServiceClient(node=self.container.node)
        ingestion_id = ingestion_management_service.create_ingestion_configuration(
            exchange_point_id=exchange_point,
            couch_storage=couch_storage,
            hdf_storage=hdf_storage,
            number_of_workers=number_of_workers,
            default_policy={},
        )
        ingestion_management_service.activate_ingestion_configuration(ingestion_id)
Example #15
    def on_start(self):
        super(IngestionLauncher,self).on_start()

        exchange_point = self.CFG.get_safe('ingestion.exchange_point','science_data')
        couch_opts = self.CFG.get_safe('ingestion.couch_storage',{})
        couch_storage = CouchStorage(**couch_opts)
        hdf_opts = self.CFG.get_safe('ingestion.hdf_storage',{})
        hdf_storage = HdfStorage(**hdf_opts)
        number_of_workers = self.CFG.get_safe('ingestion.number_of_workers',2)

        ingestion_management_service = IngestionManagementServiceClient(node=self.container.node)
        ingestion_id = ingestion_management_service.create_ingestion_configuration(
            exchange_point_id=exchange_point,
            couch_storage=couch_storage,
            hdf_storage=hdf_storage,
            number_of_workers=number_of_workers
        )
        ingestion_management_service.activate_ingestion_configuration(ingestion_id)
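Both IngestionLauncher variants read their settings from configuration ('process.*' keys in the first variant, 'ingestion.*' keys via CFG.get_safe in the second). The DotDict below is a minimal sketch of a matching 'ingestion.*' configuration; the CouchStorage keys ('server', 'database') are borrowed from the test_dataset_ingestion example later in this listing, and the HDF options are left empty.

config = DotDict()
config.ingestion.exchange_point = 'science_data'
config.ingestion.couch_storage = {'server': 'localhost', 'database': 'scidata'}
config.ingestion.hdf_storage = {}
config.ingestion.number_of_workers = 2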
Example #16
    def setUp(self):
        self._start_container()
        self.container.start_rel_from_url("res/deploy/r2deploy.yml")

        self.ingestion_management = IngestionManagementServiceClient()
        self.resource_registry = ResourceRegistryServiceClient()
        self.pubsub_management = PubsubManagementServiceClient()
        self.ingest_name = "basic"
        self.exchange = "testdata"
Example #17
class DatasetManagementIntTest(IonIntegrationTestCase):
    def setUp(self):
        import couchdb
        super(DatasetManagementIntTest,self).setUp()
        self._start_container()

        self.container.start_rel_from_url('res/deploy/r2dm.yml')


        self.db = self.container.datastore_manager.get_datastore('scidata', DataStore.DS_PROFILE.SCIDATA)
        self.db_raw = self.db.server

        self.dataset_management_client = DatasetManagementServiceClient(node=self.container.node)
        self.ingestion_client = IngestionManagementServiceClient(node=self.container.node)

    def _random_data(self, entropy):
        random_pressures = [(random.random()*100) for i in xrange(entropy)]
        random_salinity = [(random.random()*28) for i in xrange(entropy)]
        random_temperature = [(random.random()*10)+32 for i in xrange(entropy)]
        random_times = [random.randrange(1328205227, 1328896395) for i in xrange(entropy)]
        random_lat = [(random.random()*10)+30 for i in xrange(entropy)]
        random_lon = [(random.random()*10)+70 for i in xrange(entropy)]
        return [random_pressures, random_salinity, random_temperature, random_times, random_lat, random_lon]

    def _generate_point(self, entropy=5):
        points = []
        random_values = self._random_data(entropy)
        point = ctd_stream_packet(stream_id='test_data', p=random_values[0], c=random_values[1], t=random_values[2],time=random_values[3], lat=random_values[4], lon=random_values[5], create_hdf=False)
        return point


    def test_get_dataset_bounds(self):
        for i in xrange(3):
            point = self._generate_point()
            self.db.create(point)

        dataset_id = self.dataset_management_client.create_dataset(stream_id='test_data', datastore_name='scidata')


        bounds = self.dataset_management_client.get_dataset_bounds(dataset_id=dataset_id)

        self.assertTrue(bounds['latitude_bounds'][0] > 30.0)
        self.assertTrue(bounds['latitude_bounds'][1] < 40.0)
        self.assertTrue(bounds['longitude_bounds'][0] > 70.0)
        self.assertTrue(bounds['longitude_bounds'][1] < 80.0)

        self.dataset_management_client.delete_dataset(dataset_id)
        
    @unittest.skip('not ready yet')
    def test_dataset_ingestion(self):
        couch_storage = { 'server':'localhost', 'database':'scidata'}
        ingestion_configuration_id = self.ingestion_client.create_ingestion_configuration(
            exchange_point_id='science_data',
            couch_storage=couch_storage,
            hdf_storage={},
            number_of_workers=4,
            default_policy={})
Example #18
    def setUp(self):
        self.datastore_name = 'datasets'
        self.exchange_point = 'science_data'
        self.exchange_space = 'science_granule_ingestion'
        self.queue_name     = self.exchange_space
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2dm.yml')

        self.ingestion_management = IngestionManagementServiceClient()
        self.pubsub               = PubsubManagementServiceClient()
Example #19
    def setUp(self):
        # Start container
        #print 'instantiating container'
        self._start_container()

        self.container.start_rel_from_url('res/deploy/r2deploy.yml')

        self.dpsc_cli = DataProductManagementServiceClient(node=self.container.node)
        self.rrclient = ResourceRegistryServiceClient(node=self.container.node)
        self.damsclient = DataAcquisitionManagementServiceClient(node=self.container.node)
        self.pubsubcli =  PubsubManagementServiceClient(node=self.container.node)
        self.ingestclient = IngestionManagementServiceClient(node=self.container.node)
        self.process_dispatcher   = ProcessDispatcherServiceClient()
        self.dataset_management = DatasetManagementServiceClient()
        self.unsc = UserNotificationServiceClient()
        self.data_retriever = DataRetrieverServiceClient()

        #------------------------------------------
        # Create the environment
        #------------------------------------------

        datastore_name = CACHE_DATASTORE_NAME
        self.db = self.container.datastore_manager.get_datastore(datastore_name)
        self.stream_def_id = self.pubsubcli.create_stream_definition(name='SBE37_CDM')

        self.process_definitions  = {}
        ingestion_worker_definition = ProcessDefinition(name='ingestion worker')
        ingestion_worker_definition.executable = {
            'module':'ion.processes.data.ingestion.science_granule_ingestion_worker',
            'class' :'ScienceGranuleIngestionWorker'
        }
        process_definition_id = self.process_dispatcher.create_process_definition(process_definition=ingestion_worker_definition)
        self.process_definitions['ingestion_worker'] = process_definition_id

        self.pids = []
        self.exchange_points = []
        self.exchange_names = []

        #------------------------------------------------------------------------------------------------
        # First launch the ingestors
        #------------------------------------------------------------------------------------------------
        self.exchange_space       = 'science_granule_ingestion'
        self.exchange_point       = 'science_data'
        config = DotDict()
        config.process.datastore_name = 'datasets'
        config.process.queue_name = self.exchange_space

        self.exchange_names.append(self.exchange_space)
        self.exchange_points.append(self.exchange_point)

        pid = self.process_dispatcher.schedule_process(self.process_definitions['ingestion_worker'],configuration=config)
        log.debug("the ingestion worker process id: %s", pid)
        self.pids.append(pid)

        self.addCleanup(self.cleaning_up)
Example #20
    def on_start(self):
        super(IngestionLauncher, self).on_start()

        exchange_point = self.CFG.get_safe('ingestion.exchange_point',
                                           'science_data')
        couch_opts = self.CFG.get_safe('ingestion.couch_storage', {})
        couch_storage = CouchStorage(**couch_opts)
        hdf_opts = self.CFG.get_safe('ingestion.hdf_storage', {})
        hdf_storage = HdfStorage(**hdf_opts)
        number_of_workers = self.CFG.get_safe('ingestion.number_of_workers', 2)

        ingestion_management_service = IngestionManagementServiceClient(
            node=self.container.node)
        ingestion_id = ingestion_management_service.create_ingestion_configuration(
            exchange_point_id=exchange_point,
            couch_storage=couch_storage,
            hdf_storage=hdf_storage,
            number_of_workers=number_of_workers)
        ingestion_management_service.activate_ingestion_configuration(
            ingestion_id)
Example #21
    def setUp(self):
        self._start_container()
        config = DotDict()
        config.bootstrap.processes.ingestion.module = 'ion.processes.data.ingestion.ingestion_worker_a'
        config.bootstrap.processes.replay.module    = 'ion.processes.data.replay.replay_process_a'
        self.container.start_rel_from_url('res/deploy/r2dm.yml', config)


        self.datastore_name = 'test_datasets'
        self.pubsub_management    = PubsubManagementServiceClient()
        self.ingestion_management = IngestionManagementServiceClient()
        self.dataset_management   = DatasetManagementServiceClient()
        self.process_dispatcher   = ProcessDispatcherServiceClient()
        self.data_retriever       = DataRetrieverServiceClient()
Example #22
    def setUp(self):
        # Start container
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2deploy.yml')

        # Now create client to DataProductManagementService
        self.rrclient = ResourceRegistryServiceClient(node=self.container.node)
        self.damsclient = DataAcquisitionManagementServiceClient(node=self.container.node)
        self.pubsubclient =  PubsubManagementServiceClient(node=self.container.node)
        self.ingestclient = IngestionManagementServiceClient(node=self.container.node)
        self.imsclient = InstrumentManagementServiceClient(node=self.container.node)
        self.dataproductclient = DataProductManagementServiceClient(node=self.container.node)
        self.dataprocessclient = DataProcessManagementServiceClient(node=self.container.node)
        self.datasetclient =  DatasetManagementServiceClient(node=self.container.node)
Example #23
class ScienceGranuleIngestionIntTest(IonIntegrationTestCase):
    def setUp(self):
        self.datastore_name = 'datasets'
        self.exchange_point = 'science_data'
        self.exchange_space = 'science_granule_ingestion'
        self.queue_name     = self.exchange_space
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2dm.yml')

        self.ingestion_management = IngestionManagementServiceClient()
        self.pubsub               = PubsubManagementServiceClient()


    def build_granule(self):
        return ScienceGranuleIngestionWorkerUnitTest.build_granule()

    def launch_worker(self):
        cfg = DotDict()

        cfg.process.datastore_name = self.datastore_name
        cfg.process.queue_name = self.queue_name

        #@todo: replace with CEI friendly calls

        pid = self.container.spawn_process('ingest_worker', 'ion.processes.data.ingestion.science_granule_ingestion_worker','ScienceGranuleIngestionWorker',cfg)
        return pid

    def create_ingestion_config(self):
        ingest_queue = IngestionQueue(name=self.exchange_space, type='science_granule')
        config_id = self.ingestion_management.create_ingestion_configuration(name='standard_ingest', exchange_point_id=self.exchange_point, queues=[ingest_queue])
        return config_id
            
    def create_stream(self):
        stream_id = self.pubsub.create_stream()
        return stream_id

    def poll(self, evaluation_callback, *args, **kwargs):
        # Re-evaluate the callback until it succeeds or ~5 seconds elapse;
        # a success is returned immediately and never masked by the timeout.
        cutoff = time.time() + 5
        done = False
        while not done:
            if evaluation_callback(*args, **kwargs):
                done = True
            elif time.time() >= cutoff:
                raise Timeout('No results found within the allotted time')
        return True
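A brief usage sketch for the poll() helper above, using a hypothetical predicate: the callback is re-evaluated until it returns a truthy value, and Timeout is raised if roughly five seconds pass without success.

        received = []

        def got_granule():
            # Hypothetical predicate: a subscriber callback elsewhere appends
            # to `received` when a granule arrives.
            return len(received) > 0

        self.poll(got_granule)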
Example #24
    def setUp(self):
        # Start container
        #print 'instantiating container'
        self._start_container()

        log.debug("Start rel from url")
        self.container.start_rel_from_url('res/deploy/r2deploy.yml')

        self.DPMS = DataProductManagementServiceClient()
        self.RR = ResourceRegistryServiceClient()
        self.RR2 = EnhancedResourceRegistryClient(self.RR)
        self.DAMS = DataAcquisitionManagementServiceClient()
        self.PSMS = PubsubManagementServiceClient()
        self.ingestclient = IngestionManagementServiceClient()
        self.PD = ProcessDispatcherServiceClient()
        self.DSMS = DatasetManagementServiceClient()
        self.unsc = UserNotificationServiceClient()
        self.data_retriever = DataRetrieverServiceClient()

        #------------------------------------------
        # Create the environment
        #------------------------------------------
        log.debug("get datastore")
        datastore_name = CACHE_DATASTORE_NAME
        self.db = self.container.datastore_manager.get_datastore(
            datastore_name)
        self.stream_def_id = self.PSMS.create_stream_definition(
            name='SBE37_CDM')

        self.process_definitions = {}
        ingestion_worker_definition = ProcessDefinition(
            name='ingestion worker')
        ingestion_worker_definition.executable = {
            'module':
            'ion.processes.data.ingestion.science_granule_ingestion_worker',
            'class': 'ScienceGranuleIngestionWorker'
        }
        process_definition_id = self.PD.create_process_definition(
            process_definition=ingestion_worker_definition)
        self.process_definitions['ingestion_worker'] = process_definition_id

        self.pids = []
        self.exchange_points = []
        self.exchange_names = []

        self.addCleanup(self.cleaning_up)
Example #25
    def setUp(self):
        # Start container
        self._start_container()

        # self.container.start_rel_from_url('res/deploy/r2deploy.yml')
        self.container.start_rel_from_url("res/deploy/r2deploy.yml")

        print "started services"

        # Now create client to DataProductManagementService
        self.rrclient = ResourceRegistryServiceClient(node=self.container.node)
        self.damsclient = DataAcquisitionManagementServiceClient(node=self.container.node)
        self.pubsubclient = PubsubManagementServiceClient(node=self.container.node)
        self.ingestclient = IngestionManagementServiceClient(node=self.container.node)
        self.imsclient = InstrumentManagementServiceClient(node=self.container.node)
        self.dataproductclient = DataProductManagementServiceClient(node=self.container.node)
        self.dataprocessclient = DataProcessManagementServiceClient(node=self.container.node)
        self.datasetclient = DatasetManagementServiceClient(node=self.container.node)
        self.omsclient = ObservatoryManagementServiceClient(node=self.container.node)
Example #26
    def setUp(self):
        # Start container
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2deploy.yml')

        log.debug("TestExternalDatasetAgentMgmt: started services")

        # Now create client to DataProductManagementService
        self.rrclient = ResourceRegistryServiceClient(node=self.container.node)
        self.damsclient = DataAcquisitionManagementServiceClient(
            node=self.container.node)
        self.pubsubcli = PubsubManagementServiceClient(
            node=self.container.node)
        self.ingestclient = IngestionManagementServiceClient(
            node=self.container.node)
        self.dpclient = DataProductManagementServiceClient(
            node=self.container.node)
        self.datasetclient = DatasetManagementServiceClient(
            node=self.container.node)
Example #27
    def setUp(self):  # Love the non pep-8 convention
        self._start_container()

        self.container.start_rel_from_url("res/deploy/r2deploy.yml")

        self.process_dispatcher = ProcessDispatcherServiceClient()
        self.pubsub_management = PubsubManagementServiceClient()
        self.resource_registry = ResourceRegistryServiceClient()
        self.dataset_management = DatasetManagementServiceClient()
        self.ingestion_management = IngestionManagementServiceClient()
        self.data_retriever = DataRetrieverServiceClient()
        self.pids = []
        self.event = Event()
        self.exchange_space_name = "test_granules"
        self.exchange_point_name = "science_data"
        self.i = 0

        self.purge_queues()
        self.queue_buffer = []
        self.streams = []
        self.addCleanup(self.stop_all_ingestion)
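The setUp above calls self.purge_queues() and registers self.stop_all_ingestion, neither of which is shown here. The sketch below is one plausible purge_queues(), assuming the ingestion queue is reached through the container's exchange manager (the same create_xn_queue call used in Example #28's cleanup) and that the returned queue object exposes purge().

    def purge_queues(self):
        # Assumed helper: drain the science-granule ingestion queue so each
        # test starts with an empty backlog.
        xn = self.container.ex_manager.create_xn_queue('science_granule_ingestion')
        xn.purge()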
Example #28
class TestDataProductManagementServiceIntegration(IonIntegrationTestCase):
    def setUp(self):
        # Start container
        #print 'instantiating container'
        self._start_container()

        self.container.start_rel_from_url('res/deploy/r2deploy.yml')

        self.dpsc_cli = DataProductManagementServiceClient()
        self.rrclient = ResourceRegistryServiceClient()
        self.damsclient = DataAcquisitionManagementServiceClient()
        self.pubsubcli = PubsubManagementServiceClient()
        self.ingestclient = IngestionManagementServiceClient()
        self.process_dispatcher = ProcessDispatcherServiceClient()
        self.dataset_management = DatasetManagementServiceClient()
        self.unsc = UserNotificationServiceClient()
        self.data_retriever = DataRetrieverServiceClient()

        #------------------------------------------
        # Create the environment
        #------------------------------------------

        self.stream_def_id = self.pubsubcli.create_stream_definition(
            name='SBE37_CDM')

        self.process_definitions = {}
        ingestion_worker_definition = ProcessDefinition(
            name='ingestion worker')
        ingestion_worker_definition.executable = {
            'module':
            'ion.processes.data.ingestion.science_granule_ingestion_worker',
            'class': 'ScienceGranuleIngestionWorker'
        }
        process_definition_id = self.process_dispatcher.create_process_definition(
            process_definition=ingestion_worker_definition)
        self.process_definitions['ingestion_worker'] = process_definition_id

        self.pids = []
        self.exchange_points = []
        self.exchange_names = []

        #------------------------------------------------------------------------------------------------
        # First launch the ingestors
        #------------------------------------------------------------------------------------------------
        self.exchange_space = 'science_granule_ingestion'
        self.exchange_point = 'science_data'
        config = DotDict()
        config.process.datastore_name = 'datasets'
        config.process.queue_name = self.exchange_space

        self.exchange_names.append(self.exchange_space)
        self.exchange_points.append(self.exchange_point)

        pid = self.process_dispatcher.schedule_process(
            self.process_definitions['ingestion_worker'], configuration=config)
        log.debug("the ingestion worker process id: %s", pid)
        self.pids.append(pid)

        self.addCleanup(self.cleaning_up)

    def cleaning_up(self):
        for pid in self.pids:
            log.debug("number of pids to be terminated: %s", len(self.pids))
            try:
                self.process_dispatcher.cancel_process(pid)
                log.debug("Terminated the process: %s", pid)
            except:
                log.debug("could not terminate the process id: %s" % pid)
        IngestionManagementIntTest.clean_subscriptions()

        for xn in self.exchange_names:
            xni = self.container.ex_manager.create_xn_queue(xn)
            xni.delete()
        for xp in self.exchange_points:
            xpi = self.container.ex_manager.create_xp(xp)
            xpi.delete()

    def get_datastore(self, dataset_id):
        dataset = self.dataset_management.read_dataset(dataset_id)
        datastore_name = dataset.datastore_name
        datastore = self.container.datastore_manager.get_datastore(
            datastore_name, DataStore.DS_PROFILE.SCIDATA)
        return datastore

    @attr('EXT')
    @attr('PREP')
    def test_create_data_product(self):

        #------------------------------------------------------------------------------------------------
        # create a stream definition for the data from the ctd simulator
        #------------------------------------------------------------------------------------------------
        parameter_dictionary = self.dataset_management.read_parameter_dictionary_by_name(
            'ctd_parsed_param_dict')
        ctd_stream_def_id = self.pubsubcli.create_stream_definition(
            name='Simulated CTD data',
            parameter_dictionary_id=parameter_dictionary._id)
        log.debug("Created stream def id %s" % ctd_stream_def_id)

        #------------------------------------------------------------------------------------------------
        # test creating a new data product w/o a stream definition
        #------------------------------------------------------------------------------------------------

        dp_obj = IonObject(RT.DataProduct,
                           name='DP1',
                           description='some new dp')

        dp_obj.geospatial_bounds.geospatial_latitude_limit_north = 10.0
        dp_obj.geospatial_bounds.geospatial_latitude_limit_south = -10.0
        dp_obj.geospatial_bounds.geospatial_longitude_limit_east = 10.0
        dp_obj.geospatial_bounds.geospatial_longitude_limit_west = -10.0
        dp_obj.ooi_product_name = "PRODNAME"

        #------------------------------------------------------------------------------------------------
        # Create a set of ParameterContext objects to define the parameters in the coverage, add each to the ParameterDictionary
        #------------------------------------------------------------------------------------------------

        dp_id = self.dpsc_cli.create_data_product(
            data_product=dp_obj, stream_definition_id=ctd_stream_def_id)
        # Assert that the data product has an associated stream at this stage
        stream_ids, _ = self.rrclient.find_objects(dp_id, PRED.hasStream,
                                                   RT.Stream, True)
        self.assertNotEquals(len(stream_ids), 0)

        # Assert that the data product has an associated stream def at this stage
        stream_ids, _ = self.rrclient.find_objects(dp_id,
                                                   PRED.hasStreamDefinition,
                                                   RT.StreamDefinition, True)
        self.assertNotEquals(len(stream_ids), 0)

        self.dpsc_cli.activate_data_product_persistence(dp_id)

        dp_obj = self.dpsc_cli.read_data_product(dp_id)
        self.assertIsNotNone(dp_obj)
        self.assertEquals(dp_obj.geospatial_point_center.lat, 0.0)
        log.debug('Created data product %s', dp_obj)
        #------------------------------------------------------------------------------------------------
        # test creating a new data product with  a stream definition
        #------------------------------------------------------------------------------------------------
        log.debug('Creating new data product with a stream definition')
        dp_obj = IonObject(RT.DataProduct,
                           name='DP2',
                           description='some new dp')

        dp_id2 = self.dpsc_cli.create_data_product(dp_obj, ctd_stream_def_id)
        self.dpsc_cli.activate_data_product_persistence(dp_id2)
        log.debug('new dp_id = %s' % dp_id2)

        #------------------------------------------------------------------------------------------------
        #make sure data product is associated with stream def
        #------------------------------------------------------------------------------------------------
        streamdefs = []
        streams, _ = self.rrclient.find_objects(dp_id2, PRED.hasStream,
                                                RT.Stream, True)
        for s in streams:
            log.debug("Checking stream %s" % s)
            sdefs, _ = self.rrclient.find_objects(s, PRED.hasStreamDefinition,
                                                  RT.StreamDefinition, True)
            for sd in sdefs:
                log.debug("Checking streamdef %s" % sd)
                streamdefs.append(sd)
        self.assertIn(ctd_stream_def_id, streamdefs)

        group_names = self.dpsc_cli.get_data_product_group_list()
        self.assertIn("PRODNAME", group_names)

        # test reading a non-existent data product
        log.debug('reading non-existent data product')

        with self.assertRaises(NotFound):
            dp_obj = self.dpsc_cli.read_data_product('some_fake_id')

        # update a data product (tests read also)
        log.debug('Updating data product')
        # first get the existing dp object
        dp_obj = self.dpsc_cli.read_data_product(dp_id)

        # now tweak the object
        dp_obj.description = 'the very first dp'
        dp_obj.geospatial_bounds.geospatial_latitude_limit_north = 20.0
        dp_obj.geospatial_bounds.geospatial_latitude_limit_south = -20.0
        dp_obj.geospatial_bounds.geospatial_longitude_limit_east = 20.0
        dp_obj.geospatial_bounds.geospatial_longitude_limit_west = -20.0
        # now write the dp back to the registry
        update_result = self.dpsc_cli.update_data_product(dp_obj)

        # now get the dp back to see if it was updated
        dp_obj = self.dpsc_cli.read_data_product(dp_id)
        self.assertEquals(dp_obj.description, 'the very first dp')
        self.assertEquals(dp_obj.geospatial_point_center.lat, 0.0)
        log.debug('Updated data product %s', dp_obj)

        #test extension
        extended_product = self.dpsc_cli.get_data_product_extension(dp_id)
        self.assertEqual(dp_id, extended_product._id)
        self.assertEqual(
            ComputedValueAvailability.PROVIDED,
            extended_product.computed.product_download_size_estimated.status)
        self.assertEqual(
            0, extended_product.computed.product_download_size_estimated.value)

        self.assertEqual(ComputedValueAvailability.PROVIDED,
                         extended_product.computed.parameters.status)

        #log.debug("test_create_data_product: parameters %s" % extended_product.computed.parameters.value)

        def ion_object_encoder(obj):
            return obj.__dict__

        #test prepare for create
        data_product_data = self.dpsc_cli.prepare_data_product_support()

        #print simplejson.dumps(data_product_data, default=ion_object_encoder, indent= 2)

        self.assertEqual(data_product_data._id, "")
        self.assertEqual(data_product_data.type_, OT.DataProductPrepareSupport)
        self.assertEqual(
            len(data_product_data.associations['StreamDefinition'].resources),
            2)
        self.assertEqual(
            len(data_product_data.associations['Dataset'].resources), 0)
        self.assertEqual(
            len(data_product_data.associations['StreamDefinition'].
                associated_resources), 0)
        self.assertEqual(
            len(data_product_data.associations['Dataset'].associated_resources
                ), 0)

        #test prepare for update
        data_product_data = self.dpsc_cli.prepare_data_product_support(dp_id)

        #print simplejson.dumps(data_product_data, default=ion_object_encoder, indent= 2)

        self.assertEqual(data_product_data._id, dp_id)
        self.assertEqual(data_product_data.type_, OT.DataProductPrepareSupport)
        self.assertEqual(
            len(data_product_data.associations['StreamDefinition'].resources),
            2)

        self.assertEqual(
            len(data_product_data.associations['Dataset'].resources), 1)

        self.assertEqual(
            len(data_product_data.associations['StreamDefinition'].
                associated_resources), 1)
        self.assertEqual(
            data_product_data.associations['StreamDefinition'].
            associated_resources[0].s, dp_id)

        self.assertEqual(
            len(data_product_data.associations['Dataset'].associated_resources
                ), 1)
        self.assertEqual(
            data_product_data.associations['Dataset'].associated_resources[0].
            s, dp_id)

        # now 'delete' the data product
        log.debug("deleting data product: %s" % dp_id)
        self.dpsc_cli.delete_data_product(dp_id)

        # Assert that there are no associated streams leftover after deleting the data product
        stream_ids, assoc_ids = self.rrclient.find_objects(
            dp_id, PRED.hasStream, RT.Stream, True)
        self.assertEquals(len(stream_ids), 0)
        self.assertEquals(len(assoc_ids), 0)

        self.dpsc_cli.force_delete_data_product(dp_id)

        # now try to get the deleted dp object
        with self.assertRaises(NotFound):
            dp_obj = self.dpsc_cli.read_data_product(dp_id)

        # Get the events corresponding to the data product
        ret = self.unsc.get_recent_events(resource_id=dp_id)
        events = ret.value

        for event in events:
            log.debug("event time: %s" % event.ts_created)

        self.assertTrue(len(events) > 0)

    def test_data_product_stream_def(self):
        pdict_id = self.dataset_management.read_parameter_dictionary_by_name(
            'ctd_parsed_param_dict', id_only=True)
        ctd_stream_def_id = self.pubsubcli.create_stream_definition(
            name='Simulated CTD data', parameter_dictionary_id=pdict_id)

        dp_obj = IonObject(RT.DataProduct,
                           name='DP1',
                           description='some new dp')
        dp_id = self.dpsc_cli.create_data_product(
            data_product=dp_obj, stream_definition_id=ctd_stream_def_id)

        stream_def_id = self.dpsc_cli.get_data_product_stream_definition(dp_id)
        self.assertEquals(ctd_stream_def_id, stream_def_id)

    def test_derived_data_product(self):
        pdict_id = self.dataset_management.read_parameter_dictionary_by_name(
            'ctd_parsed_param_dict', id_only=True)
        ctd_stream_def_id = self.pubsubcli.create_stream_definition(
            name='ctd parsed', parameter_dictionary_id=pdict_id)
        self.addCleanup(self.pubsubcli.delete_stream_definition,
                        ctd_stream_def_id)

        dp = DataProduct(name='Instrument DP')
        dp_id = self.dpsc_cli.create_data_product(
            dp, stream_definition_id=ctd_stream_def_id)
        self.addCleanup(self.dpsc_cli.force_delete_data_product, dp_id)

        self.dpsc_cli.activate_data_product_persistence(dp_id)
        self.addCleanup(self.dpsc_cli.suspend_data_product_persistence, dp_id)

        dataset_ids, _ = self.rrclient.find_objects(subject=dp_id,
                                                    predicate=PRED.hasDataset,
                                                    id_only=True)
        if not dataset_ids:
            raise NotFound("Data Product %s dataset  does not exist" %
                           str(dp_id))
        dataset_id = dataset_ids[0]

        # Make the derived data product
        simple_stream_def_id = self.pubsubcli.create_stream_definition(
            name='TEMPWAT stream def',
            parameter_dictionary_id=pdict_id,
            available_fields=['time', 'temp'])
        tempwat_dp = DataProduct(name='TEMPWAT',
                                 category=DataProductTypeEnum.DERIVED)
        tempwat_dp_id = self.dpsc_cli.create_data_product(
            tempwat_dp,
            stream_definition_id=simple_stream_def_id,
            parent_data_product_id=dp_id)
        self.addCleanup(self.dpsc_cli.delete_data_product, tempwat_dp_id)
        # Check that the streams associated with the data product are persisted
        stream_ids, _ = self.rrclient.find_objects(dp_id, PRED.hasStream,
                                                   RT.Stream, True)
        for stream_id in stream_ids:
            self.assertTrue(self.ingestclient.is_persisted(stream_id))

        stream_id = stream_ids[0]
        route = self.pubsubcli.read_stream_route(stream_id=stream_id)

        rdt = RecordDictionaryTool(stream_definition_id=ctd_stream_def_id)
        rdt['time'] = np.arange(20)
        rdt['temp'] = np.arange(20)
        rdt['pressure'] = np.arange(20)

        publisher = StandaloneStreamPublisher(stream_id, route)

        dataset_modified = Event()

        def cb(*args, **kwargs):
            dataset_modified.set()

        es = EventSubscriber(event_type=OT.DatasetModified,
                             callback=cb,
                             origin=dataset_id,
                             auto_delete=True)
        es.start()
        self.addCleanup(es.stop)

        publisher.publish(rdt.to_granule())

        self.assertTrue(dataset_modified.wait(30))

        tempwat_dataset_ids, _ = self.rrclient.find_objects(tempwat_dp_id,
                                                            PRED.hasDataset,
                                                            id_only=True)
        tempwat_dataset_id = tempwat_dataset_ids[0]
        granule = self.data_retriever.retrieve(
            tempwat_dataset_id, delivery_format=simple_stream_def_id)
        rdt = RecordDictionaryTool.load_from_granule(granule)
        np.testing.assert_array_equal(rdt['time'], np.arange(20))
        self.assertEquals(set(rdt.fields), set(['time', 'temp']))

    def test_activate_suspend_data_product(self):

        #------------------------------------------------------------------------------------------------
        # create a stream definition for the data from the ctd simulator
        #------------------------------------------------------------------------------------------------
        pdict_id = self.dataset_management.read_parameter_dictionary_by_name(
            'ctd_parsed_param_dict', id_only=True)
        ctd_stream_def_id = self.pubsubcli.create_stream_definition(
            name='Simulated CTD data', parameter_dictionary_id=pdict_id)
        log.debug("Created stream def id %s" % ctd_stream_def_id)

        #------------------------------------------------------------------------------------------------
        # test creating a new data product w/o a stream definition
        #------------------------------------------------------------------------------------------------
        # Construct temporal and spatial Coordinate Reference System objects

        dp_obj = IonObject(RT.DataProduct,
                           name='DP1',
                           description='some new dp')

        log.debug("Created an IonObject for a data product: %s" % dp_obj)

        #------------------------------------------------------------------------------------------------
        # Create a set of ParameterContext objects to define the parameters in the coverage, add each to the ParameterDictionary
        #------------------------------------------------------------------------------------------------

        dp_id = self.dpsc_cli.create_data_product(
            data_product=dp_obj, stream_definition_id=ctd_stream_def_id)

        #------------------------------------------------------------------------------------------------
        # Subscribe to persist events
        #------------------------------------------------------------------------------------------------
        queue = gevent.queue.Queue()

        def info_event_received(message, headers):
            queue.put(message)

        es = EventSubscriber(event_type=OT.InformationContentStatusEvent,
                             callback=info_event_received,
                             origin=dp_id,
                             auto_delete=True)
        es.start()
        self.addCleanup(es.stop)

        #------------------------------------------------------------------------------------------------
        # test activate and suspend data product persistence
        #------------------------------------------------------------------------------------------------
        self.dpsc_cli.activate_data_product_persistence(dp_id)

        dp_obj = self.dpsc_cli.read_data_product(dp_id)
        self.assertIsNotNone(dp_obj)

        dataset_ids, _ = self.rrclient.find_objects(subject=dp_id,
                                                    predicate=PRED.hasDataset,
                                                    id_only=True)
        if not dataset_ids:
            raise NotFound("Data Product %s dataset  does not exist" %
                           str(dp_id))
        dataset_id = dataset_ids[0]

        # Check that the streams associated with the data product are persisted
        stream_ids, _ = self.rrclient.find_objects(dp_id, PRED.hasStream,
                                                   RT.Stream, True)
        for stream_id in stream_ids:
            self.assertTrue(self.ingestclient.is_persisted(stream_id))

        stream_id = stream_ids[0]
        route = self.pubsubcli.read_stream_route(stream_id=stream_id)

        rdt = RecordDictionaryTool(stream_definition_id=ctd_stream_def_id)
        rdt['time'] = np.arange(20)
        rdt['temp'] = np.arange(20)

        publisher = StandaloneStreamPublisher(stream_id, route)

        dataset_modified = Event()

        def cb(*args, **kwargs):
            dataset_modified.set()

        es = EventSubscriber(event_type=OT.DatasetModified,
                             callback=cb,
                             origin=dataset_id,
                             auto_delete=True)
        es.start()
        self.addCleanup(es.stop)

        publisher.publish(rdt.to_granule())

        self.assertTrue(dataset_modified.wait(30))

        #--------------------------------------------------------------------------------
        # Now get the data in one chunk using an RPC call to start_retrieve
        #--------------------------------------------------------------------------------

        replay_data = self.data_retriever.retrieve(dataset_ids[0])
        self.assertIsInstance(replay_data, Granule)

        log.debug(
            "The data retriever was able to replay the dataset that was attached to the data product "
            "we wanted to be persisted. Therefore the data product was indeed persisted; otherwise "
            "we could not have retrieved its dataset using the data retriever. This demonstrates "
            "that L4-CI-SA-RQ-267 is satisfied: 'Data product management shall persist data products'"
        )

        data_product_object = self.rrclient.read(dp_id)
        self.assertEquals(data_product_object.name, 'DP1')
        self.assertEquals(data_product_object.description, 'some new dp')

        log.debug(
            "Towards L4-CI-SA-RQ-308: 'Data product management shall persist data product metadata'. "
            "Attributes passed at creation of the data product object, name='%s', description='%s', "
            "match those of the object from the resource registry, name='%s', description='%s'" %
            (dp_obj.name, dp_obj.description, data_product_object.name,
             data_product_object.description))

        #------------------------------------------------------------------------------------------------
        # test suspend data product persistence
        #------------------------------------------------------------------------------------------------
        self.dpsc_cli.suspend_data_product_persistence(dp_id)

        dataset_modified.clear()

        rdt['time'] = np.arange(20, 40)

        publisher.publish(rdt.to_granule())
        self.assertFalse(dataset_modified.wait(2))

        self.dpsc_cli.activate_data_product_persistence(dp_id)
        dataset_modified.clear()

        publisher.publish(rdt.to_granule())
        self.assertTrue(dataset_modified.wait(30))

        granule = self.data_retriever.retrieve(dataset_id)
        rdt = RecordDictionaryTool.load_from_granule(granule)
        np.testing.assert_array_almost_equal(rdt['time'], np.arange(40))

        dataset_ids, _ = self.rrclient.find_objects(dp_id,
                                                    PRED.hasDataset,
                                                    id_only=True)
        self.assertEquals(len(dataset_ids), 1)

        self.dpsc_cli.suspend_data_product_persistence(dp_id)
        self.dpsc_cli.force_delete_data_product(dp_id)
        # now try to get the deleted dp object

        with self.assertRaises(NotFound):
            dp_obj = self.rrclient.read(dp_id)

        info_event_counter = 0
        runtime = 0
        starttime = time.time()
        caught_events = []

        # check that the four InformationContentStatusEvents were received
        while info_event_counter < 4 and runtime < 60:
            a = queue.get(timeout=60)
            caught_events.append(a)
            info_event_counter += 1
            runtime = time.time() - starttime

        self.assertEquals(info_event_counter, 4)
Example #29
    def on_start(self):

        log.debug("VizStreamProducer start")
        self.data_source_name = self.CFG.get('name')
        self.dataset = self.CFG.get('dataset')

        # create a pubsub client and a resource registry client
        self.rrclient = ResourceRegistryServiceClient(node=self.container.node)
        self.pubsubclient = PubsubManagementServiceClient(
            node=self.container.node)

        # Dummy instrument related clients
        self.imsclient = InstrumentManagementServiceClient(
            node=self.container.node)
        self.damsclient = DataAcquisitionManagementServiceClient(
            node=self.container.node)
        self.dpclient = DataProductManagementServiceClient(
            node=self.container.node)
        self.IngestClient = IngestionManagementServiceClient(
            node=self.container.node)

        # Additional code for creating a dummy instrument
        """
        # Set up the preconditions. Look for an existing ingestion config
        while True:
            log.info("VisStreamLauncher:on_start: Waiting for an ingestion configuration to be available.")
            ingestion_cfgs, _ = self.rrclient.find_resources(RT.IngestionConfiguration, None, None, True)

            if len(ingestion_cfgs) > 0:
                break
            else:
                gevent.sleep(1)
        """

        # Check whether the data product already exists in the system (e.g., when relaunching the code after a crash)
        dp_ids, _ = self.rrclient.find_resources(RT.DataProduct, None,
                                                 self.data_source_name, True)
        if len(dp_ids) > 0:
            data_product_id = dp_ids[0]
            print '>>>>>>>>>>>>> Found dp_id = ', data_product_id
        else:
            # Create InstrumentModel
            instModel_obj = IonObject(RT.InstrumentModel,
                                      name=self.data_source_name,
                                      description=self.data_source_name,
                                      model_label=self.data_source_name)
            instModel_id = self.imsclient.create_instrument_model(
                instModel_obj)

            # Create InstrumentDevice
            instDevice_obj = IonObject(RT.InstrumentDevice,
                                       name=self.data_source_name,
                                       description=self.data_source_name,
                                       serial_number="12345")

            instDevice_id = self.imsclient.create_instrument_device(
                instrument_device=instDevice_obj)
            self.imsclient.assign_instrument_model_to_instrument_device(
                instModel_id, instDevice_id)

            # create a stream definition for the data from the ctd simulator
            ctd_stream_def = SBE37_CDM_stream_definition()
            ctd_stream_def_id = self.pubsubclient.create_stream_definition(
                container=ctd_stream_def)

            print 'Creating new CDM data product with a stream definition'
            dp_obj = IonObject(RT.DataProduct,
                               name=self.data_source_name,
                               description='ctd stream test')
            data_product_id = self.dpclient.create_data_product(
                dp_obj, ctd_stream_def_id)

            self.damsclient.assign_data_product(
                input_resource_id=instDevice_id,
                data_product_id=data_product_id)
            self.dpclient.activate_data_product_persistence(
                data_product_id=data_product_id,
                persist_data=True,
                persist_metadata=True)

            print '>>>>>>>>>>>> New dp_id = ', data_product_id

        # Retrieve the id of the OUTPUT stream from the out Data Product
        stream_ids, _ = self.rrclient.find_objects(data_product_id,
                                                   PRED.hasStream, None, True)
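        # With the stream id in hand, spawn a simulated publisher process on it below:
        # the 'sinusoidal' dataset uses SinusoidalCtdPublisher, anything else SimpleCtdPublisher.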

        if self.dataset == 'sinusoidal':
            pid = self.container.spawn_process(
                name='ctd_test.' + self.data_source_name,
                module='ion.processes.data.sinusoidal_stream_publisher',
                cls='SinusoidalCtdPublisher',
                config={'process': {
                    'stream_id': stream_ids[0]
                }})
        else:
            pid = self.container.spawn_process(
                name='ctd_test.' + self.data_source_name,
                module='ion.processes.data.ctd_stream_publisher',
                cls='SimpleCtdPublisher',
                config={'process': {
                    'stream_id': stream_ids[0]
                }})
class TestDataProductManagementServiceIntegration(IonIntegrationTestCase):

    def setUp(self):
        # Start container
        #print 'instantiating container'
        self._start_container()

        self.container.start_rel_from_url('res/deploy/r2deploy.yml')

        self.dpsc_cli           = DataProductManagementServiceClient()
        self.rrclient           = ResourceRegistryServiceClient()
        self.damsclient         = DataAcquisitionManagementServiceClient()
        self.pubsubcli          = PubsubManagementServiceClient()
        self.ingestclient       = IngestionManagementServiceClient()
        self.process_dispatcher = ProcessDispatcherServiceClient()
        self.dataset_management = DatasetManagementServiceClient()
        self.unsc               = UserNotificationServiceClient()
        self.data_retriever     = DataRetrieverServiceClient()
        self.identcli           = IdentityManagementServiceClient()

        #------------------------------------------
        # Create the environment
        #------------------------------------------

        self.stream_def_id = self.pubsubcli.create_stream_definition(name='SBE37_CDM')

        self.process_definitions  = {}
        ingestion_worker_definition = ProcessDefinition(name='ingestion worker')
        ingestion_worker_definition.executable = {
            'module':'ion.processes.data.ingestion.science_granule_ingestion_worker',
            'class' :'ScienceGranuleIngestionWorker'
        }
        process_definition_id = self.process_dispatcher.create_process_definition(process_definition=ingestion_worker_definition)
        self.process_definitions['ingestion_worker'] = process_definition_id

        self.pids = []
        self.exchange_points = []
        self.exchange_names = []

        #------------------------------------------------------------------------------------------------
        # First launch the ingestors
        #------------------------------------------------------------------------------------------------
        self.exchange_space       = 'science_granule_ingestion'
        self.exchange_point       = 'science_data'
        config = DotDict()
        config.process.datastore_name = 'datasets'
        config.process.queue_name = self.exchange_space

        self.exchange_names.append(self.exchange_space)
        self.exchange_points.append(self.exchange_point)

        pid = self.process_dispatcher.schedule_process(self.process_definitions['ingestion_worker'],configuration=config)
        log.debug("the ingestion worker process id: %s", pid)
        self.pids.append(pid)

        self.addCleanup(self.cleaning_up)

    def cleaning_up(self):
        for pid in self.pids:
            log.debug("number of pids to be terminated: %s", len(self.pids))
            try:
                self.process_dispatcher.cancel_process(pid)
                log.debug("Terminated the process: %s", pid)
            except:
                log.debug("could not terminate the process id: %s" % pid)
        IngestionManagementIntTest.clean_subscriptions()

        for xn in self.exchange_names:
            xni = self.container.ex_manager.create_xn_queue(xn)
            xni.delete()
        for xp in self.exchange_points:
            xpi = self.container.ex_manager.create_xp(xp)
            xpi.delete()

    def get_datastore(self, dataset_id):
        dataset = self.dataset_management.read_dataset(dataset_id)
        datastore_name = dataset.datastore_name
        datastore = self.container.datastore_manager.get_datastore(datastore_name, DataStore.DS_PROFILE.SCIDATA)
        return datastore


    @attr('EXT')
    @attr('PREP')
    def test_create_data_product(self):

        #------------------------------------------------------------------------------------------------
        # create a stream definition for the data from the ctd simulator
        #------------------------------------------------------------------------------------------------
        parameter_dictionary = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict')
        ctd_stream_def_id = self.pubsubcli.create_stream_definition(name='Simulated CTD data', parameter_dictionary_id=parameter_dictionary._id)
        log.debug("Created stream def id %s" % ctd_stream_def_id)

        #------------------------------------------------------------------------------------------------
        # test creating a new data product w/o a stream definition
        #------------------------------------------------------------------------------------------------
        dp_obj = IonObject(RT.DataProduct,
            name='DP1',
            description='some new dp')

        dp_obj.geospatial_bounds.geospatial_latitude_limit_north = 10.0
        dp_obj.geospatial_bounds.geospatial_latitude_limit_south = -10.0
        dp_obj.geospatial_bounds.geospatial_longitude_limit_east = 10.0
        dp_obj.geospatial_bounds.geospatial_longitude_limit_west = -10.0
        dp_obj.ooi_product_name = "PRODNAME"

        #------------------------------------------------------------------------------------------------
        # Create a set of ParameterContext objects to define the parameters in the coverage, add each to the ParameterDictionary
        #------------------------------------------------------------------------------------------------

        dp_id = self.dpsc_cli.create_data_product( data_product= dp_obj,
                                            stream_definition_id=ctd_stream_def_id)
        # Assert that the data product has an associated stream at this stage
        stream_ids, _ = self.rrclient.find_objects(dp_id, PRED.hasStream, RT.Stream, True)
        self.assertNotEquals(len(stream_ids), 0)

        # Assert that the data product has an associated stream def at this stage
        stream_ids, _ = self.rrclient.find_objects(dp_id, PRED.hasStreamDefinition, RT.StreamDefinition, True)
        self.assertNotEquals(len(stream_ids), 0)

        self.dpsc_cli.activate_data_product_persistence(dp_id)

        dp_obj = self.dpsc_cli.read_data_product(dp_id)
        self.assertIsNotNone(dp_obj)
        self.assertEquals(dp_obj.geospatial_point_center.lat, 0.0)
        log.debug('Created data product %s', dp_obj)
        #------------------------------------------------------------------------------------------------
        # test creating a new data product with  a stream definition
        #------------------------------------------------------------------------------------------------
        log.debug('Creating new data product with a stream definition')
        dp_obj = IonObject(RT.DataProduct,
            name='DP2',
            description='some new dp')

        dp_id2 = self.dpsc_cli.create_data_product(dp_obj, ctd_stream_def_id)
        self.dpsc_cli.activate_data_product_persistence(dp_id2)
        log.debug('new dp_id = %s' % dp_id2)

        #------------------------------------------------------------------------------------------------
        #make sure data product is associated with stream def
        #------------------------------------------------------------------------------------------------
        streamdefs = []
        streams, _ = self.rrclient.find_objects(dp_id2, PRED.hasStream, RT.Stream, True)
        for s in streams:
            log.debug("Checking stream %s" % s)
            sdefs, _ = self.rrclient.find_objects(s, PRED.hasStreamDefinition, RT.StreamDefinition, True)
            for sd in sdefs:
                log.debug("Checking streamdef %s" % sd)
                streamdefs.append(sd)
        self.assertIn(ctd_stream_def_id, streamdefs)

        group_names = self.dpsc_cli.get_data_product_group_list()
        self.assertIn("PRODNAME", group_names)


        #----------------------------------------------------------------------------------------
        # Create users then notifications to this data product for each user
        #----------------------------------------------------------------------------------------

        # user_1
        user_1 = UserInfo()
        user_1.name = 'user_1'
        user_1.contact.email = '*****@*****.**'

        # user_2
        user_2 = UserInfo()
        user_2.name = 'user_2'
        user_2.contact.email = '*****@*****.**'
        #user1 is a complete user
        self.subject = "/DC=org/DC=cilogon/C=US/O=ProtectNetwork/CN=Roger Unwin A254"
        actor_identity_obj = IonObject("ActorIdentity", {"name": self.subject})
        actor_id = self.identcli.create_actor_identity(actor_identity_obj)

        user_credentials_obj = IonObject("UserCredentials", {"name": self.subject})
        self.identcli.register_user_credentials(actor_id, user_credentials_obj)
        user_id_1 = self.identcli.create_user_info(actor_id, user_1)
        user_id_2, _ = self.rrclient.create(user_2)
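        # user_1 is fully registered through identity management (actor identity plus
        # credentials), while user_2 is written directly to the resource registry.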

        delivery_config1a = IonObject(OT.DeliveryConfiguration, email='*****@*****.**', mode=DeliveryModeEnum.EMAIL, frequency=NotificationFrequencyEnum.BATCH)
        delivery_config1b = IonObject(OT.DeliveryConfiguration, email='*****@*****.**', mode=DeliveryModeEnum.EMAIL, frequency=NotificationFrequencyEnum.BATCH)
        notification_request_1 = NotificationRequest(   name = "notification_1",
            origin=dp_id,
            origin_type="type_1",
            event_type=OT.ResourceLifecycleEvent,
            disabled_by_system = False,
            delivery_configurations=[delivery_config1a, delivery_config1b])

        delivery_config2a = IonObject(OT.DeliveryConfiguration, email='*****@*****.**', mode=DeliveryModeEnum.EMAIL, frequency=NotificationFrequencyEnum.BATCH)
        delivery_config2b = IonObject(OT.DeliveryConfiguration, email='*****@*****.**', mode=DeliveryModeEnum.EMAIL, frequency=NotificationFrequencyEnum.BATCH)
        notification_request_2 = NotificationRequest(   name = "notification_2",
            origin=dp_id,
            origin_type="type_2",
            disabled_by_system = False,
            event_type=OT.DetectionEvent,
            delivery_configurations=[delivery_config2a, delivery_config2b])

        notification_request_1_id = self.unsc.create_notification(notification=notification_request_1, user_id=user_id_1)
        notification_request_2_id = self.unsc.create_notification(notification=notification_request_2, user_id=user_id_2)
        self.unsc.delete_notification(notification_request_1_id)
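        # Deleting notification_1 retires it; the extension checks further below expect one
        # active and one retired notification for this data product.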



        # test reading a non-existent data product
        log.debug('reading non-existent data product')

        with self.assertRaises(NotFound):
            dp_obj = self.dpsc_cli.read_data_product('some_fake_id')

        # update a data product (tests read also)
        log.debug('Updating data product')
        # first get the existing dp object
        dp_obj = self.dpsc_cli.read_data_product(dp_id)

        # now tweak the object
        dp_obj.description = 'the very first dp'
        dp_obj.geospatial_bounds.geospatial_latitude_limit_north = 20.0
        dp_obj.geospatial_bounds.geospatial_latitude_limit_south = -20.0
        dp_obj.geospatial_bounds.geospatial_longitude_limit_east = 20.0
        dp_obj.geospatial_bounds.geospatial_longitude_limit_west = -20.0
        # now write the dp back to the registry
        update_result = self.dpsc_cli.update_data_product(dp_obj)


        # now get the dp back to see if it was updated
        dp_obj = self.dpsc_cli.read_data_product(dp_id)
        self.assertEquals(dp_obj.description,'the very first dp')
        self.assertEquals(dp_obj.geospatial_point_center.lat, 0.0)
        log.debug('Updated data product %s', dp_obj)

        #test extension
        extended_product = self.dpsc_cli.get_data_product_extension(dp_id)
        #validate that there is one active and one retired user notification for this data product
        self.assertEqual(1, len(extended_product.computed.active_user_subscriptions.value))
        self.assertEqual(1, len(extended_product.computed.past_user_subscriptions.value))

        self.assertEqual(dp_id, extended_product._id)
        self.assertEqual(ComputedValueAvailability.PROVIDED,
                         extended_product.computed.product_download_size_estimated.status)
        self.assertEqual(0, extended_product.computed.product_download_size_estimated.value)

        self.assertEqual(ComputedValueAvailability.PROVIDED,
                         extended_product.computed.parameters.status)
        #log.debug("test_create_data_product: parameters %s" % extended_product.computed.parameters.value)


        def ion_object_encoder(obj):
            return obj.__dict__


        #test prepare for create
        data_product_data = self.dpsc_cli.prepare_data_product_support()

        #print simplejson.dumps(data_product_data, default=ion_object_encoder, indent= 2)

        self.assertEqual(data_product_data._id, "")
        self.assertEqual(data_product_data.type_, OT.DataProductPrepareSupport)
        self.assertEqual(len(data_product_data.associations['StreamDefinition'].resources), 2)
        self.assertEqual(len(data_product_data.associations['Dataset'].resources), 0)
        self.assertEqual(len(data_product_data.associations['StreamDefinition'].associated_resources), 0)
        self.assertEqual(len(data_product_data.associations['Dataset'].associated_resources), 0)

        #test prepare for update
        data_product_data = self.dpsc_cli.prepare_data_product_support(dp_id)

        #print simplejson.dumps(data_product_data, default=ion_object_encoder, indent= 2)

        self.assertEqual(data_product_data._id, dp_id)
        self.assertEqual(data_product_data.type_, OT.DataProductPrepareSupport)
        self.assertEqual(len(data_product_data.associations['StreamDefinition'].resources), 2)

        self.assertEqual(len(data_product_data.associations['Dataset'].resources), 1)

        self.assertEqual(len(data_product_data.associations['StreamDefinition'].associated_resources), 1)
        self.assertEqual(data_product_data.associations['StreamDefinition'].associated_resources[0].s, dp_id)

        self.assertEqual(len(data_product_data.associations['Dataset'].associated_resources), 1)
        self.assertEqual(data_product_data.associations['Dataset'].associated_resources[0].s, dp_id)

        # now 'delete' the data product
        log.debug("deleting data product: %s" % dp_id)
        self.dpsc_cli.delete_data_product(dp_id)

        # Assert that there are no associated streams leftover after deleting the data product
        stream_ids, assoc_ids = self.rrclient.find_objects(dp_id, PRED.hasStream, RT.Stream, True)
        self.assertEquals(len(stream_ids), 0)
        self.assertEquals(len(assoc_ids), 0)

        self.dpsc_cli.force_delete_data_product(dp_id)

        # now try to get the deleted dp object
        with self.assertRaises(NotFound):
            dp_obj = self.dpsc_cli.read_data_product(dp_id)

        # Get the events corresponding to the data product
        ret = self.unsc.get_recent_events(resource_id=dp_id)
        events = ret.value

        for event in events:
            log.debug("event time: %s" % event.ts_created)

        self.assertTrue(len(events) > 0)

    def test_data_product_stream_def(self):
        pdict_id = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict', id_only=True)
        ctd_stream_def_id = self.pubsubcli.create_stream_definition(name='Simulated CTD data', parameter_dictionary_id=pdict_id)


        dp_obj = IonObject(RT.DataProduct,
            name='DP1',
            description='some new dp')
        dp_id = self.dpsc_cli.create_data_product(data_product= dp_obj,
            stream_definition_id=ctd_stream_def_id)

        stream_def_id = self.dpsc_cli.get_data_product_stream_definition(dp_id)
        self.assertEquals(ctd_stream_def_id, stream_def_id)


    def test_derived_data_product(self):
        pdict_id = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict', id_only=True)
        ctd_stream_def_id = self.pubsubcli.create_stream_definition(name='ctd parsed', parameter_dictionary_id=pdict_id)
        self.addCleanup(self.pubsubcli.delete_stream_definition, ctd_stream_def_id)


        dp = DataProduct(name='Instrument DP')
        dp_id = self.dpsc_cli.create_data_product(dp, stream_definition_id=ctd_stream_def_id)
        self.addCleanup(self.dpsc_cli.force_delete_data_product, dp_id)

        self.dpsc_cli.activate_data_product_persistence(dp_id)
        self.addCleanup(self.dpsc_cli.suspend_data_product_persistence, dp_id)


        dataset_ids, _ = self.rrclient.find_objects(subject=dp_id, predicate=PRED.hasDataset, id_only=True)
        if not dataset_ids:
            raise NotFound("Data Product %s dataset  does not exist" % str(dp_id))
        dataset_id = dataset_ids[0]
        
        # Make the derived data product
        simple_stream_def_id = self.pubsubcli.create_stream_definition(name='TEMPWAT stream def', parameter_dictionary_id=pdict_id, available_fields=['time','temp'])
        tempwat_dp = DataProduct(name='TEMPWAT', category=DataProductTypeEnum.DERIVED)
        tempwat_dp_id = self.dpsc_cli.create_data_product(tempwat_dp, stream_definition_id=simple_stream_def_id, parent_data_product_id=dp_id)
        self.addCleanup(self.dpsc_cli.delete_data_product, tempwat_dp_id)
        # Check that the streams associated with the data product are persisted
        stream_ids, _ =  self.rrclient.find_objects(dp_id,PRED.hasStream,RT.Stream,True)
        for stream_id in stream_ids:
            self.assertTrue(self.ingestclient.is_persisted(stream_id))

        stream_id = stream_ids[0]
        route = self.pubsubcli.read_stream_route(stream_id=stream_id)

        rdt = RecordDictionaryTool(stream_definition_id=ctd_stream_def_id)
        rdt['time'] = np.arange(20)
        rdt['temp'] = np.arange(20)
        rdt['pressure'] = np.arange(20)

        publisher = StandaloneStreamPublisher(stream_id,route)
        
        dataset_modified = Event()
        def cb(*args, **kwargs):
            dataset_modified.set()
        es = EventSubscriber(event_type=OT.DatasetModified, callback=cb, origin=dataset_id, auto_delete=True)
        es.start()
        self.addCleanup(es.stop)

        publisher.publish(rdt.to_granule())

        self.assertTrue(dataset_modified.wait(30))

        tempwat_dataset_ids, _ = self.rrclient.find_objects(tempwat_dp_id, PRED.hasDataset, id_only=True)
        tempwat_dataset_id = tempwat_dataset_ids[0]
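        # Retrieve the derived product's dataset with the narrowed stream definition so
        # only the 'time' and 'temp' fields come back.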
        granule = self.data_retriever.retrieve(tempwat_dataset_id, delivery_format=simple_stream_def_id)
        rdt = RecordDictionaryTool.load_from_granule(granule)
        np.testing.assert_array_equal(rdt['time'], np.arange(20))
        self.assertEquals(set(rdt.fields), set(['time','temp']))


    def test_activate_suspend_data_product(self):

        #------------------------------------------------------------------------------------------------
        # create a stream definition for the data from the ctd simulator
        #------------------------------------------------------------------------------------------------
        pdict_id = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict', id_only=True)
        ctd_stream_def_id = self.pubsubcli.create_stream_definition(name='Simulated CTD data', parameter_dictionary_id=pdict_id)
        log.debug("Created stream def id %s" % ctd_stream_def_id)

        #------------------------------------------------------------------------------------------------
        # test creating a new data product w/o a stream definition
        #------------------------------------------------------------------------------------------------
        # Construct temporal and spatial Coordinate Reference System objects

        dp_obj = IonObject(RT.DataProduct,
            name='DP1',
            description='some new dp')

        log.debug("Created an IonObject for a data product: %s" % dp_obj)

        #------------------------------------------------------------------------------------------------
        # Create a set of ParameterContext objects to define the parameters in the coverage, add each to the ParameterDictionary
        #------------------------------------------------------------------------------------------------

        dp_id = self.dpsc_cli.create_data_product(data_product= dp_obj,
            stream_definition_id=ctd_stream_def_id)

        #------------------------------------------------------------------------------------------------
        # Subscribe to persist events
        #------------------------------------------------------------------------------------------------
        queue = gevent.queue.Queue()
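        # Collect InformationContentStatusEvents published for this data product; the end
        # of the test asserts that four of them are received.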

        def info_event_received(message, headers):
            queue.put(message)

        es = EventSubscriber(event_type=OT.InformationContentStatusEvent, callback=info_event_received, origin=dp_id, auto_delete=True)
        es.start()
        self.addCleanup(es.stop)


        #------------------------------------------------------------------------------------------------
        # test activate and suspend data product persistence
        #------------------------------------------------------------------------------------------------
        self.dpsc_cli.activate_data_product_persistence(dp_id)
        
        dp_obj = self.dpsc_cli.read_data_product(dp_id)
        self.assertIsNotNone(dp_obj)

        dataset_ids, _ = self.rrclient.find_objects(subject=dp_id, predicate=PRED.hasDataset, id_only=True)
        if not dataset_ids:
            raise NotFound("Data Product %s dataset  does not exist" % str(dp_id))
        dataset_id = dataset_ids[0]


        # Check that the streams associated with the data product are persisted
        stream_ids, _ =  self.rrclient.find_objects(dp_id,PRED.hasStream,RT.Stream,True)
        for stream_id in stream_ids:
            self.assertTrue(self.ingestclient.is_persisted(stream_id))

        stream_id = stream_ids[0]
        route = self.pubsubcli.read_stream_route(stream_id=stream_id)

        rdt = RecordDictionaryTool(stream_definition_id=ctd_stream_def_id)
        rdt['time'] = np.arange(20)
        rdt['temp'] = np.arange(20)

        publisher = StandaloneStreamPublisher(stream_id,route)
        
        dataset_modified = Event()
        def cb(*args, **kwargs):
            dataset_modified.set()
        es = EventSubscriber(event_type=OT.DatasetModified, callback=cb, origin=dataset_id, auto_delete=True)
        es.start()
        self.addCleanup(es.stop)

        publisher.publish(rdt.to_granule())

        self.assertTrue(dataset_modified.wait(30))

        #--------------------------------------------------------------------------------
        # Now get the data in one chunk using an RPC call to the data retriever
        #--------------------------------------------------------------------------------

        replay_data = self.data_retriever.retrieve(dataset_ids[0])
        self.assertIsInstance(replay_data, Granule)

        log.debug("The data retriever was able to replay the dataset that was attached to the data product "
                  "we wanted to be persisted. Therefore the data product was indeed persisted with "
                  "otherwise we could not have retrieved its dataset using the data retriever. Therefore "
                  "this demonstration shows that L4-CI-SA-RQ-267 is satisfied: 'Data product management shall persist data products'")

        data_product_object = self.rrclient.read(dp_id)
        self.assertEquals(data_product_object.name,'DP1')
        self.assertEquals(data_product_object.description,'some new dp')

        log.debug("Towards L4-CI-SA-RQ-308: 'Data product management shall persist data product metadata'. "
                  " Attributes in create for the data product obj, name= '%s', description='%s', match those of object from the "
                  "resource registry, name='%s', desc='%s'" % (dp_obj.name, dp_obj.description,data_product_object.name,
                                                           data_product_object.description))

        #------------------------------------------------------------------------------------------------
        # test suspend data product persistence
        #------------------------------------------------------------------------------------------------
        self.dpsc_cli.suspend_data_product_persistence(dp_id)


        dataset_modified.clear()

        rdt['time'] = np.arange(20,40)

        publisher.publish(rdt.to_granule())
        self.assertFalse(dataset_modified.wait(2))

        self.dpsc_cli.activate_data_product_persistence(dp_id)
        dataset_modified.clear()

        publisher.publish(rdt.to_granule())
        self.assertTrue(dataset_modified.wait(30))

        granule = self.data_retriever.retrieve(dataset_id)
        rdt = RecordDictionaryTool.load_from_granule(granule)
        np.testing.assert_array_almost_equal(rdt['time'], np.arange(40))


        dataset_ids, _ = self.rrclient.find_objects(dp_id, PRED.hasDataset, id_only=True)
        self.assertEquals(len(dataset_ids), 1)

        self.dpsc_cli.suspend_data_product_persistence(dp_id)
        self.dpsc_cli.force_delete_data_product(dp_id)
        # now try to get the deleted dp object

        with self.assertRaises(NotFound):
            dp_obj = self.rrclient.read(dp_id)


        info_event_counter = 0
        runtime = 0
        starttime = time.time()
        caught_events = []

        # check that the four InformationContentStatusEvents were received
        while info_event_counter < 4 and runtime < 60:
            a = queue.get(timeout=60)
            caught_events.append(a)
            info_event_counter += 1
            runtime = time.time() - starttime

        self.assertEquals(info_event_counter, 4)
    def test_usgs_integration(self):
        '''
        test_usgs_integration
        Test full DM Services Integration using usgs
        '''
        cc = self.container
        assertions = self.assertTrue

        #-----------------------------
        # Copy below here
        #-----------------------------
        pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
        ingestion_management_service = IngestionManagementServiceClient(node=cc.node)
        dataset_management_service = DatasetManagementServiceClient(node=cc.node)
        data_retriever_service = DataRetrieverServiceClient(node=cc.node)
        transform_management_service = TransformManagementServiceClient(node=cc.node)
        process_dispatcher = ProcessDispatcherServiceClient(node=cc.node)

        process_list = []
        datasets = []

        datastore_name = 'test_usgs_integration'


        #---------------------------
        # Set up ingestion
        #---------------------------
        # Configure ingestion using eight workers, ingesting to the test_usgs_integration datastore with the SCIDATA profile
        log.debug('Calling create_ingestion_configuration')
        ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
            exchange_point_id='science_data',
            couch_storage=CouchStorage(datastore_name=datastore_name,datastore_profile='SCIDATA'),
            number_of_workers=8
        )
        #
        ingestion_management_service.activate_ingestion_configuration(
            ingestion_configuration_id=ingestion_configuration_id)

        usgs_stream_def = USGS_stream_definition()

        stream_def_id = pubsub_management_service.create_stream_definition(container=usgs_stream_def, name='Junk definition')


        #---------------------------
        # Set up the producers (USGS stream publishers)
        #---------------------------
        # Launch two simulated USGS producers
        for iteration in xrange(2):
            # Make a stream to output on

            stream_id = pubsub_management_service.create_stream(stream_definition_id=stream_def_id)

            #---------------------------
            # Set up the datasets
            #---------------------------
            dataset_id = dataset_management_service.create_dataset(
                stream_id=stream_id,
                datastore_name=datastore_name,
                view_name='datasets/stream_join_granule'
            )
            # Keep track of the datasets
            datasets.append(dataset_id)

            stream_policy_id = ingestion_management_service.create_dataset_configuration(
                dataset_id = dataset_id,
                archive_data = True,
                archive_metadata = True,
                ingestion_configuration_id = ingestion_configuration_id
            )
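            # Each dataset gets its own ingestion configuration so that both its data and
            # its metadata are archived.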


            producer_definition = ProcessDefinition()
            producer_definition.executable = {
                'module':'eoi.agent.handler.usgs_stream_publisher',
                'class':'UsgsPublisher'
            }
            configuration = {
                'process':{
                    'stream_id':stream_id,
                    }
            }
            procdef_id = process_dispatcher.create_process_definition(process_definition=producer_definition)
            log.debug('LUKE_DEBUG: procdef_id: %s', procdef_id)
            pid = process_dispatcher.schedule_process(process_definition_id=procdef_id, configuration=configuration)


            # Keep track, we'll kill 'em later.
            process_list.append(pid)
            # Get about 4 seconds of data
        time.sleep(4)

        #---------------------------
        # Stop producing data
        #---------------------------

        for process in process_list:
            process_dispatcher.cancel_process(process)

        #----------------------------------------------
        # The replay and the transform, a love story.
        #----------------------------------------------
        # Happy Valentines to the clever coder who catches the above!

        transform_definition = ProcessDefinition()
        transform_definition.executable = {
            'module':'ion.processes.data.transforms.transform_example',
            'class':'TransformCapture'
        }
        transform_definition_id = process_dispatcher.create_process_definition(process_definition=transform_definition)

        dataset_id = datasets.pop() # Just need one for now
        replay_id, stream_id = data_retriever_service.define_replay(dataset_id=dataset_id)
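        # define_replay creates a replay process for the dataset and returns the stream it
        # will publish on; the capture transform below subscribes to that stream.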

        #--------------------------------------------
        # I'm Selling magazine subscriptions here!
        #--------------------------------------------

        subscription = pubsub_management_service.create_subscription(query=StreamQuery(stream_ids=[stream_id]),
            exchange_name='transform_capture_point')

        #--------------------------------------------
        # Start the transform (capture)
        #--------------------------------------------
        transform_id = transform_management_service.create_transform(
            name='capture_transform',
            in_subscription_id=subscription,
            process_definition_id=transform_definition_id
        )

        transform_management_service.activate_transform(transform_id=transform_id)

        #--------------------------------------------
        # BEGIN REPLAY!
        #--------------------------------------------

        data_retriever_service.start_replay(replay_id=replay_id)

        #--------------------------------------------
        # Lets get some boundaries
        #--------------------------------------------

        bounds = dataset_management_service.get_dataset_bounds(dataset_id=dataset_id)
    def test_replay_integration(self):
        '''
        test_replay_integration
        '''
        import numpy as np
        # Keep this import; it's used in the vector comparison below even though PyCharm flags it as unused.

        cc = self.container
        XP = self.XP
        assertions = self.assertTrue

        ### Every thing below here can be run as a script:
        log.debug('Got it')

        pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
        ingestion_management_service = IngestionManagementServiceClient(node=cc.node)
        dataset_management_service = DatasetManagementServiceClient(node=cc.node)
        data_retriever_service = DataRetrieverServiceClient(node=cc.node)

        datastore_name = 'dm_test_replay_integration'

        producer = Publisher(name=(XP,'stream producer'))

        ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
            exchange_point_id=XP,
            couch_storage=CouchStorage(datastore_name=datastore_name,datastore_profile='SCIDATA'),
            hdf_storage=HdfStorage(),
            number_of_workers=1
        )

        ingestion_management_service.activate_ingestion_configuration(
            ingestion_configuration_id=ingestion_configuration_id
        )

        definition = SBE37_CDM_stream_definition()
        data_stream_id = definition.data_stream_id
        encoding_id = definition.identifiables[data_stream_id].encoding_id
        element_count_id = definition.identifiables[data_stream_id].element_count_id

        stream_def_id = pubsub_management_service.create_stream_definition(
            container=definition
        )
        stream_id = pubsub_management_service.create_stream(
            stream_definition_id=stream_def_id
        )

        dataset_id = dataset_management_service.create_dataset(
            stream_id=stream_id,
            datastore_name=datastore_name,
            view_name='datasets/dataset_by_id'
        )
        ingestion_management_service.create_dataset_configuration(
            dataset_id=dataset_id,
            archive_data=True,
            archive_metadata=True,
            ingestion_configuration_id = ingestion_configuration_id
        )
        definition.stream_resource_id = stream_id

        packet = _create_packet(definition)
        input_file = FileSystem.mktemp()
        input_file.write(packet.identifiables[data_stream_id].values)
        input_file_path = input_file.name
        input_file.close()
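        # Write the packet's HDF payload to a temp file so the same fields can be read back
        # with acquire_data and compared against the replayed output in sub_listen.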

        fields=[
            'conductivity',
            'height',
            'latitude',
            'longitude',
            'pressure',
            'temperature',
            'time'
        ]

        input_vectors = acquire_data([input_file_path], fields, 2).next()

        producer.publish(msg=packet, to_name=(XP,'%s.data' % stream_id))

        replay_id, replay_stream_id = data_retriever_service.define_replay(dataset_id)
        ar = gevent.event.AsyncResult()
        def sub_listen(msg, headers):

            assertions(isinstance(msg,StreamGranuleContainer),'replayed message is not a granule.')
            hdf_string = msg.identifiables[data_stream_id].values
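            # Verify payload integrity: the SHA1 of the replayed HDF string must match the
            # checksum carried in the granule's encoding metadata.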
            sha1 = hashlib.sha1(hdf_string).hexdigest().upper()
            assertions(sha1 == msg.identifiables[encoding_id].sha1,'Checksum failed.')
            assertions(msg.identifiables[element_count_id].value==1, 'record replay count is incorrect %d.' % msg.identifiables[element_count_id].value)
            output_file = FileSystem.mktemp()
            output_file.write(msg.identifiables[data_stream_id].values)
            output_file_path = output_file.name
            output_file.close()
            output_vectors = acquire_data([output_file_path],fields,2).next()
            for field in fields:
                comparison = (input_vectors[field]['values']==output_vectors[field]['values'])
                assertions(comparison.all(), 'vector mismatch: %s vs %s' %
                                             (input_vectors[field]['values'],output_vectors[field]['values']))
            FileSystem.unlink(output_file_path)
            ar.set(True)

        subscriber = Subscriber(name=(XP,'replay listener'),callback=sub_listen)

        g = gevent.Greenlet(subscriber.listen, binding='%s.data' % replay_stream_id)
        g.start()

        data_retriever_service.start_replay(replay_id)

        ar.get(timeout=10)

        FileSystem.unlink(input_file_path)
Example #33
    def setUp(self):
        # Start container
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2deploy.yml')
        self.ingestion_management = IngestionManagementServiceClient()
        self.rr = self.container.resource_registry
class TestDataProductManagementServiceIntegration(IonIntegrationTestCase):

    def setUp(self):
        # Start container
        #print 'instantiating container'
        self._start_container()

        self.container.start_rel_from_url('res/deploy/r2deploy.yml')

        self.dpsc_cli           = DataProductManagementServiceClient()
        self.rrclient           = ResourceRegistryServiceClient()
        self.damsclient         = DataAcquisitionManagementServiceClient()
        self.pubsubcli          = PubsubManagementServiceClient()
        self.ingestclient       = IngestionManagementServiceClient()
        self.process_dispatcher = ProcessDispatcherServiceClient()
        self.dataset_management = DatasetManagementServiceClient()
        self.unsc               = UserNotificationServiceClient()
        self.data_retriever     = DataRetrieverServiceClient()

        #------------------------------------------
        # Create the environment
        #------------------------------------------

        datastore_name = CACHE_DATASTORE_NAME
        self.db = self.container.datastore_manager.get_datastore(datastore_name)
        self.stream_def_id = self.pubsubcli.create_stream_definition(name='SBE37_CDM')

        self.process_definitions  = {}
        ingestion_worker_definition = ProcessDefinition(name='ingestion worker')
        ingestion_worker_definition.executable = {
            'module':'ion.processes.data.ingestion.science_granule_ingestion_worker',
            'class' :'ScienceGranuleIngestionWorker'
        }
        process_definition_id = self.process_dispatcher.create_process_definition(process_definition=ingestion_worker_definition)
        self.process_definitions['ingestion_worker'] = process_definition_id

        self.pids = []
        self.exchange_points = []
        self.exchange_names = []

        #------------------------------------------------------------------------------------------------
        # First launch the ingestors
        #------------------------------------------------------------------------------------------------
        self.exchange_space       = 'science_granule_ingestion'
        self.exchange_point       = 'science_data'
        config = DotDict()
        config.process.datastore_name = 'datasets'
        config.process.queue_name = self.exchange_space

        self.exchange_names.append(self.exchange_space)
        self.exchange_points.append(self.exchange_point)

        pid = self.process_dispatcher.schedule_process(self.process_definitions['ingestion_worker'],configuration=config)
        log.debug("the ingestion worker process id: %s", pid)
        self.pids.append(pid)

        self.addCleanup(self.cleaning_up)

    def cleaning_up(self):
        for pid in self.pids:
            log.debug("number of pids to be terminated: %s", len(self.pids))
            try:
                self.process_dispatcher.cancel_process(pid)
                log.debug("Terminated the process: %s", pid)
            except:
                log.debug("could not terminate the process id: %s" % pid)
        IngestionManagementIntTest.clean_subscriptions()

        for xn in self.exchange_names:
            xni = self.container.ex_manager.create_xn_queue(xn)
            xni.delete()
        for xp in self.exchange_points:
            xpi = self.container.ex_manager.create_xp(xp)
            xpi.delete()

    def get_datastore(self, dataset_id):
        dataset = self.dataset_management.read_dataset(dataset_id)
        datastore_name = dataset.datastore_name
        datastore = self.container.datastore_manager.get_datastore(datastore_name, DataStore.DS_PROFILE.SCIDATA)
        return datastore


    @attr('EXT')
    @attr('PREP')
    def test_create_data_product(self):

        #------------------------------------------------------------------------------------------------
        # create a stream definition for the data from the ctd simulator
        #------------------------------------------------------------------------------------------------
        parameter_dictionary = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict')
        ctd_stream_def_id = self.pubsubcli.create_stream_definition(name='Simulated CTD data', parameter_dictionary_id=parameter_dictionary._id)
        log.debug("Created stream def id %s" % ctd_stream_def_id)

        #------------------------------------------------------------------------------------------------
        # test creating a new data product w/o a stream definition
        #------------------------------------------------------------------------------------------------

        # Generic time-series data domain creation
        tdom, sdom = time_series_domain()



        dp_obj = IonObject(RT.DataProduct,
            name='DP1',
            description='some new dp',
            temporal_domain = tdom.dump(), 
            spatial_domain = sdom.dump())

        dp_obj.geospatial_bounds.geospatial_latitude_limit_north = 10.0
        dp_obj.geospatial_bounds.geospatial_latitude_limit_south = -10.0
        dp_obj.geospatial_bounds.geospatial_longitude_limit_east = 10.0
        dp_obj.geospatial_bounds.geospatial_longitude_limit_west = -10.0
        dp_obj.ooi_product_name = "PRODNAME"

        #------------------------------------------------------------------------------------------------
        # Create a set of ParameterContext objects to define the parameters in the coverage, add each to the ParameterDictionary
        #------------------------------------------------------------------------------------------------

        dp_id = self.dpsc_cli.create_data_product( data_product= dp_obj,
                                            stream_definition_id=ctd_stream_def_id)
        # Assert that the data product has an associated stream at this stage
        stream_ids, _ = self.rrclient.find_objects(dp_id, PRED.hasStream, RT.Stream, True)
        self.assertNotEquals(len(stream_ids), 0)

        # Assert that the data product has an associated stream def at this stage
        stream_ids, _ = self.rrclient.find_objects(dp_id, PRED.hasStreamDefinition, RT.StreamDefinition, True)
        self.assertNotEquals(len(stream_ids), 0)

        self.dpsc_cli.activate_data_product_persistence(dp_id)

        dp_obj = self.dpsc_cli.read_data_product(dp_id)
        self.assertIsNotNone(dp_obj)
        self.assertEquals(dp_obj.geospatial_point_center.lat, 0.0)
        log.debug('Created data product %s', dp_obj)
        #------------------------------------------------------------------------------------------------
        # test creating a new data product with  a stream definition
        #------------------------------------------------------------------------------------------------
        log.debug('Creating new data product with a stream definition')
        dp_obj = IonObject(RT.DataProduct,
            name='DP2',
            description='some new dp',
            temporal_domain = tdom.dump(),
            spatial_domain = sdom.dump())

        dp_id2 = self.dpsc_cli.create_data_product(dp_obj, ctd_stream_def_id)
        self.dpsc_cli.activate_data_product_persistence(dp_id2)
        log.debug('new dp_id = %s' % dp_id2)

        #------------------------------------------------------------------------------------------------
        #make sure data product is associated with stream def
        #------------------------------------------------------------------------------------------------
        streamdefs = []
        streams, _ = self.rrclient.find_objects(dp_id2, PRED.hasStream, RT.Stream, True)
        for s in streams:
            log.debug("Checking stream %s" % s)
            sdefs, _ = self.rrclient.find_objects(s, PRED.hasStreamDefinition, RT.StreamDefinition, True)
            for sd in sdefs:
                log.debug("Checking streamdef %s" % sd)
                streamdefs.append(sd)
        self.assertIn(ctd_stream_def_id, streamdefs)

        group_names = self.dpsc_cli.get_data_product_group_list()
        self.assertIn("PRODNAME", group_names)


        # test reading a non-existent data product
        log.debug('reading non-existent data product')

        with self.assertRaises(NotFound):
            dp_obj = self.dpsc_cli.read_data_product('some_fake_id')

        # update a data product (tests read also)
        log.debug('Updating data product')
        # first get the existing dp object
        dp_obj = self.dpsc_cli.read_data_product(dp_id)

        # now tweak the object
        dp_obj.description = 'the very first dp'
        dp_obj.geospatial_bounds.geospatial_latitude_limit_north = 20.0
        dp_obj.geospatial_bounds.geospatial_latitude_limit_south = -20.0
        dp_obj.geospatial_bounds.geospatial_longitude_limit_east = 20.0
        dp_obj.geospatial_bounds.geospatial_longitude_limit_west = -20.0
        # now write the dp back to the registry
        update_result = self.dpsc_cli.update_data_product(dp_obj)


        # now get the dp back to see if it was updated
        dp_obj = self.dpsc_cli.read_data_product(dp_id)
        self.assertEquals(dp_obj.description,'the very first dp')
        self.assertEquals(dp_obj.geospatial_point_center.lat, 0.0)
        log.debug('Updated data product %s', dp_obj)

        #test extension
        extended_product = self.dpsc_cli.get_data_product_extension(dp_id)
        self.assertEqual(dp_id, extended_product._id)
        self.assertEqual(ComputedValueAvailability.PROVIDED,
                         extended_product.computed.product_download_size_estimated.status)
        self.assertEqual(0, extended_product.computed.product_download_size_estimated.value)

        self.assertEqual(ComputedValueAvailability.PROVIDED,
                         extended_product.computed.parameters.status)
        #log.debug("test_create_data_product: parameters %s" % extended_product.computed.parameters.value)


        def ion_object_encoder(obj):
            return obj.__dict__


        #test prepare for create
        data_product_data = self.dpsc_cli.prepare_data_product_support()

        #print simplejson.dumps(data_product_data, default=ion_object_encoder, indent= 2)

        self.assertEqual(data_product_data._id, "")
        self.assertEqual(data_product_data.type_, OT.DataProductPrepareSupport)
        self.assertEqual(len(data_product_data.associations['StreamDefinition'].resources), 2)
        self.assertEqual(len(data_product_data.associations['Dataset'].resources), 0)
        self.assertEqual(len(data_product_data.associations['StreamDefinition'].associated_resources), 0)
        self.assertEqual(len(data_product_data.associations['Dataset'].associated_resources), 0)

        #test prepare for update
        data_product_data = self.dpsc_cli.prepare_data_product_support(dp_id)

        #print simplejson.dumps(data_product_data, default=ion_object_encoder, indent= 2)

        self.assertEqual(data_product_data._id, dp_id)
        self.assertEqual(data_product_data.type_, OT.DataProductPrepareSupport)
        self.assertEqual(len(data_product_data.associations['StreamDefinition'].resources), 2)

        self.assertEqual(len(data_product_data.associations['Dataset'].resources), 1)

        self.assertEqual(len(data_product_data.associations['StreamDefinition'].associated_resources), 1)
        self.assertEqual(data_product_data.associations['StreamDefinition'].associated_resources[0].s, dp_id)

        self.assertEqual(len(data_product_data.associations['Dataset'].associated_resources), 1)
        self.assertEqual(data_product_data.associations['Dataset'].associated_resources[0].s, dp_id)

        # now 'delete' the data product
        log.debug("deleting data product: %s" % dp_id)
        self.dpsc_cli.delete_data_product(dp_id)

        # Assert that there are no associated streams leftover after deleting the data product
        stream_ids, assoc_ids = self.rrclient.find_objects(dp_id, PRED.hasStream, RT.Stream, True)
        self.assertEquals(len(stream_ids), 0)
        self.assertEquals(len(assoc_ids), 0)

        self.dpsc_cli.force_delete_data_product(dp_id)

        # now try to get the deleted dp object
        with self.assertRaises(NotFound):
            dp_obj = self.dpsc_cli.read_data_product(dp_id)

        # Get the events corresponding to the data product
        ret = self.unsc.get_recent_events(resource_id=dp_id)
        events = ret.value

        for event in events:
            log.debug("event time: %s" % event.ts_created)

        self.assertTrue(len(events) > 0)

    def test_data_product_stream_def(self):
        pdict_id = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict', id_only=True)
        ctd_stream_def_id = self.pubsubcli.create_stream_definition(name='Simulated CTD data', parameter_dictionary_id=pdict_id)

        tdom, sdom = time_series_domain()

        sdom = sdom.dump()
        tdom = tdom.dump()

        dp_obj = IonObject(RT.DataProduct,
            name='DP1',
            description='some new dp',
            temporal_domain = tdom,
            spatial_domain = sdom)
        dp_id = self.dpsc_cli.create_data_product(data_product= dp_obj,
            stream_definition_id=ctd_stream_def_id)

        stream_def_id = self.dpsc_cli.get_data_product_stream_definition(dp_id)
        self.assertEquals(ctd_stream_def_id, stream_def_id)


    def test_derived_data_product(self):
        pdict_id = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict', id_only=True)
        ctd_stream_def_id = self.pubsubcli.create_stream_definition(name='ctd parsed', parameter_dictionary_id=pdict_id)
        self.addCleanup(self.pubsubcli.delete_stream_definition, ctd_stream_def_id)

        tdom, sdom = time_series_domain()

        dp = DataProduct(name='Instrument DP', temporal_domain=tdom.dump(), spatial_domain=sdom.dump())
        dp_id = self.dpsc_cli.create_data_product(dp, stream_definition_id=ctd_stream_def_id)
        self.addCleanup(self.dpsc_cli.force_delete_data_product, dp_id)

        self.dpsc_cli.activate_data_product_persistence(dp_id)
        self.addCleanup(self.dpsc_cli.suspend_data_product_persistence, dp_id)


        dataset_ids, _ = self.rrclient.find_objects(subject=dp_id, predicate=PRED.hasDataset, id_only=True)
        if not dataset_ids:
            raise NotFound("Data Product %s dataset  does not exist" % str(dp_id))
        dataset_id = dataset_ids[0]
        
        # Make the derived data product
        simple_stream_def_id = self.pubsubcli.create_stream_definition(name='TEMPWAT stream def', parameter_dictionary_id=pdict_id, available_fields=['time','temp'])
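        # The derived TEMPWAT product reuses the parent's data but exposes only the 'time'
        # and 'temp' fields; the parent link is made via parent_data_product_id below.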
        tempwat_dp = DataProduct(name='TEMPWAT')
        tempwat_dp_id = self.dpsc_cli.create_data_product(tempwat_dp, stream_definition_id=simple_stream_def_id, parent_data_product_id=dp_id)
        self.addCleanup(self.dpsc_cli.delete_data_product, tempwat_dp_id)
        # Check that the streams associated with the data product are persisted
        stream_ids, _ =  self.rrclient.find_objects(dp_id,PRED.hasStream,RT.Stream,True)
        for stream_id in stream_ids:
            self.assertTrue(self.ingestclient.is_persisted(stream_id))

        stream_id = stream_ids[0]
        route = self.pubsubcli.read_stream_route(stream_id=stream_id)

        rdt = RecordDictionaryTool(stream_definition_id=ctd_stream_def_id)
        rdt['time'] = np.arange(20)
        rdt['temp'] = np.arange(20)
        rdt['pressure'] = np.arange(20)

        publisher = StandaloneStreamPublisher(stream_id,route)
        
        dataset_modified = Event()
        def cb(*args, **kwargs):
            dataset_modified.set()
        es = EventSubscriber(event_type=OT.DatasetModified, callback=cb, origin=dataset_id, auto_delete=True)
        es.start()
        self.addCleanup(es.stop)

        publisher.publish(rdt.to_granule())

        self.assertTrue(dataset_modified.wait(30))

        tempwat_dataset_ids, _ = self.rrclient.find_objects(tempwat_dp_id, PRED.hasDataset, id_only=True)
        tempwat_dataset_id = tempwat_dataset_ids[0]
        granule = self.data_retriever.retrieve(tempwat_dataset_id, delivery_format=simple_stream_def_id)
        rdt = RecordDictionaryTool.load_from_granule(granule)
        np.testing.assert_array_equal(rdt['time'], np.arange(20))
        self.assertEquals(set(rdt.fields), set(['time','temp']))


    def test_activate_suspend_data_product(self):

        #------------------------------------------------------------------------------------------------
        # create a stream definition for the data from the ctd simulator
        #------------------------------------------------------------------------------------------------
        pdict_id = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict', id_only=True)
        ctd_stream_def_id = self.pubsubcli.create_stream_definition(name='Simulated CTD data', parameter_dictionary_id=pdict_id)
        log.debug("Created stream def id %s" % ctd_stream_def_id)

        #------------------------------------------------------------------------------------------------
        # test creating a new data product w/o a stream definition
        #------------------------------------------------------------------------------------------------
        # Construct temporal and spatial Coordinate Reference System objects
        tdom, sdom = time_series_domain()

        sdom = sdom.dump()
        tdom = tdom.dump()

        dp_obj = IonObject(RT.DataProduct,
            name='DP1',
            description='some new dp',
            temporal_domain = tdom,
            spatial_domain = sdom)

        log.debug("Created an IonObject for a data product: %s" % dp_obj)

        #------------------------------------------------------------------------------------------------
        # Create the data product using the stream definition created above
        #------------------------------------------------------------------------------------------------

        dp_id = self.dpsc_cli.create_data_product(data_product= dp_obj,
            stream_definition_id=ctd_stream_def_id)

        #------------------------------------------------------------------------------------------------
        # test activate and suspend data product persistence
        #------------------------------------------------------------------------------------------------
        self.dpsc_cli.activate_data_product_persistence(dp_id)
        
        dp_obj = self.dpsc_cli.read_data_product(dp_id)
        self.assertIsNotNone(dp_obj)

        dataset_ids, _ = self.rrclient.find_objects(subject=dp_id, predicate=PRED.hasDataset, id_only=True)
        if not dataset_ids:
            raise NotFound("Data Product %s dataset does not exist" % str(dp_id))
        dataset_id = dataset_ids[0]


        # Check that the streams associated with the data product are persisted
        stream_ids, _ =  self.rrclient.find_objects(dp_id,PRED.hasStream,RT.Stream,True)
        for stream_id in stream_ids:
            self.assertTrue(self.ingestclient.is_persisted(stream_id))

        stream_id = stream_ids[0]
        route = self.pubsubcli.read_stream_route(stream_id=stream_id)

        rdt = RecordDictionaryTool(stream_definition_id=ctd_stream_def_id)
        rdt['time'] = np.arange(20)
        rdt['temp'] = np.arange(20)

        publisher = StandaloneStreamPublisher(stream_id,route)
        
        dataset_modified = Event()
        def cb(*args, **kwargs):
            dataset_modified.set()
        es = EventSubscriber(event_type=OT.DatasetModified, callback=cb, origin=dataset_id, auto_delete=True)
        es.start()
        self.addCleanup(es.stop)

        publisher.publish(rdt.to_granule())

        self.assertTrue(dataset_modified.wait(30))

        #--------------------------------------------------------------------------------
        # Now get the data in one chunk using an RPC call to the data retriever
        #--------------------------------------------------------------------------------

        replay_data = self.data_retriever.retrieve(dataset_ids[0])
        self.assertIsInstance(replay_data, Granule)

        log.debug("The data retriever was able to replay the dataset that was attached to the data product "
                  "we wanted to be persisted. Therefore the data product was indeed persisted with "
                  "otherwise we could not have retrieved its dataset using the data retriever. Therefore "
                  "this demonstration shows that L4-CI-SA-RQ-267 is satisfied: 'Data product management shall persist data products'")

        data_product_object = self.rrclient.read(dp_id)
        self.assertEquals(data_product_object.name,'DP1')
        self.assertEquals(data_product_object.description,'some new dp')

        log.debug("Towards L4-CI-SA-RQ-308: 'Data product management shall persist data product metadata'. "
                  " Attributes in create for the data product obj, name= '%s', description='%s', match those of object from the "
                  "resource registry, name='%s', desc='%s'" % (dp_obj.name, dp_obj.description,data_product_object.name,
                                                           data_product_object.description))

        #------------------------------------------------------------------------------------------------
        # test suspend data product persistence
        #------------------------------------------------------------------------------------------------
        self.dpsc_cli.suspend_data_product_persistence(dp_id)


        dataset_modified.clear()

        rdt['time'] = np.arange(20,40)

        publisher.publish(rdt.to_granule())
        self.assertFalse(dataset_modified.wait(2))

        self.dpsc_cli.activate_data_product_persistence(dp_id)
        dataset_modified.clear()

        publisher.publish(rdt.to_granule())
        self.assertTrue(dataset_modified.wait(30))

        granule = self.data_retriever.retrieve(dataset_id)
        rdt = RecordDictionaryTool.load_from_granule(granule)
        np.testing.assert_array_almost_equal(rdt['time'], np.arange(40))


        dataset_ids, _ = self.rrclient.find_objects(dp_id, PRED.hasDataset, id_only=True)
        self.assertEquals(len(dataset_ids), 1)

        self.dpsc_cli.suspend_data_product_persistence(dp_id)
        self.dpsc_cli.force_delete_data_product(dp_id)
        # now try to get the deleted dp object

        with self.assertRaises(NotFound):
            dp_obj = self.rrclient.read(dp_id)

    def test_lookup_values(self):
        ph = ParameterHelper(self.dataset_management, self.addCleanup)
        pdict_id = ph.create_lookups()
        stream_def_id = self.pubsubcli.create_stream_definition('lookup', parameter_dictionary_id=pdict_id)
        self.addCleanup(self.pubsubcli.delete_stream_definition, stream_def_id)

        data_product = DataProduct(name='lookup data product')
        tdom, sdom = time_series_domain()
        data_product.temporal_domain = tdom.dump()
        data_product.spatial_domain = sdom.dump()

        data_product_id = self.dpsc_cli.create_data_product(data_product, stream_definition_id=stream_def_id)
        self.addCleanup(self.dpsc_cli.delete_data_product, data_product_id)
        data_producer = DataProducer(name='producer')
        data_producer.producer_context = DataProcessProducerContext()
        data_producer.producer_context.configuration['qc_keys'] = ['offset_document']
        data_producer_id, _ = self.rrclient.create(data_producer)
        self.addCleanup(self.rrclient.delete, data_producer_id)
        assoc,_ = self.rrclient.create_association(subject=data_product_id, object=data_producer_id, predicate=PRED.hasDataProducer)
        self.addCleanup(self.rrclient.delete_association, assoc)

        document_keys = self.damsclient.list_qc_references(data_product_id)
            
        self.assertEquals(document_keys, ['offset_document'])
        svm = StoredValueManager(self.container)
        svm.stored_value_cas('offset_document', {'offset_a':2.0})
        self.dpsc_cli.activate_data_product_persistence(data_product_id)
        dataset_ids, _ = self.rrclient.find_objects(subject=data_product_id, predicate=PRED.hasDataset, id_only=True)
        dataset_id = dataset_ids[0]

        dataset_monitor = DatasetMonitor(dataset_id)
        self.addCleanup(dataset_monitor.stop)

        rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
        rdt['time'] = [0]
        rdt['temp'] = [20.]
        granule = rdt.to_granule()

        stream_ids, _ = self.rrclient.find_objects(subject=data_product_id, predicate=PRED.hasStream, id_only=True)
        stream_id = stream_ids[0]
        route = self.pubsubcli.read_stream_route(stream_id=stream_id)

        publisher = StandaloneStreamPublisher(stream_id, route)
        publisher.publish(granule)

        self.assertTrue(dataset_monitor.event.wait(10))

        granule = self.data_retriever.retrieve(dataset_id)
        rdt2 = RecordDictionaryTool.load_from_granule(granule)
        np.testing.assert_array_equal(rdt['temp'], rdt2['temp'])
        np.testing.assert_array_almost_equal(rdt2['calibrated'], np.array([22.0]))


        svm.stored_value_cas('updated_document', {'offset_a':3.0})
        dataset_monitor = DatasetMonitor(dataset_id)
        self.addCleanup(dataset_monitor.stop)
        ep = EventPublisher(event_type=OT.ExternalReferencesUpdatedEvent)
        ep.publish_event(origin=data_product_id, reference_keys=['updated_document'])

        rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
        rdt['time'] = [1]
        rdt['temp'] = [20.]
        granule = rdt.to_granule()
        gevent.sleep(2) # Yield so that the event goes through
        publisher.publish(granule)
        self.assertTrue(dataset_monitor.event.wait(10))

        granule = self.data_retriever.retrieve(dataset_id)
        rdt2 = RecordDictionaryTool.load_from_granule(granule)
        np.testing.assert_array_equal(rdt2['temp'],np.array([20.,20.]))
        np.testing.assert_array_almost_equal(rdt2['calibrated'], np.array([22.0,23.0]))

    def test_dm_integration(self):
        '''
        Test full DM Services Integration
        '''
        cc = self.container
        assertions = self.assertTrue


        #-----------------------------
        # Copy below here to run as a script (don't forget the imports of course!)
        #-----------------------------


        # Create some service clients...
        pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
        ingestion_management_service = IngestionManagementServiceClient(node=cc.node)
        dataset_management_service = DatasetManagementServiceClient(node=cc.node)
        data_retriever_service = DataRetrieverServiceClient(node=cc.node)
        transform_management_service = TransformManagementServiceClient(node=cc.node)
        process_dispatcher = ProcessDispatcherServiceClient(node=cc.node)

        # declare some handy variables

        datastore_name = 'test_dm_integration'



        ###
        ### In the beginning there were two stream definitions...
        ###
        # create a stream definition for the data from the ctd simulator
        ctd_stream_def = SBE37_CDM_stream_definition()
        ctd_stream_def_id = pubsub_management_service.create_stream_definition(container=ctd_stream_def, name='Simulated CTD data')

        # create a stream definition for the data from the salinity Transform
        sal_stream_def_id = pubsub_management_service.create_stream_definition(container=SalinityTransform.outgoing_stream_def, name='Scalar Salinity data stream')



        ###
        ### And two process definitions...
        ###
        # one for the ctd simulator...
        producer_definition = ProcessDefinition()
        producer_definition.executable = {
            'module':'ion.processes.data.ctd_stream_publisher',
            'class':'SimpleCtdPublisher'
        }

        ctd_sim_procdef_id = process_dispatcher.create_process_definition(process_definition=producer_definition)

        # one for the salinity transform
        producer_definition = ProcessDefinition()
        producer_definition.executable = {
            'module':'ion.processes.data.transforms.ctd.ctd_L2_salinity',
            'class':'SalinityTransform'
        }

        salinity_transform_procdef_id = process_dispatcher.create_process_definition(process_definition=producer_definition)



        #---------------------------
        # Set up ingestion - this is an operator concern - not done by SA in a deployed system
        #---------------------------
        # Configure ingestion with a single worker, ingesting to the test_dm_integration datastore with the SCIDATA profile
        log.debug('Calling create_ingestion_configuration')
        ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
            exchange_point_id='science_data',
            couch_storage=CouchStorage(datastore_name=datastore_name,datastore_profile='SCIDATA'),
            number_of_workers=1
        )
        #
        ingestion_management_service.activate_ingestion_configuration(
            ingestion_configuration_id=ingestion_configuration_id)



        #---------------------------
        # Set up the producer (CTD Simulator)
        #---------------------------

        # Create the stream
        ctd_stream_id = pubsub_management_service.create_stream(stream_definition_id=ctd_stream_def_id)


        # Set up the datasets
        ctd_dataset_id = dataset_management_service.create_dataset(
            stream_id=ctd_stream_id,
            datastore_name=datastore_name,
            view_name='datasets/stream_join_granule'
        )

        # Configure ingestion of this dataset
        ctd_dataset_config_id = ingestion_management_service.create_dataset_configuration(
            dataset_id = ctd_dataset_id,
            archive_data = True,
            archive_metadata = True,
            ingestion_configuration_id = ingestion_configuration_id, # you need to know the ingestion configuration id!
        )
        # Hold onto ctd_dataset_config_id if you want to stop/start ingestion of that dataset by the ingestion service

        #---------------------------
        # Set up the salinity transform
        #---------------------------


        # Create the stream
        sal_stream_id = pubsub_management_service.create_stream(stream_definition_id=sal_stream_def_id)


        # Set up the datasets
        sal_dataset_id = dataset_management_service.create_dataset(
            stream_id=sal_stream_id,
            datastore_name=datastore_name,
            view_name='datasets/stream_join_granule'
        )

        # Configure ingestion of the salinity as a dataset
        sal_dataset_config_id = ingestion_management_service.create_dataset_configuration(
            dataset_id = sal_dataset_id,
            archive_data = True,
            archive_metadata = True,
            ingestion_configuration_id = ingestion_configuration_id, # you need to know the ingestion configuration id!
        )
        # Hold onto sal_dataset_config_id if you want to stop/start ingestion of that dataset by the ingestion service



        # Create a subscription as input to the transform
        sal_transform_input_subscription_id = pubsub_management_service.create_subscription(
            query = StreamQuery(stream_ids=[ctd_stream_id,]),
            exchange_name='salinity_transform_input') # how do we make these names??? i.e. Should they be anonymous?

        # create the salinity transform
        sal_transform_id = transform_management_service.create_transform(
            name='example salinity transform',
            in_subscription_id=sal_transform_input_subscription_id,
            out_streams={'output':sal_stream_id,},
            process_definition_id = salinity_transform_procdef_id,
            # no configuration needed at this time...
            )
        # start the transform - for a test case it makes sense to do it before starting the producer but it is not required
        transform_management_service.activate_transform(transform_id=sal_transform_id)



        # Start the ctd simulator to produce some data
        configuration = {
            'process':{
                'stream_id':ctd_stream_id,
            }
        }
        ctd_sim_pid = process_dispatcher.schedule_process(process_definition_id=ctd_sim_procdef_id, configuration=configuration)


        ###
        ### Make a subscriber in the test to listen for salinity data
        ###
        salinity_subscription_id = pubsub_management_service.create_subscription(
            query=StreamQuery([sal_stream_id,]),
            exchange_name = 'salinity_test',
            name = "test salinity subscription",
            )

        pid = cc.spawn_process(name='dummy_process_for_test',
            module='pyon.ion.process',
            cls='SimpleProcess',
            config={})
        dummy_process = cc.proc_manager.procs[pid]

        subscriber_registrar = StreamSubscriberRegistrar(process=dummy_process, node=cc.node)

        result = gevent.event.AsyncResult()
        results = []
        def message_received(message, headers):
            # Collect the received salinity messages and signal once enough have arrived
            log.warn('Salinity data received!')
            results.append(message)
            if len(results) >3:
                result.set(True)

        subscriber = subscriber_registrar.create_subscriber(exchange_name='salinity_test', callback=message_received)
        subscriber.start()

        # after the queue has been created it is safe to activate the subscription
        pubsub_management_service.activate_subscription(subscription_id=salinity_subscription_id)


        # Assert that we have received data
        assertions(result.get(timeout=10))

        # stop the flow and parse the messages...
        process_dispatcher.cancel_process(ctd_sim_pid) # kill the ctd simulator process - that is enough data



        for message in results:

            psd = PointSupplementStreamParser(stream_definition=SalinityTransform.outgoing_stream_def, stream_granule=message)

            # Test the handy info method for the names of fields in the stream def
            assertions('salinity' in psd.list_field_names())


            # you have to know the name of the coverage in stream def
            salinity = psd.get_values('salinity')

            import numpy

            assertions(isinstance(salinity, numpy.ndarray))

            assertions(numpy.nanmin(salinity) > 0.0) # salinity should always be greater than 0
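
The test above waits for a fixed number of salinity messages by setting a gevent AsyncResult from inside the subscriber callback. Below is a minimal, self-contained sketch of that wait-for-N-messages pattern using only gevent; the subscriber wiring in the trailing comment is an assumption that mirrors the test above, not ION API.

from gevent.event import AsyncResult

def make_collector(expected_count):
    # Returns (callback, result); result is set once expected_count messages have arrived.
    result = AsyncResult()
    received = []

    def callback(message, headers=None):
        received.append(message)
        if len(received) >= expected_count:
            result.set(received)

    return callback, result

# Hypothetical wiring with any subscriber that invokes callback(message, headers):
#   callback, result = make_collector(4)
#   subscriber = subscriber_registrar.create_subscriber(exchange_name='salinity_test', callback=callback)
#   messages = result.get(timeout=10)  # raises gevent.Timeout if too few messages arrive in time
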
Example #36
class DMCollaborationIntTest(IonIntegrationTestCase):
    def setUp(self):
        self._start_container()
        config = DotDict()
        config.bootstrap.processes.ingestion.module = 'ion.processes.data.ingestion.ingestion_worker_a'
        config.bootstrap.processes.replay.module    = 'ion.processes.data.replay.replay_process_a'
        self.container.start_rel_from_url('res/deploy/r2dm.yml', config)


        self.datastore_name = 'test_datasets'
        self.pubsub_management    = PubsubManagementServiceClient()
        self.ingestion_management = IngestionManagementServiceClient()
        self.dataset_management   = DatasetManagementServiceClient()
        self.process_dispatcher   = ProcessDispatcherServiceClient()
        self.data_retriever       = DataRetrieverServiceClient()

    def subscriber_action(self, msg, header):
        if not hasattr(self,'received'):
            self.received = 0
        if not hasattr(self, 'async_done'):
            self.async_done = AsyncResult()
        self.received += 1
        if self.received >= 2:
            self.async_done.set(True)


    def test_ingest_to_replay(self):

        self.async_done = AsyncResult()
        sysname = get_sys_name()


        datastore = self.container.datastore_manager.get_datastore(self.datastore_name,'SCIDATA')


        producer_definition = ProcessDefinition(name='Example Data Producer')
        producer_definition.executable = {
            'module':'ion.processes.data.example_data_producer',
            'class' :'ExampleDataProducer'
        }

        process_definition_id = self.process_dispatcher.create_process_definition(process_definition=producer_definition)
        
        ingestion_configuration_id = self.ingestion_management.create_ingestion_configuration(
            exchange_point_id = 'science_data',
            couch_storage=CouchStorage(datastore_name=self.datastore_name,datastore_profile='SCIDATA'),
            number_of_workers=1
        )

        self.ingestion_management.activate_ingestion_configuration(
                ingestion_configuration_id=ingestion_configuration_id)

        stream_id = self.pubsub_management.create_stream(name='data stream')
        
        dataset_id = self.dataset_management.create_dataset(
            stream_id = stream_id, 
            datastore_name = self.datastore_name,
        )

        self.ingestion_management.create_dataset_configuration(
            dataset_id = dataset_id,
            archive_data = True,
            archive_metadata = True,
            ingestion_configuration_id = ingestion_configuration_id
        )

        configuration = {
            'process': {
                'stream_id' : stream_id
            }
        }

        self.process_dispatcher.schedule_process(process_definition_id, configuration=configuration)

        replay_id, stream_id = self.data_retriever.define_replay(dataset_id = dataset_id)

        subscriber = Subscriber(name=('%s.science_data' % sysname, 'test_queue'), callback=self.subscriber_action, binding='%s.data' % stream_id)
        gevent.spawn(subscriber.listen)

        done = False
        while not done:
            results = datastore.query_view('manifest/by_dataset')
            if len(results) >= 2:
                done = True

        self.data_retriever.start_replay(replay_id)

        self.async_done.get(timeout=10)
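
The while loop above polls the manifest/by_dataset view with no sleep and no deadline, so it can spin indefinitely if ingestion never produces two results. A minimal sketch of a bounded polling helper follows; it is not an ION API, and the lambda in the trailing comment is a hypothetical use that reuses the datastore from the test above.

import time
import gevent

def wait_for(condition, timeout=30, interval=0.5):
    # Poll condition() until it returns a truthy value or the timeout elapses.
    deadline = time.time() + timeout
    while time.time() < deadline:
        value = condition()
        if value:
            return value
        gevent.sleep(interval)  # yield to other greenlets instead of busy-waiting
    raise AssertionError('condition not met within %s seconds' % timeout)

# Hypothetical use in test_ingest_to_replay:
#   wait_for(lambda: len(datastore.query_view('manifest/by_dataset')) >= 2)
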
Example #37
    def on_start(self):

        log.debug("VizStreamProducer start")

        self.data_source_name = self.CFG.get_safe('name',
                                                  'sine_wave_generator')
        self.dataset = self.CFG.get_safe('dataset', 'sinusoidal')

        # create a pubsub client and a resource registry client
        self.rrclient = ResourceRegistryServiceClient(node=self.container.node)
        self.pubsubclient = PubsubManagementServiceClient(
            node=self.container.node)

        # Dummy instrument related clients
        self.imsclient = InstrumentManagementServiceClient(
            node=self.container.node)
        self.damsclient = DataAcquisitionManagementServiceClient(
            node=self.container.node)
        self.dpclient = DataProductManagementServiceClient(
            node=self.container.node)
        self.ingestclient = IngestionManagementServiceClient(
            node=self.container.node)
        self.dataset_management = DatasetManagementServiceClient()

        # create the pubsub client
        self.pubsubclient = PubsubManagementServiceClient(
            node=self.container.node)

        # Additional code for creating a dummy instrument
        """
        # Set up the preconditions. Look for an existing ingestion config
        while True:
            log.info("VisStreamLauncher:on_start: Waiting for an ingestion configuration to be available.")
            ingestion_cfgs, _ = self.rrclient.find_resources(RT.IngestionConfiguration, None, None, True)

            if len(ingestion_cfgs) > 0:
                break
            else:
                gevent.sleep(1)
        """

        # Check whether the data product already exists in the system (e.g., when relaunching the code after a crash)
        dp_ids, _ = self.rrclient.find_resources(RT.DataProduct, None,
                                                 self.data_source_name, True)
        if len(dp_ids) > 0:
            data_product_id = dp_ids[0]
        else:
            # Create InstrumentModel
            instModel_obj = IonObject(RT.InstrumentModel,
                                      name=self.data_source_name,
                                      description=self.data_source_name)
            instModel_id = self.imsclient.create_instrument_model(
                instModel_obj)

            # Create InstrumentDevice
            instDevice_obj = IonObject(RT.InstrumentDevice,
                                       name=self.data_source_name,
                                       description=self.data_source_name,
                                       serial_number="12345")

            instDevice_id = self.imsclient.create_instrument_device(
                instrument_device=instDevice_obj)
            self.imsclient.assign_instrument_model_to_instrument_device(
                instModel_id, instDevice_id)

            # create a stream definition for the data from the ctd simulator
            pdict_id = self.dataset_management.read_parameter_dictionary_by_name(
                'ctd_parsed_param_dict', id_only=True)
            ctd_stream_def_id = self.pubsubclient.create_stream_definition(
                name="SBE37_CDM",
                description="SBE37_CDM",
                parameter_dictionary_id=pdict_id)

            tdom, sdom = time_series_domain()
            sdom = sdom.dump()
            tdom = tdom.dump()

            dp_obj = IonObject(RT.DataProduct,
                               name=self.data_source_name,
                               description='Example ctd stream',
                               temporal_domain=tdom,
                               spatial_domain=sdom)

            data_product_id = self.dpclient.create_data_product(
                dp_obj, stream_definition_id=ctd_stream_def_id)

            self.damsclient.assign_data_product(
                input_resource_id=instDevice_id,
                data_product_id=data_product_id)
            self.dpclient.activate_data_product_persistence(
                data_product_id=data_product_id)

        print ">>>>>>>>>>>>> Dataproduct for sine wave generator : ", data_product_id

        # Retrieve the id of the OUTPUT stream from the out Data Product
        stream_ids, _ = self.rrclient.find_objects(data_product_id,
                                                   PRED.hasStream, None, True)

        if self.dataset == 'sinusoidal':
            self.container.spawn_process(
                name='ctd_test.' + self.data_source_name,
                module='ion.processes.data.sinusoidal_stream_publisher',
                cls='SinusoidalCtdPublisher',
                config={'process': {
                    'stream_id': stream_ids[0]
                }})
        else:
            self.container.spawn_process(
                name='ctd_test.' + self.data_source_name,
                module='ion.processes.data.ctd_stream_publisher',
                cls='SimpleCtdPublisher',
                config={'process': {
                    'stream_id': stream_ids[0]
                }})
        """
class IngestionManagementIntTest(IonIntegrationTestCase):
    def setUp(self):
        self._start_container()
        self.container.start_rel_from_url("res/deploy/r2deploy.yml")

        self.ingestion_management = IngestionManagementServiceClient()
        self.resource_registry = ResourceRegistryServiceClient()
        self.pubsub_management = PubsubManagementServiceClient()
        self.ingest_name = "basic"
        self.exchange = "testdata"

    @staticmethod
    def clean_subscriptions():
        ingestion_management = IngestionManagementServiceClient()
        pubsub = PubsubManagementServiceClient()
        rr = ResourceRegistryServiceClient()
        ingestion_config_ids = ingestion_management.list_ingestion_configurations(id_only=True)
        for ic in ingestion_config_ids:

            assocs = rr.find_associations(subject=ic, predicate=PRED.hasSubscription, id_only=False)
            for assoc in assocs:
                rr.delete_association(assoc)
                try:
                    pubsub.deactivate_subscription(assoc.o)
                except:
                    pass
                pubsub.delete_subscription(assoc.o)

    def create_ingest_config(self):
        self.queue = IngestionQueue(name="test", type="testdata")

        # Create the ingestion config
        ingestion_config_id = self.ingestion_management.create_ingestion_configuration(
            name=self.ingest_name, exchange_point_id=self.exchange, queues=[self.queue]
        )
        return ingestion_config_id

    def test_ingestion_config_crud(self):
        ingestion_config_id = self.create_ingest_config()

        ingestion_config = self.ingestion_management.read_ingestion_configuration(ingestion_config_id)
        self.assertTrue(ingestion_config.name == self.ingest_name)
        self.assertTrue(ingestion_config.queues[0].name == "test")
        self.assertTrue(ingestion_config.queues[0].type == "testdata")

        ingestion_config.name = "another"

        self.ingestion_management.update_ingestion_configuration(ingestion_config)

        # Create an association just to make sure that it will delete them
        sub = Subscription()
        sub_id, _ = self.resource_registry.create(sub)
        assoc_id, _ = self.resource_registry.create_association(
            subject=ingestion_config_id, predicate=PRED.hasSubscription, object=sub_id
        )

        self.ingestion_management.delete_ingestion_configuration(ingestion_config_id)

        with self.assertRaises(NotFound):
            self.resource_registry.read(assoc_id)

    def test_list_ingestion(self):

        # Create the ingest_config
        config_id = self.create_ingest_config()

        retval = self.ingestion_management.list_ingestion_configurations(id_only=True)
        # Nice thing about this is that it breaks if r2dm adds an ingest_config
        self.assertTrue(config_id in retval)
Example #39
    def setUp(self):
        # Start container
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2deploy.yml')
        self.ingestion_management = IngestionManagementServiceClient()
Example #40
class TestLoader(IonIntegrationTestCase):

    def setUp(self):
        # Start container
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2deploy.yml')
        self.ingestion_management = IngestionManagementServiceClient()

    def assert_can_load(self, scenarios, loadui=False, loadooi=False,
            path=TESTED_DOC, ui_path='default'):
        """ perform preload for given scenarios and raise exception if there is a problem with the data """
        config = dict(op="load",
                      scenario=scenarios,
                      attachments="res/preload/r2_ioc/attachments",
                      loadui=loadui,
                      loadooi=loadooi,
                      path=path, ui_path=ui_path,
                      assets='res/preload/r2_ioc/ooi_assets',
                      bulk=loadooi,
                      ooiexclude='DataProduct,DataProductLink')
        self.container.spawn_process("Loader", "ion.processes.bootstrap.ion_loader", "IONLoader", config=config)

    @attr('PRELOAD')
    def test_ui_valid(self):
        """ make sure UI assets are valid using DEFAULT_UI_ASSETS = 'https://userexperience.oceanobservatories.org/database-exports/' """
        self.assert_can_load("BETA", loadui=True, ui_path='default')

    @attr('PRELOAD')
    def test_ui_candidates_valid(self):
        """ make sure UI assets are valid using DEFAULT_UI_ASSETS = 'https://userexperience.oceanobservatories.org/database-exports/Candidates' """
        self.assert_can_load("BETA", loadui=True, ui_path='candidate')

    @attr('PRELOAD')
    def test_assets_valid(self):
        """ make sure can load asset DB """
        self.assert_can_load("BETA,DEVS", path='master', loadooi=True)

    @attr('PRELOAD')
    def test_alpha_valid(self):
        """ make sure R2_DEMO scenario in master google doc
            is valid and self-contained (doesn't rely on rows from other scenarios except BETA)
            NOTE: test will pass/fail based on current google doc, not just code changes.
        """
        self.assert_can_load("BETA,ALPHA_SYS", path='master')

    @attr('PRELOAD')
    def test_beta_valid(self):
        """ make sure R2_DEMO scenario in master google doc
            is valid and self-contained (doesn't rely on rows from other scenarios except BETA)
            NOTE: test will pass/fail based on current google doc, not just code changes.
        """
        self.assert_can_load("BETA,BETA_SYS", path='master')

    @attr('PRELOAD')
    def test_devs_valid(self):
        """ make sure DEVS scenario in master google doc
            is valid and self-contained (doesn't rely on rows from other scenarios except BETA)
            NOTE: test will pass/fail based on current google doc, not just code changes.
        """
        self.assert_can_load("BETA,DEVS", path='master')

    def find_object_by_name(self, name, resource_type):
        objects,_ = self.container.resource_registry.find_resources(resource_type, id_only=False)
        self.assertGreaterEqual(len(objects), 1)

        filtered_objs = [obj for obj in objects if obj.name == name]
        self.assertEquals(len(filtered_objs), 1)

        return filtered_objs[0]

    @attr('INT', group='loader')
    @attr('SMOKE', group='loader')
    def test_row_values(self):
        """ use only rows from NOSE scenario for specific names and details included in this test.
            rows in NOSE may rely on entries in BETA scenarios,
            but should not specifically test values from those scenarios.
        """

        # first make sure this scenario loads successfully
        self.assert_can_load("BETA,NOSE")

        # check for ExternalDataset
        eds = self.find_object_by_name('Test External Dataset', RT.ExternalDataset)
        edm1 = self.find_object_by_name('Test External Dataset Model', RT.ExternalDatasetModel)
        edm2,_ = self.container.resource_registry.find_objects(eds._id, PRED.hasModel, RT.ExternalDatasetModel, True)
        self.assertEquals(edm1._id, edm2[0])

        inst = self.find_object_by_name('Test Instrument Agent Instance', RT.ExternalDatasetAgentInstance)
        self.assertEquals('value1', inst.dataset_agent_config['key1'], msg='dataset_agent_config[key1] is not value1:\n%r' % inst.dataset_agent_config)

        # check for an Org
        org = self.find_object_by_name('CASPER', RT.Org)
        self.assertFalse(org.contacts is None)
        self.assertEquals('Userbrough', org.contacts[0].individual_name_family)
        self.assertEquals('primary', org.contacts[0].roles[0])

        # check data product
        dp = self.find_object_by_name('Test DP L0 CTD', RT.DataProduct)
        # should be persisted
        streams, _ = self.container.resource_registry.find_objects(dp._id, PRED.hasStream, RT.Stream, True)
        self.assertTrue(streams)
        self.assertEquals(1, len(streams))
        self.assertTrue(self.ingestion_management.is_persisted(streams[0]))
        self.assertAlmostEqual(32.88237, dp.geospatial_bounds.geospatial_latitude_limit_north,places=3)

        # but L1 data product should not be persisted
        dp = self.find_object_by_name('Test DP L1 conductivity', RT.DataProduct)
        streams, _ = self.container.resource_registry.find_objects(dp._id, PRED.hasStream, RT.Stream, True)
        self.assertEquals(1, len(streams))
        self.assertTrue(streams)
        self.assertFalse(self.ingestion_management.is_persisted(streams[0]))

        site = self.find_object_by_name('Test Instrument Site', RT.InstrumentSite)
        self.assertFalse(site.constraint_list is None)
        self.assertEquals(2, len(site.constraint_list))
        con = site.constraint_list[0]
        self.assertAlmostEqual(  32.88237, con.geospatial_latitude_limit_north, places=3)
        self.assertAlmostEqual(-117.23214, con.geospatial_longitude_limit_east, places=3)
        con = site.constraint_list[1]
        self.assertEquals('TemporalBounds', con.type_)
        # check that coordinate system was loaded
        self.assertFalse(site.coordinate_reference_system is None)

        # check that InstrumentDevice contacts are loaded
        dev = self.find_object_by_name('Unit Test SMB37', RT.InstrumentDevice)
        self.assertTrue(len(dev.contacts)==2)
        self.assertEquals('Userbrough', dev.contacts[0].individual_name_family)

        # check has attachments
        attachments = self.container.resource_registry.find_attachments(dev._id)
        self.assertTrue(len(attachments)>0)

        # check for platform agents
        agent = self.find_object_by_name('Unit Test Platform Agent', RT.PlatformAgent)
        self.assertEquals(2, len(agent.stream_configurations))
        parsed = agent.stream_configurations[1]
#        self.assertEquals('platform_eng_parsed', parsed.parameter_dictionary_name)
        self.assertEquals('ctd_parsed_param_dict', parsed.parameter_dictionary_name)
        # OBSOLETE: check that alarm was added to StreamConfig
#        self.assertEquals(1, len(parsed.alarms), msg='alarms: %r'%parsed.alarms)
#        self.assertEquals('temp', parsed.alarms[0]['kwargs']['value_id'])

        # check for platform agents
        self.find_object_by_name('Unit Test Platform Agent Instance', RT.PlatformAgentInstance)

        # check for platform model boolean values
        model = self.find_object_by_name('Nose Testing Platform Model', RT.PlatformModel)
        self.assertEquals(True, model.shore_networked)
        self.assertNotEqual('str', model.shore_networked.__class__.__name__)

        # check for data process definition
        self.find_object_by_name("Logical Transform Definition", RT.DataProcessDefinition)

        iai = self.find_object_by_name("Test InstrumentAgentInstance", RT.InstrumentAgentInstance)
        self.assertEqual({'SCHEDULER': {'VERSION': {'number': 3.0}, 'CLOCK_SYNC': 48.2, 'ACQUIRE_STATUS': {}},
                          'PARAMETERS': {"TXWAVESTATS": False, 'TXWAVEBURST': 'false', 'TXREALTIME': True}},
                        iai.startup_config)
        self.assertEqual(2, len(iai.alerts))
#        self.assertEqual({'entry': 'foo'}, iai.alerts['complex'])

        pai = self.find_object_by_name("Unit Test Platform Agent Instance", RT.PlatformAgentInstance)
        self.assertEqual({'entry': 'foo'}, pai.alerts['complex'])

        orgs, _ = self.container.resource_registry.find_subjects(RT.Org, PRED.hasResource, iai._id, True)
        self.assertEqual(1, len(orgs))
        self.assertEqual(org._id, orgs[0])
Example #41
class IngestionManagementIntTest(IonIntegrationTestCase):
    def setUp(self):
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2deploy.yml')

        self.ingestion_management = IngestionManagementServiceClient()
        self.resource_registry = ResourceRegistryServiceClient()
        self.pubsub_management = PubsubManagementServiceClient()
        self.ingest_name = 'basic'
        self.exchange = 'testdata'

    @staticmethod
    def clean_subscriptions():
        ingestion_management = IngestionManagementServiceClient()
        pubsub = PubsubManagementServiceClient()
        rr = ResourceRegistryServiceClient()
        ingestion_config_ids = ingestion_management.list_ingestion_configurations(
            id_only=True)
        for ic in ingestion_config_ids:
            subscription_ids, assocs = rr.find_objects(
                subject=ic, predicate=PRED.hasSubscription, id_only=True)
            for subscription_id, assoc in zip(subscription_ids, assocs):
                rr.delete_association(assoc)
                try:
                    pubsub.deactivate_subscription(subscription_id)
                except:
                    log.exception("Unable to decativate subscription: %s",
                                  subscription_id)
                pubsub.delete_subscription(subscription_id)

    def create_ingest_config(self):
        self.queue = IngestionQueue(name='test', type='testdata')

        # Create the ingestion config
        ingestion_config_id = self.ingestion_management.create_ingestion_configuration(
            name=self.ingest_name,
            exchange_point_id=self.exchange,
            queues=[self.queue])
        return ingestion_config_id

    def test_ingestion_config_crud(self):
        ingestion_config_id = self.create_ingest_config()

        ingestion_config = self.ingestion_management.read_ingestion_configuration(
            ingestion_config_id)
        self.assertTrue(ingestion_config.name == self.ingest_name)
        self.assertTrue(ingestion_config.queues[0].name == 'test')
        self.assertTrue(ingestion_config.queues[0].type == 'testdata')

        ingestion_config.name = 'another'

        self.ingestion_management.update_ingestion_configuration(
            ingestion_config)

        # Create an association just to make sure that it will delete them
        sub = Subscription()
        sub_id, _ = self.resource_registry.create(sub)
        assoc_id, _ = self.resource_registry.create_association(
            subject=ingestion_config_id,
            predicate=PRED.hasSubscription,
            object=sub_id)

        self.ingestion_management.delete_ingestion_configuration(
            ingestion_config_id)

        with self.assertRaises(NotFound):
            self.resource_registry.read(assoc_id)

    def test_list_ingestion(self):

        # Create the ingest_config
        config_id = self.create_ingest_config()

        retval = self.ingestion_management.list_ingestion_configurations(
            id_only=True)
        # Nice thing about this is that it breaks if r2dm adds an ingest_config
        self.assertTrue(config_id in retval)
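
clean_subscriptions above is a staticmethod that the tests shown here never invoke. Below is a minimal sketch of how a test could register it with unittest's addCleanup so leftover subscriptions are torn down after every test, even when an assertion fails; the class name is hypothetical and the clients are the same ones used in this example.

class IngestionCleanupExampleTest(IonIntegrationTestCase):
    def setUp(self):
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2deploy.yml')
        self.ingestion_management = IngestionManagementServiceClient()
        # Hypothetical wiring: addCleanup callbacks run after tearDown, in LIFO order
        self.addCleanup(IngestionManagementIntTest.clean_subscriptions)
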
Example #42
    def test_blog_ingestion_replay(self):

        #-----------------------------------------------------------------------------------------------------
        # Do this statement just once in your script
        #-----------------------------------------------------------------------------------------------------
        cc = self.container

        #-------------------------------------------------------------------------------------------------------
        # Make a registrar object - this is work usually done for you by the container in a transform or data stream process
        #-------------------------------------------------------------------------------------------------------

        subscriber_registrar = StreamSubscriberRegistrar(process=cc, node=cc.node)

        #-----------------------------------------------------------------------------------------------------
        # Service clients
        #-----------------------------------------------------------------------------------------------------

        ingestion_cli = IngestionManagementServiceClient(node=cc.node)
        dr_cli = DataRetrieverServiceClient(node=cc.node)
        dsm_cli = DatasetManagementServiceClient(node=cc.node)
        pubsub_cli = PubsubManagementServiceClient(node=cc.node)

        #-------------------------------------------------------------------------------------------------------
        # Create and activate ingestion configuration
        #-------------------------------------------------------------------------------------------------------

        ingestion_configuration_id = ingestion_cli.create_ingestion_configuration(
            exchange_point_id='science_data',
            couch_storage=CouchStorage(datastore_name='dm_datastore',datastore_profile='EXAMPLES'),
            hdf_storage=HdfStorage(),
            number_of_workers=6,
        )
        # activates the transforms... so bindings will be created in this step
        ingestion_cli.activate_ingestion_configuration(ingestion_configuration_id)

        #------------------------------------------------------------------------------------------------------
        # Create subscriber to listen to the messages published to the ingestion
        #------------------------------------------------------------------------------------------------------

        # Define the query we want
        query = ExchangeQuery()

        # Create the stateful listener to hold the captured data for comparison with replay
        captured_input = BlogListener()


        # Make a subscription to the input stream to ingestion
        subscription_id = pubsub_cli.create_subscription(query=query, exchange_name='input_capture_queue', name='input_capture_queue')


        # It is not required or even generally a good idea to use the subscription resource name as the queue name, but it makes things simple here
        # Normally the container creates and starts subscribers for you when a transform process is spawned
        subscriber = subscriber_registrar.create_subscriber(exchange_name='input_capture_queue', callback=captured_input.blog_store)
        subscriber.start()

        captured_input.subscriber = subscriber

        pubsub_cli.activate_subscription(subscription_id)


        #-------------------------------------------------------------------------------------------------------
        # Launching blog scraper
        #-------------------------------------------------------------------------------------------------------

        blogs = [
            'saintsandspinners',
            'strobist',
            'voodoofunk'
        ]


        log.debug('before spawning blog scraper')

        for blog in blogs:
            config = {'process':{'type':'stream_process','blog':blog}}
            cc.spawn_process(name=blog,
                module='ion.services.dm.ingestion.example.blog_scraper',
                cls='FeedStreamer',
                config=config)

        # wait ten seconds for some data to come in...
        log.warn('Sleeping for 10 seconds to wait for some input')
        time.sleep(10)


        #------------------------------------------------------------------------------------------------------
        # For 3 posts captured, make 3 replays and verify we get back what came in
        #------------------------------------------------------------------------------------------------------


        # Cute list comprehension method does not give enough control
        #self.assertTrue(len(captured_input.blogs)>3)
        #post_ids = [id for idx, id in enumerate(captured_input.blogs.iterkeys()) if idx < 3]

        post_ids = []




        for post_id, blog in captured_input.blogs.iteritems(): # Note: items() (not iteritems()) would give a copy of fixed length

            log.info('Captured Input: %s' % post_id)
            if len(blog.get('comments',[])) > 2:
                post_ids.append(post_id)

            if len(post_ids) >3:
                break

        ###=======================================================
        ### This section is not scriptable
        ###=======================================================


        if len(post_ids) < 3:
            self.fail('Not enough comments returned by the blog scrapers in 10 seconds')

        if len(captured_input.blogs) < 1:
            self.fail('No data returned in ten seconds by the blog scrapers!')

        ###=======================================================
        ### End non-scriptable
        ###=======================================================


        #------------------------------------------------------------------------------------------------------
        # Create subscriber to listen to the replays
        #------------------------------------------------------------------------------------------------------

        captured_replays = {}

        for idx, post_id in enumerate(post_ids):
            # Create the stateful listener to hold the captured data for comparison with replay


            dataset_id = dsm_cli.create_dataset(
                stream_id=post_id,
                datastore_name='dm_datastore',
                view_name='posts/posts_join_comments')

            replay_id, stream_id =dr_cli.define_replay(dataset_id)


            query = StreamQuery(stream_ids=[stream_id])

            captured_replay = BlogListener()

            #------------------------------------------------------------------------------------------------------
            # Create subscriber to listen to the messages published to the ingestion
            #------------------------------------------------------------------------------------------------------

            # Make a subscription to the input stream to ingestion
            subscription_name = 'replay_capture_queue_%d' % idx
            subscription_id = pubsub_cli.create_subscription(query = query, exchange_name=subscription_name ,name = subscription_name)


            # It is not required or even generally a good idea to use the subscription resource name as the queue name, but it makes things simple here
            # Normally the container creates and starts subscribers for you when a transform process is spawned
            subscriber = subscriber_registrar.create_subscriber(exchange_name=subscription_name, callback=captured_replay.blog_store)
            subscriber.start()

            captured_replay.subscriber = subscriber

            pubsub_cli.activate_subscription(subscription_id)

            #------------------------------------------------------------------------------------------------------
            # Start the replay and listen to the results!
            #------------------------------------------------------------------------------------------------------

            dr_cli.start_replay(replay_id)

            captured_replays[post_id] = captured_replay


        ###=======================================================
        ### The rest is not scriptable
        ###=======================================================


        # wait five seconds for some data to come in...
        log.warn('Sleeping for 5 seconds to wait for some output')
        time.sleep(5)

        matched_comments={}
        for post_id, captured_replay in captured_replays.iteritems():

            # There should be only one blog in here!
            self.assertEqual(len(captured_replay.blogs),1)

            replayed_blog = captured_replay.blogs[post_id]

            input_blog = captured_input.blogs[post_id]

            self.assertEqual(replayed_blog['post'].content, input_blog['post'].content)

            # can't deterministically assert that the number of comments is the same...
            matched_comments[post_id] = 0

            for updated, comment in replayed_blog.get('comments',{}).iteritems():
                self.assertIn(updated, input_blog['comments'])
                matched_comments[post_id] += 1


        # Assert that we got some comments back!
        self.assertTrue(sum(matched_comments.values()) > 0)

        log.info('Matched comments on the following blogs: %s' % matched_comments)

    def test_replay_integration(self):
        '''
        test_replay_integration
        '''
        import numpy as np
        # Keep this import: it is used in the vector comparison below even though PyCharm flags it as unused.

        cc = self.container
        XP = self.XP
        assertions = self.assertTrue

        ### Every thing below here can be run as a script:
        log.debug('Got it')

        pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
        ingestion_management_service = IngestionManagementServiceClient(
            node=cc.node)
        dataset_management_service = DatasetManagementServiceClient(
            node=cc.node)
        data_retriever_service = DataRetrieverServiceClient(node=cc.node)

        datastore_name = 'dm_test_replay_integration'

        producer = Publisher(name=(XP, 'stream producer'))

        ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
            exchange_point_id=XP,
            couch_storage=CouchStorage(datastore_name=datastore_name,
                                       datastore_profile='SCIDATA'),
            hdf_storage=HdfStorage(),
            number_of_workers=1)

        ingestion_management_service.activate_ingestion_configuration(
            ingestion_configuration_id=ingestion_configuration_id)

        definition = SBE37_CDM_stream_definition()
        data_stream_id = definition.data_stream_id
        encoding_id = definition.identifiables[data_stream_id].encoding_id
        element_count_id = definition.identifiables[
            data_stream_id].element_count_id

        stream_def_id = pubsub_management_service.create_stream_definition(
            container=definition)
        stream_id = pubsub_management_service.create_stream(
            stream_definition_id=stream_def_id)

        dataset_id = dataset_management_service.create_dataset(
            stream_id=stream_id,
            datastore_name=datastore_name,
            view_name='datasets/dataset_by_id')
        ingestion_management_service.create_dataset_configuration(
            dataset_id=dataset_id,
            archive_data=True,
            archive_metadata=True,
            ingestion_configuration_id=ingestion_configuration_id)
        definition.stream_resource_id = stream_id

        packet = _create_packet(definition)
        input_file = FileSystem.mktemp()
        input_file.write(packet.identifiables[data_stream_id].values)
        input_file_path = input_file.name
        input_file.close()

        fields = [
            'conductivity', 'height', 'latitude', 'longitude', 'pressure',
            'temperature', 'time'
        ]

        input_vectors = acquire_data([input_file_path], fields, 2).next()

        producer.publish(msg=packet, to_name=(XP, '%s.data' % stream_id))

        replay_id, replay_stream_id = data_retriever_service.define_replay(
            dataset_id)
        ar = gevent.event.AsyncResult()

        def sub_listen(msg, headers):

            assertions(isinstance(msg, StreamGranuleContainer),
                       'replayed message is not a granule.')
            hdf_string = msg.identifiables[data_stream_id].values
            sha1 = hashlib.sha1(hdf_string).hexdigest().upper()
            assertions(sha1 == msg.identifiables[encoding_id].sha1,
                       'Checksum failed.')
            assertions(
                msg.identifiables[element_count_id].value == 1,
                'record replay count is incorrect %d.' %
                msg.identifiables[element_count_id].value)
            output_file = FileSystem.mktemp()
            output_file.write(msg.identifiables[data_stream_id].values)
            output_file_path = output_file.name
            output_file.close()
            output_vectors = acquire_data([output_file_path], fields, 2).next()
            for field in fields:
                comparison = (input_vectors[field]['values'] ==
                              output_vectors[field]['values'])
                assertions(
                    comparison.all(), 'vector mismatch: %s vs %s' %
                    (input_vectors[field]['values'],
                     output_vectors[field]['values']))
            FileSystem.unlink(output_file_path)
            ar.set(True)

        subscriber = Subscriber(name=(XP, 'replay listener'),
                                callback=sub_listen)

        g = gevent.Greenlet(subscriber.listen,
                            binding='%s.data' % replay_stream_id)
        g.start()

        data_retriever_service.start_replay(replay_id)

        ar.get(timeout=10)

        FileSystem.unlink(input_file_path)
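
The sub_listen callback above verifies each replayed granule by recomputing a SHA-1 digest of the HDF payload and comparing it with the digest carried in the message. A minimal sketch of that check in isolation is shown below; it uses only hashlib, and the message fields in the trailing comment are assumptions taken from the test above.

import hashlib

def payload_matches_digest(hdf_bytes, expected_sha1_hex):
    # Recompute the SHA-1 of the payload and compare it, case-insensitively, with the advertised digest.
    return hashlib.sha1(hdf_bytes).hexdigest().upper() == expected_sha1_hex.upper()

# Hypothetical use inside sub_listen:
#   assertions(payload_matches_digest(msg.identifiables[data_stream_id].values,
#                                     msg.identifiables[encoding_id].sha1), 'Checksum failed.')
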
class DirectCoverageAccess(object):
    def __init__(self):
        self.ingestion_management = IngestionManagementServiceClient()
        self.resource_registry = ResourceRegistryServiceClient()
        self.data_product_management = DataProductManagementServiceClient()
        self.dataset_management = DatasetManagementServiceClient()
        self._paused_streams = []
        self._w_covs = {}
        self._ro_covs = {}

        self._context_managed = False

    def __enter__(self):
        self._context_managed = True
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.clean_up()

    def clean_up(self, ro_covs=False, w_covs=False, streams=False):
        if not ro_covs and not w_covs and not streams:
            ro_covs = w_covs = streams = True

        if ro_covs:
            # Close any open read-only coverages
            for dsid, c in self._ro_covs.iteritems():
                c.close()

        if w_covs:
            # Close any open write coverages
            for sid, c in self._w_covs.iteritems():
                c.close()

        if streams:
            # Resume any paused ingestion workers
            for s in self._paused_streams:
                self.resume_ingestion(s)

    def get_ingestion_config(self):
        '''
        Grab the ingestion configuration from the resource registry
        '''
        # The ingestion configuration should have been created by the bootstrap service
        # which is configured through r2deploy.yml

        ingest_configs, _ = self.resource_registry.find_resources(restype=RT.IngestionConfiguration,id_only=True)
        return ingest_configs[0]

    def get_coverage_path(self, dataset_id):
        pth = DatasetManagementService._get_coverage_path(dataset_id)
        if not os.path.exists(pth):
            raise ValueError('Coverage with id \'{0}\' does not exist!'.format(dataset_id))

        return pth

    def pause_ingestion(self, stream_id):
        if not self._context_managed:
            warn_user('Warning: Pausing ingestion when not using a context manager is potentially unsafe - '
                           'be sure to resume ingestion for all streams by calling self.clean_up(streams=True)')

        if stream_id not in self._paused_streams:
            self.ingestion_management.pause_data_stream(stream_id, self.get_ingestion_config())
            self._paused_streams.append(stream_id)

    def resume_ingestion(self, stream_id):
        if stream_id in self._paused_streams:
            self.ingestion_management.resume_data_stream(stream_id, self.get_ingestion_config())
            self._paused_streams.remove(stream_id)

    def get_stream_id(self, dataset_id):
        sid, _ = self.resource_registry.find_objects(dataset_id, predicate=PRED.hasStream, id_only=True)
        return sid[0] if len(sid) > 0 else None

    def get_dataset_object(self, dataset_id):
        return self.dataset_management.read_dataset(dataset_id=dataset_id)

    def get_data_product_object(self, data_product_id):
        return self.data_product_management.read_data_product(data_product_id=data_product_id)

    def get_read_only_coverage(self, dataset_id):
        if not self._context_managed:
            warn_user('Warning: Coverages will remain open until they are closed or go out of scope - '
                           'be sure to close coverage instances when you are finished working with them or call self.clean_up(ro_covs=True)')

        # Check if we already have the coverage
        if dataset_id in self._ro_covs:
            cov = self._ro_covs[dataset_id]
            # If it's not closed, return it
            if not cov.closed:
                return cov
            # Otherwise, remove it from self._ro_covs and carry on
            del self._ro_covs[dataset_id]

        self._ro_covs[dataset_id] = DatasetManagementService._get_coverage(dataset_id, mode='r')

        return self._ro_covs[dataset_id]

    def get_editable_coverage(self, dataset_id):
        sid = self.get_stream_id(dataset_id)

        # Check if we already have the coverage
        if sid in self._paused_streams:
            cov = self._w_covs[sid]
            # If it's not closed, return it
            if not cov.closed:
                return cov
            # Otherwise, remove it from self._w_covs and carry on
            del self._w_covs[sid]

        self.pause_ingestion(sid)
        if not self._context_managed:
            warn_user('Warning: Coverages will remain open until they are closed or go out of scope - '
                           'be sure to close coverage instances when you are finished working with them or call self.clean_up(w_covs=True)')
        try:
            self._w_covs[sid] = DatasetManagementService._get_simplex_coverage(dataset_id, mode='w')
            return self._w_covs[sid]
        except:
            self.resume_ingestion(sid)
            raise

    @classmethod
    def get_parser(cls, data_file_path, config_path=None):
        return SimpleDelimitedParser.get_parser(data_file_path, config_path=config_path)

    def manual_upload(self, dataset_id, data_file_path, config_path=None):
        # First, ensure we can get a parser and parse the data file
        parser = self.get_parser(data_file_path, config_path)
        dat = parser.parse()

        # Get the coverage
        with self.get_editable_coverage(dataset_id) as cov:
            # Find the indices for the times in the data file
            try:
                time_dat = dat[cov.temporal_parameter_name]
            except ValueError, ve:
                if ve.message == 'field named %s not found.' % cov.temporal_parameter_name:
                    raise ValueError('Temporal parameter name {0} not in upload data'.format(cov.temporal_parameter_name))
                else:
                    raise
            cov_times = cov.get_time_values()
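            # Map each uploaded timestamp onto the index of the nearest existing
            # coverage time so the parsed values overwrite the matching records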
            tinds = [utils.find_nearest_index(cov_times, ti) for ti in time_dat]

            sl = (tinds,)
            cparams = cov.list_parameters()
            for n in dat.dtype.names:
                if n != cov.temporal_parameter_name:
                    if n in cparams:
                        cov.set_parameter_values(n, dat[n], sl)
                    else:
                        warn_user('Skipping column \'%s\': matching parameter not found in coverage!' % n)
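
DirectCoverageAccess above is meant to be used as a context manager, so that paused ingestion
streams are resumed and open coverages are closed automatically. A minimal usage sketch follows;
it is not taken from the original code base, and the dataset id and upload file path are
hypothetical placeholders.

# Hedged usage sketch for DirectCoverageAccess (hypothetical dataset_id and file path)
dataset_id = 'some_dataset_id'  # placeholder
with DirectCoverageAccess() as dca:
    # Inspect the coverage backing the dataset without pausing ingestion
    cov = dca.get_read_only_coverage(dataset_id)
    print cov.list_parameters()

    # Upload a delimited data file; ingestion for the dataset's stream is
    # paused by get_editable_coverage() and resumed again on clean_up()
    dca.manual_upload(dataset_id, '/tmp/ctd_upload.csv')
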
Example #45
class TestLoader(IonIntegrationTestCase):
    def setUp(self):
        # Start container
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2deploy.yml')
        self.ingestion_management = IngestionManagementServiceClient()
        self.rr = self.container.resource_registry

    def _perform_preload(self, load_cfg):
        #load_cfg["ui_path"] = "res/preload/r2_ioc/ui_assets"
        #load_cfg["path"] = "R2PreloadedResources.xlsx"
        #load_cfg["assetmappings"] = "OOIPreload.xlsx"
        self.container.spawn_process("Loader",
                                     "ion.processes.bootstrap.ion_loader",
                                     "IONLoader",
                                     config=load_cfg)

    def _preload_instrument(self, inst_scenario):
        load_cfg = dict(
            op="load",
            scenario=inst_scenario,
            attachments="res/preload/r2_ioc/attachments",
            assets='res/preload/r2_ioc/ooi_assets',
        )
        self._perform_preload(load_cfg)

    def _preload_ui(self, ui_path="default"):
        load_cfg = dict(
            op="load",
            loadui=True,
            ui_path=ui_path,
        )
        self._perform_preload(load_cfg)

    def _preload_cfg(self, cfg, path=TEST_PATH):
        load_cfg = dict(cfg=cfg, path=path)
        self._perform_preload(load_cfg)

    def _preload_scenario(self,
                          scenario,
                          path=TEST_PATH,
                          idmap=False,
                          **kwargs):
        load_cfg = dict(op="load",
                        scenario=scenario,
                        attachments="res/preload/r2_ioc/attachments",
                        path=path,
                        idmap=idmap)
        load_cfg.update(kwargs)
        self._perform_preload(load_cfg)

    def _preload_ooi(self, path=TEST_PATH):
        load_cfg = dict(
            op="load",
            loadooi=True,
            assets="res/preload/r2_ioc/ooi_assets",
            path=path,
            ooiuntil="12/31/2013",
        )
        self._perform_preload(load_cfg)

    # -------------------------------------------------------------------------

    @attr('PRELOAD')
    def test_ui_valid(self):
        """ make sure UI assets are valid using DEFAULT_UI_ASSETS = 'http://userexperience.oceanobservatories.org/database-exports/Stable' """
        self._preload_ui(ui_path='default')
        obj_list, _ = self.rr.find_resources(restype=RT.UISpec,
                                             name="ION UI Specs",
                                             id_only=False)
        self.assertEquals(len(obj_list), 1)

    @attr('PRELOAD')
    def test_ui_candidates_valid(self):
        """ make sure UI assets are valid using DEFAULT_UI_ASSETS = 'http://userexperience.oceanobservatories.org/database-exports/Candidates' """
        self._preload_ui(ui_path='candidate')
        obj_list, _ = self.rr.find_resources(restype=RT.UISpec,
                                             name="ION UI Specs",
                                             id_only=False)
        self.assertEquals(len(obj_list), 1)

    @attr('PRELOAD')
    def test_betademo_valid(self):
        """ make sure can load asset DB """
        self._preload_scenario("BETA,R2_DEMO,RSN_OMS", path=TEST_PATH)
        self._preload_ooi(path=TEST_PATH)

        # check that the deployment port_assignments subobject is created correctly

        # collect a set of deployments
        deploy_list = []
        # DEP3 of PDEV3
        obj_list, _ = self.rr.find_resources(restype=RT.Deployment,
                                             name="Platform Deployment",
                                             id_only=False)
        deploy_list.extend(obj_list)
        log.debug('test_betademo_valid DEP3:  %s ', obj_list)
        # DEP4 of PDEV4
        obj_list, _ = self.rr.find_resources(restype=RT.Deployment,
                                             name="dep4",
                                             id_only=False)
        log.debug('test_betademo_valid DEP4:  %s ', obj_list)
        deploy_list.extend(obj_list)
        self.assertEquals(len(deploy_list), 2)

        for dply_obj in deploy_list:

            for dev_id, platform_port in dply_obj.port_assignments.iteritems():
                # all values in the port assignments dict should be PlatformPort objects
                self.assertEquals(platform_port.type_, OT.PlatformPort)

    @attr('PRELOAD')
    def test_incremental(self):
        """ make sure R2_DEMO scenario in master google doc
            is valid and self-contained (doesn't rely on rows from other scenarios except BETA)
            NOTE: test will pass/fail based on current google doc, not just code changes.
        """
        self._preload_cfg("res/preload/r2_ioc/config/ooi_load_config.yml",
                          path=TEST_PATH)
        self._preload_scenario("OOIR2_DEMO", path=TEST_PATH, idmap=True)

        dp_list1, _ = self.rr.find_resources(restype=RT.DataProduct,
                                             id_only=True)
        ia_list1, _ = self.rr.find_resources(restype=RT.InstrumentAgent,
                                             id_only=True)

        self._preload_cfg("res/preload/r2_ioc/config/ooi_instruments.yml",
                          path=TEST_PATH)

        ia_list2, _ = self.rr.find_resources(restype=RT.InstrumentAgent,
                                             id_only=True)
        self.assertGreater(len(ia_list2), len(ia_list1))
        dp_list2, _ = self.rr.find_resources(restype=RT.DataProduct,
                                             id_only=True)
        self.assertGreater(len(dp_list2), len(dp_list1))
        id_list2, _ = self.rr.find_resources(restype=RT.InstrumentDevice,
                                             id_only=True)

        self._preload_ooi(path=TEST_PATH)

        dp_list3, _ = self.rr.find_resources(restype=RT.DataProduct,
                                             id_only=True)
        self.assertGreater(len(dp_list3), len(dp_list2))
        id_list3, _ = self.rr.find_resources(restype=RT.InstrumentDevice,
                                             id_only=True)
        self.assertEquals(len(id_list3), len(id_list2))

        self._preload_ooi(path=TEST_PATH)

        dp_list4, _ = self.rr.find_resources(restype=RT.DataProduct,
                                             id_only=True)
        self.assertEquals(len(dp_list4), len(dp_list3))
        id_list4, _ = self.rr.find_resources(restype=RT.InstrumentDevice,
                                             id_only=True)
        self.assertEquals(len(id_list4), len(id_list3))

    def find_object_by_name(self, name, resource_type):
        objects, _ = self.container.resource_registry.find_resources(
            resource_type, name=name, id_only=False)
        self.assertEquals(len(objects), 1)

        return objects[0]

    @attr('INT', group='loader')
    @attr('SMOKE', group='loader')
    def test_row_values(self):
        """ use only rows from NOSE scenario for specific names and details included in this test.
            rows in NOSE may rely on entries in BETA scenarios,
            but should not specifically test values from those scenarios.
        """

        # first make sure this scenario loads successfully
        self._preload_scenario("BETA,NOSE")

        # check for ExternalDataset
        eds = self.find_object_by_name('Test External CTD Dataset',
                                       RT.ExternalDataset)
        edm1 = self.find_object_by_name('Test External CTD Dataset Model',
                                        RT.ExternalDatasetModel)
        edm2, _ = self.container.resource_registry.find_objects(
            eds._id, PRED.hasModel, RT.ExternalDatasetModel, True)
        self.assertEquals(edm1._id, edm2[0])

        inst = self.find_object_by_name('Test External CTD Agent Instance',
                                        RT.ExternalDatasetAgentInstance)
        self.assertEquals('value1',
                          inst.driver_config['key1'],
                          msg='driver_config[key1] is not value1:\n%r' %
                          inst.driver_config)

        # check for an Org
        org = self.find_object_by_name('CASPER', RT.Org)
        self.assertFalse(org.contacts is None)
        self.assertEquals('Userbrough', org.contacts[0].individual_name_family)
        self.assertEquals('primary', org.contacts[0].roles[0])

        # check data product
        dp = self.find_object_by_name('Test DP L0 CTD', RT.DataProduct)
        # should be persisted
        streams, _ = self.container.resource_registry.find_objects(
            dp._id, PRED.hasStream, RT.Stream, True)
        self.assertTrue(streams)
        self.assertEquals(1, len(streams))
        self.assertTrue(self.ingestion_management.is_persisted(streams[0]))
        self.assertAlmostEqual(
            32.88237,
            dp.geospatial_bounds.geospatial_latitude_limit_north,
            places=3)

        # but L1 data product should not be persisted
        dp = self.find_object_by_name('Test DP L1 conductivity',
                                      RT.DataProduct)
        streams, _ = self.container.resource_registry.find_objects(
            dp._id, PRED.hasStream, RT.Stream, True)
        self.assertEquals(1, len(streams))
        self.assertTrue(streams)
        self.assertFalse(self.ingestion_management.is_persisted(streams[0]))

        site = self.find_object_by_name('Test Instrument Site',
                                        RT.InstrumentSite)
        self.assertFalse(site.constraint_list is None)
        self.assertEquals(2, len(site.constraint_list))
        con = site.constraint_list[0]
        self.assertAlmostEqual(32.88237,
                               con.geospatial_latitude_limit_north,
                               places=3)
        self.assertAlmostEqual(-117.23214,
                               con.geospatial_longitude_limit_east,
                               places=3)
        con = site.constraint_list[1]
        self.assertEquals('TemporalBounds', con.type_)
        # check that coordinate system was loaded
        self.assertFalse(site.coordinate_reference_system is None)

        # check that InstrumentDevice contacts are loaded
        dev = self.find_object_by_name('Unit Test SMB37', RT.InstrumentDevice)
        self.assertTrue(len(dev.contacts) == 2)
        self.assertEquals('Userbrough', dev.contacts[0].individual_name_family)

        # check has attachments
        attachments = self.container.resource_registry.find_attachments(
            dev._id)
        self.assertTrue(len(attachments) > 0)

        # check for platform agents
        agent = self.find_object_by_name('Unit Test Platform Agent',
                                         RT.PlatformAgent)
        self.assertEquals(2, len(agent.stream_configurations))
        parsed = agent.stream_configurations[1]
        #        self.assertEquals('platform_eng_parsed', parsed.parameter_dictionary_name)
        self.assertEquals('ctd_parsed_param_dict',
                          parsed.parameter_dictionary_name)
        # OBSOLETE: check that alarm was added to StreamConfig
        #        self.assertEquals(1, len(parsed.alarms), msg='alarms: %r'%parsed.alarms)
        #        self.assertEquals('temp', parsed.alarms[0]['kwargs']['value_id'])

        # check for platform agents
        self.find_object_by_name('Unit Test Platform Agent Instance',
                                 RT.PlatformAgentInstance)

        # check for platform model boolean values
        model = self.find_object_by_name('Nose Testing Platform Model',
                                         RT.PlatformModel)
        self.assertEquals(True, model.shore_networked)
        self.assertNotEqual('str', model.shore_networked.__class__.__name__)

        iai = self.find_object_by_name("Test InstrumentAgentInstance",
                                       RT.InstrumentAgentInstance)
        self.assertEqual(
            {
                'SCHEDULER': {
                    'VERSION': {
                        'number': 3.0
                    },
                    'CLOCK_SYNC': 48.2,
                    'ACQUIRE_STATUS': {}
                },
                'PARAMETERS': {
                    "TXWAVESTATS": False,
                    'TXWAVEBURST': 'false',
                    'TXREALTIME': True
                }
            }, iai.startup_config)
        self.assertEqual(2, len(iai.alerts))

        pai = self.find_object_by_name("Unit Test Platform Agent Instance",
                                       RT.PlatformAgentInstance)
        self.assertEqual(1, len(pai.alerts))
        self.assertTrue(pai.agent_config.has_key('platform_config'))
        log.debug('test_row_values PlatformAgentInstance driver_config: %s ',
                  pai.driver_config)

        self.assertTrue(pai.driver_config.has_key('oms_uri'))
        oms_uri = pai.driver_config['oms_uri']
        log.debug('test_row_values PlatformAgentInstance oms_uri: %s ',
                  oms_uri)

        self.assertEquals('http://*****:*****@10.180.80.10:9021/', oms_uri)

        orgs, _ = self.container.resource_registry.find_subjects(
            RT.Org, PRED.hasResource, iai._id, True)
        self.assertEqual(1, len(orgs))
        self.assertEqual(org._id, orgs[0])

        entries, _ = self.container.resource_registry.find_resources(
            RT.SchedulerEntry, id_only=False)
        self.assertGreaterEqual(len(entries), 1)

    @attr('PRELOAD')
    def test_alpha_valid(self):
        """ make sure R2_DEMO scenario in master google doc
            is valid and self-contained (doesn't rely on rows from other scenarios except BETA)
            NOTE: test will pass/fail based on current google doc, not just code changes.
        """
        self._preload_cfg("res/preload/r2_ioc/config/ooi_alpha.yml",
                          path=TEST_PATH)

    @attr('PRELOAD')
    def test_beta_valid(self):
        """ make sure R2_DEMO scenario in master google doc
            is valid and self-contained (doesn't rely on rows from other scenarios except BETA)
            NOTE: test will pass/fail based on current google doc, not just code changes.
        """
        self._preload_cfg("res/preload/r2_ioc/config/ooi_beta.yml",
                          path=TEST_PATH)

        failure_list = []

        def add_failure(res_obj, msg):
            fail_msg = "%s[%s/%s]: %s" % (res_obj.type_, res_obj._id,
                                          res_obj.name, msg)
            failure_list.append(fail_msg)

        log.warn("Starting preload assertions now")

        res_objs, res_keys = self.rr.find_resources_ext(alt_id_ns="PRE",
                                                        id_only=False)

        log.info("Found %s preloaded resources", len(res_objs))

        dp_objs = [res for res in res_objs if res.type_ == RT.DataProduct]

        log.info("Checking %s DataProducts", len(dp_objs))
        for dp in dp_objs:
            pass
            # Reenable this when we have geospatial coordinates for PNs
            #if not all([dp.geospatial_bounds.geospatial_latitude_limit_north,
            #    dp.geospatial_bounds.geospatial_latitude_limit_south,
            #    dp.geospatial_bounds.geospatial_longitude_limit_east,
            #    dp.geospatial_bounds.geospatial_longitude_limit_west]):
            #    add_failure(dp, "geospatial_bounds location invalid: %s" % dp.geospatial_bounds)

            #if not all([dp.geospatial_bounds.geospatial_vertical_min,
            #    dp.geospatial_bounds.geospatial_vertical_max]):
            #    add_failure(dp, "geospatial_bounds vertical invalid: %s" % dp.geospatial_bounds)

        if failure_list:
            fail_msg = "Preload assertions violated:\n" + "\n".join(
                f for f in failure_list)
            self.fail(fail_msg)
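
The preload helpers above all reduce to spawning the IONLoader bootstrap process with a
configuration dictionary. A rough standalone sketch, assuming an already-started pyon container
object named container (hypothetical here):

# Hedged sketch: drive the same preload path outside of a test class.
# 'container' is assumed to be a running pyon container instance.
load_cfg = dict(
    op="load",
    scenario="BETA,NOSE",
    attachments="res/preload/r2_ioc/attachments",
)
container.spawn_process("Loader",
                        "ion.processes.bootstrap.ion_loader",
                        "IONLoader",
                        config=load_cfg)
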
Example #46
    def test_dm_integration(self):
        '''
        test_salinity_transform
        Test full DM Services Integration
        '''
        cc = self.container
        assertions = self.assertTrue

        #-----------------------------
        # Copy below here to run as a script (don't forget the imports of course!)
        #-----------------------------

        # Create some service clients...
        pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
        ingestion_management_service = IngestionManagementServiceClient(
            node=cc.node)
        dataset_management_service = DatasetManagementServiceClient(
            node=cc.node)
        data_retriever_service = DataRetrieverServiceClient(node=cc.node)
        transform_management_service = TransformManagementServiceClient(
            node=cc.node)
        process_dispatcher = ProcessDispatcherServiceClient(node=cc.node)

        # declare some handy variables

        datastore_name = 'test_dm_integration'

        ###
        ### In the beginning there were two stream definitions...
        ###
        # create a stream definition for the data from the ctd simulator
        ctd_stream_def = SBE37_CDM_stream_definition()
        ctd_stream_def_id = pubsub_management_service.create_stream_definition(
            container=ctd_stream_def, name='Simulated CTD data')

        # create a stream definition for the data from the salinity Transform
        sal_stream_def_id = pubsub_management_service.create_stream_definition(
            container=SalinityTransform.outgoing_stream_def,
            name='Scalar Salinity data stream')

        ###
        ### And two process definitions...
        ###
        # one for the ctd simulator...
        producer_definition = ProcessDefinition()
        producer_definition.executable = {
            'module': 'ion.processes.data.ctd_stream_publisher',
            'class': 'SimpleCtdPublisher'
        }

        ctd_sim_procdef_id = process_dispatcher.create_process_definition(
            process_definition=producer_definition)

        # one for the salinity transform
        producer_definition = ProcessDefinition()
        producer_definition.executable = {
            'module': 'ion.processes.data.transforms.ctd.ctd_L2_salinity',
            'class': 'SalinityTransform'
        }

        salinity_transform_procdef_id = process_dispatcher.create_process_definition(
            process_definition=producer_definition)

        #---------------------------
        # Set up ingestion - this is an operator concern - not done by SA in a deployed system
        #---------------------------
        # Configure ingestion using eight workers, ingesting to test_dm_integration datastore with the SCIDATA profile
        log.debug('Calling create_ingestion_configuration')
        ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
            exchange_point_id='science_data',
            couch_storage=CouchStorage(datastore_name=datastore_name,
                                       datastore_profile='SCIDATA'),
            number_of_workers=1)
        #
        ingestion_management_service.activate_ingestion_configuration(
            ingestion_configuration_id=ingestion_configuration_id)

        #---------------------------
        # Set up the producer (CTD Simulator)
        #---------------------------

        # Create the stream
        ctd_stream_id = pubsub_management_service.create_stream(
            stream_definition_id=ctd_stream_def_id)

        # Set up the datasets
        ctd_dataset_id = dataset_management_service.create_dataset(
            stream_id=ctd_stream_id,
            datastore_name=datastore_name,
            view_name='datasets/stream_join_granule')

        # Configure ingestion of this dataset
        ctd_dataset_config_id = ingestion_management_service.create_dataset_configuration(
            dataset_id=ctd_dataset_id,
            archive_data=True,
            archive_metadata=True,
            ingestion_configuration_id=
            ingestion_configuration_id,  # you need to know the ingestion configuration id!
        )
        # Hold onto ctd_dataset_config_id if you want to stop/start ingestion of that dataset by the ingestion service

        #---------------------------
        # Set up the salinity transform
        #---------------------------

        # Create the stream
        sal_stream_id = pubsub_management_service.create_stream(
            stream_definition_id=sal_stream_def_id)

        # Set up the datasets
        sal_dataset_id = dataset_management_service.create_dataset(
            stream_id=sal_stream_id,
            datastore_name=datastore_name,
            view_name='datasets/stream_join_granule')

        # Configure ingestion of the salinity as a dataset
        sal_dataset_config_id = ingestion_management_service.create_dataset_configuration(
            dataset_id=sal_dataset_id,
            archive_data=True,
            archive_metadata=True,
            ingestion_configuration_id=
            ingestion_configuration_id,  # you need to know the ingestion configuration id!
        )
        # Hold onto sal_dataset_config_id if you want to stop/start ingestion of that dataset by the ingestion service

        # Create a subscription as input to the transform
        sal_transform_input_subscription_id = pubsub_management_service.create_subscription(
            query=StreamQuery(stream_ids=[
                ctd_stream_id,
            ]),
            exchange_name='salinity_transform_input'
        )  # how do we make these names??? i.e. Should they be anonymous?

        # create the salinity transform
        sal_transform_id = transform_management_service.create_transform(
            name='example salinity transform',
            in_subscription_id=sal_transform_input_subscription_id,
            out_streams={
                'output': sal_stream_id,
            },
            process_definition_id=salinity_transform_procdef_id,
            # no configuration needed at this time...
        )
        # start the transform - for a test case it makes sense to do it before starting the producer but it is not required
        transform_management_service.activate_transform(
            transform_id=sal_transform_id)

        # Start the ctd simulator to produce some data
        configuration = {
            'process': {
                'stream_id': ctd_stream_id,
            }
        }
        ctd_sim_pid = process_dispatcher.schedule_process(
            process_definition_id=ctd_sim_procdef_id,
            configuration=configuration)

        ###
        ### Make a subscriber in the test to listen for salinity data
        ###
        salinity_subscription_id = pubsub_management_service.create_subscription(
            query=StreamQuery([
                sal_stream_id,
            ]),
            exchange_name='salinity_test',
            name="test salinity subscription",
        )

        pid = cc.spawn_process(name='dummy_process_for_test',
                               module='pyon.ion.process',
                               cls='SimpleProcess',
                               config={})
        dummy_process = cc.proc_manager.procs[pid]

        subscriber_registrar = StreamSubscriberRegistrar(process=dummy_process,
                                                         node=cc.node)

        result = gevent.event.AsyncResult()
        results = []

        def message_received(message, headers):
            # headers are unused; just collect the message
            log.warn('Salinity data received!')
            results.append(message)
            if len(results) > 3:
                result.set(True)

        subscriber = subscriber_registrar.create_subscriber(
            exchange_name='salinity_test', callback=message_received)
        subscriber.start()

        # after the queue has been created it is safe to activate the subscription
        pubsub_management_service.activate_subscription(
            subscription_id=salinity_subscription_id)

        # Assert that we have received data
        assertions(result.get(timeout=10))

        # stop the flow and parse the messages...
        process_dispatcher.cancel_process(
            ctd_sim_pid
        )  # kill the ctd simulator process - that is enough data

        for message in results:

            psd = PointSupplementStreamParser(
                stream_definition=SalinityTransform.outgoing_stream_def,
                stream_granule=message)

            # Test the handy info method for the names of fields in the stream def
            assertions('salinity' in psd.list_field_names())

            # you have to know the name of the coverage in stream def
            salinity = psd.get_values('salinity')

            import numpy

            assertions(isinstance(salinity, numpy.ndarray))

            assertions(numpy.nanmin(salinity) >
                       0.0)  # salinity should always be greater than 0
Example #47
class TestDMEnd2End(IonIntegrationTestCase):
    def setUp(self): # Love the non pep-8 convention
        self._start_container()

        self.container.start_rel_from_url('res/deploy/r2deploy.yml')

        self.process_dispatcher   = ProcessDispatcherServiceClient()
        self.pubsub_management    = PubsubManagementServiceClient()
        self.resource_registry    = ResourceRegistryServiceClient()
        self.dataset_management   = DatasetManagementServiceClient()
        self.ingestion_management = IngestionManagementServiceClient()
        self.data_retriever       = DataRetrieverServiceClient()
        self.pids                 = []
        self.event                = Event()
        self.exchange_space_name  = 'test_granules'
        self.exchange_point_name  = 'science_data'       
        self.i                    = 0

        self.purge_queues()
        self.queue_buffer         = []
        self.streams = []
        self.addCleanup(self.stop_all_ingestion)

    def purge_queues(self):
        xn = self.container.ex_manager.create_xn_queue('science_granule_ingestion')
        xn.purge()
        

    def tearDown(self):
        self.purge_queues()
        for pid in self.pids:
            self.container.proc_manager.terminate_process(pid)
        IngestionManagementIntTest.clean_subscriptions()
        for queue in self.queue_buffer:
            if isinstance(queue, ExchangeNameQueue):
                queue.delete()
            elif isinstance(queue, str):
                xn = self.container.ex_manager.create_xn_queue(queue)
                xn.delete()

    #--------------------------------------------------------------------------------
    # Helper/Utility methods
    #--------------------------------------------------------------------------------
        
    def create_dataset(self, parameter_dict_id=''):
        '''
        Creates a time-series dataset
        '''
        tdom, sdom = time_series_domain()
        sdom = sdom.dump()
        tdom = tdom.dump()
        if not parameter_dict_id:
            parameter_dict_id = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict', id_only=True)

        dataset_id = self.dataset_management.create_dataset('test_dataset_%i'%self.i, parameter_dictionary_id=parameter_dict_id, spatial_domain=sdom, temporal_domain=tdom)
        return dataset_id
    
    def get_datastore(self, dataset_id):
        '''
        Gets an instance of the datastore
            This method is primarily used to defeat a bug where integration tests in multiple containers may sometimes 
            delete a CouchDB datastore and the other containers are unaware of the new state of the datastore.
        '''
        dataset = self.dataset_management.read_dataset(dataset_id)
        datastore_name = dataset.datastore_name
        datastore = self.container.datastore_manager.get_datastore(datastore_name, DataStore.DS_PROFILE.SCIDATA)
        return datastore
    
    def get_ingestion_config(self):
        '''
        Grab the ingestion configuration from the resource registry
        '''
        # The ingestion configuration should have been created by the bootstrap service 
        # which is configured through r2deploy.yml

        ingest_configs, _  = self.resource_registry.find_resources(restype=RT.IngestionConfiguration,id_only=True)
        return ingest_configs[0]

    def launch_producer(self, stream_id=''):
        '''
        Launch the producer
        '''

        pid = self.container.spawn_process('better_data_producer', 'ion.processes.data.example_data_producer', 'BetterDataProducer', {'process':{'stream_id':stream_id}})

        self.pids.append(pid)

    def make_simple_dataset(self):
        '''
        Makes a stream, a stream definition and a dataset, the essentials for most of these tests
        '''
        pdict_id             = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict', id_only=True)
        stream_def_id        = self.pubsub_management.create_stream_definition('ctd data', parameter_dictionary_id=pdict_id)
        stream_id, route     = self.pubsub_management.create_stream('ctd stream %i' % self.i, 'xp1', stream_definition_id=stream_def_id)

        dataset_id = self.create_dataset(pdict_id)

        self.get_datastore(dataset_id)
        self.i += 1
        return stream_id, route, stream_def_id, dataset_id

    def publish_hifi(self,stream_id,stream_route,offset=0):
        '''
        Publish deterministic data
        '''

        pub = StandaloneStreamPublisher(stream_id, stream_route)

        stream_def = self.pubsub_management.read_stream_definition(stream_id=stream_id)
        stream_def_id = stream_def._id
        rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
        rdt['time'] = np.arange(10) + (offset * 10)
        rdt['temp'] = np.arange(10) + (offset * 10)
        pub.publish(rdt.to_granule())

    def publish_fake_data(self,stream_id, route):
        '''
        Make four granules
        '''
        for i in xrange(4):
            self.publish_hifi(stream_id,route,i)

    def start_ingestion(self, stream_id, dataset_id):
        '''
        Starts ingestion/persistence for a given dataset
        '''
        ingest_config_id = self.get_ingestion_config()
        self.ingestion_management.persist_data_stream(stream_id=stream_id, ingestion_configuration_id=ingest_config_id, dataset_id=dataset_id)
    
    def stop_ingestion(self, stream_id):
        ingest_config_id = self.get_ingestion_config()
        self.ingestion_management.unpersist_data_stream(stream_id=stream_id, ingestion_configuration_id=ingest_config_id)
        
    def stop_all_ingestion(self):
        try:
            [self.stop_ingestion(sid) for sid in self.streams]
        except:
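            # best-effort cleanup during teardown; ignore any errors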
            pass

    def validate_granule_subscription(self, msg, route, stream_id):
        '''
        Validation for granule format
        '''
        if msg == {}:
            return
        rdt = RecordDictionaryTool.load_from_granule(msg)
        log.info('%s', rdt.pretty_print())
        self.assertIsInstance(msg,Granule,'Message is improperly formatted. (%s)' % type(msg))
        self.event.set()

    def wait_until_we_have_enough_granules(self, dataset_id='',data_size=40):
        '''
        Loops until there is a sufficient amount of data in the dataset
        '''
        done = False
        with gevent.Timeout(40):
            while not done:
                extents = self.dataset_management.dataset_extents(dataset_id, 'time')[0]
                granule = self.data_retriever.retrieve_last_data_points(dataset_id, 1)
                rdt     = RecordDictionaryTool.load_from_granule(granule)
                if rdt['time'] and rdt['time'][0] != rdt._pdict.get_context('time').fill_value and extents >= data_size:
                    done = True
                else:
                    gevent.sleep(0.2)




    #--------------------------------------------------------------------------------
    # Test Methods
    #--------------------------------------------------------------------------------

    @attr('SMOKE') 
    def test_dm_end_2_end(self):
        #--------------------------------------------------------------------------------
        # Set up a stream and have a mock instrument (producer) send data
        #--------------------------------------------------------------------------------
        self.event.clear()

        # Get a precompiled parameter dictionary with basic ctd fields
        pdict_id = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict',id_only=True)
        context_ids = self.dataset_management.read_parameter_contexts(pdict_id, id_only=True)

        # Add a field that supports binary data input.
        bin_context = ParameterContext('binary',  param_type=ArrayType())
        context_ids.append(self.dataset_management.create_parameter_context('binary', bin_context.dump()))
        # Add another field that supports dictionary elements.
        rec_context = ParameterContext('records', param_type=RecordType())
        context_ids.append(self.dataset_management.create_parameter_context('records', rec_context.dump()))

        pdict_id = self.dataset_management.create_parameter_dictionary('replay_pdict', parameter_context_ids=context_ids, temporal_context='time')
        
        stream_definition = self.pubsub_management.create_stream_definition('ctd data', parameter_dictionary_id=pdict_id)


        stream_id, route = self.pubsub_management.create_stream('producer', exchange_point=self.exchange_point_name, stream_definition_id=stream_definition)




        #--------------------------------------------------------------------------------
        # Start persisting the data on the stream 
        # - Get the ingestion configuration from the resource registry
        # - Create the dataset
        # - call persist_data_stream to setup the subscription for the ingestion workers
        #   on the stream that you specify which causes the data to be persisted
        #--------------------------------------------------------------------------------

        ingest_config_id = self.get_ingestion_config()
        dataset_id = self.create_dataset(pdict_id)
        self.ingestion_management.persist_data_stream(stream_id=stream_id, ingestion_configuration_id=ingest_config_id, dataset_id=dataset_id)

        #--------------------------------------------------------------------------------
        # Now the granules are ingesting and persisted
        #--------------------------------------------------------------------------------

        self.launch_producer(stream_id)
        self.wait_until_we_have_enough_granules(dataset_id,40)
        
        #--------------------------------------------------------------------------------
        # Now get the data in one chunk using an RPC call to retrieve
        #--------------------------------------------------------------------------------
        
        replay_data = self.data_retriever.retrieve(dataset_id)
        self.assertIsInstance(replay_data, Granule)
        rdt = RecordDictionaryTool.load_from_granule(replay_data)
        self.assertTrue((rdt['time'][:10] == np.arange(10)).all(),'%s' % rdt['time'][:])
        self.assertTrue((rdt['binary'][:10] == np.array(['hi']*10, dtype='object')).all())

        
        #--------------------------------------------------------------------------------
        # Now to try the streamed approach
        #--------------------------------------------------------------------------------
        replay_stream_id, replay_route = self.pubsub_management.create_stream('replay_out', exchange_point=self.exchange_point_name, stream_definition_id=stream_definition)
        self.replay_id, process_id =  self.data_retriever.define_replay(dataset_id=dataset_id, stream_id=replay_stream_id)
        log.info('Process ID: %s', process_id)

        replay_client = ReplayClient(process_id)

    
        #--------------------------------------------------------------------------------
        # Create the listening endpoint for the retriever to talk to
        #--------------------------------------------------------------------------------
        xp = self.container.ex_manager.create_xp(self.exchange_point_name)
        subscriber = StandaloneStreamSubscriber(self.exchange_space_name, self.validate_granule_subscription)
        self.queue_buffer.append(self.exchange_space_name)
        subscriber.start()
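        # Bind the subscriber's queue to the replay stream's routing key on the
        # science_data exchange point so replayed granules reach the validator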
        subscriber.xn.bind(replay_route.routing_key, xp)

        self.data_retriever.start_replay_agent(self.replay_id)

        self.assertTrue(replay_client.await_agent_ready(5), 'The process never launched')
        replay_client.start_replay()
        
        self.assertTrue(self.event.wait(10))
        subscriber.stop()

        self.data_retriever.cancel_replay_agent(self.replay_id)


        #--------------------------------------------------------------------------------
        # Test the slicing capabilities
        #--------------------------------------------------------------------------------

        granule = self.data_retriever.retrieve(dataset_id=dataset_id, query={'tdoa':slice(0,5)})
        rdt = RecordDictionaryTool.load_from_granule(granule)
        b = rdt['time'] == np.arange(5)
        self.assertTrue(b.all() if not isinstance(b,bool) else b)
        self.streams.append(stream_id)
        self.stop_ingestion(stream_id)


    def test_coverage_transform(self):
        ph = ParameterHelper(self.dataset_management, self.addCleanup)
        pdict_id = ph.create_parsed()
        stream_def_id = self.pubsub_management.create_stream_definition('ctd parsed', parameter_dictionary_id=pdict_id)
        self.addCleanup(self.pubsub_management.delete_stream_definition, stream_def_id)

        stream_id, route = self.pubsub_management.create_stream('example', exchange_point=self.exchange_point_name, stream_definition_id=stream_def_id)
        self.addCleanup(self.pubsub_management.delete_stream, stream_id)

        ingestion_config_id = self.get_ingestion_config()
        dataset_id = self.create_dataset(pdict_id)

        self.ingestion_management.persist_data_stream(stream_id=stream_id, ingestion_configuration_id=ingestion_config_id, dataset_id=dataset_id)
        self.addCleanup(self.ingestion_management.unpersist_data_stream, stream_id, ingestion_config_id)
        publisher = StandaloneStreamPublisher(stream_id, route)
        
        rdt = ph.get_rdt(stream_def_id)
        ph.fill_parsed_rdt(rdt)

        dataset_monitor = DatasetMonitor(dataset_id)
        self.addCleanup(dataset_monitor.stop)

        publisher.publish(rdt.to_granule())
        self.assertTrue(dataset_monitor.event.wait(30))

        replay_granule = self.data_retriever.retrieve(dataset_id)
        rdt_out = RecordDictionaryTool.load_from_granule(replay_granule)

        np.testing.assert_array_almost_equal(rdt_out['time'], rdt['time'])
        np.testing.assert_array_almost_equal(rdt_out['temp'], rdt['temp'])

        np.testing.assert_array_almost_equal(rdt_out['conductivity_L1'], np.array([42.914]))
        np.testing.assert_array_almost_equal(rdt_out['temp_L1'], np.array([20.]))
        np.testing.assert_array_almost_equal(rdt_out['pressure_L1'], np.array([3.068]))
        np.testing.assert_array_almost_equal(rdt_out['density'], np.array([1021.7144739593881]))
        np.testing.assert_array_almost_equal(rdt_out['salinity'], np.array([30.935132729668283]))


    def test_qc_events(self):
        ph = ParameterHelper(self.dataset_management, self.addCleanup)
        pdict_id = ph.create_qc_pdict()
        stream_def_id = self.pubsub_management.create_stream_definition('qc stream def', parameter_dictionary_id=pdict_id)
        self.addCleanup(self.pubsub_management.delete_stream_definition, stream_def_id)

        stream_id, route = self.pubsub_management.create_stream('qc stream', exchange_point=self.exchange_point_name, stream_definition_id=stream_def_id)
        self.addCleanup(self.pubsub_management.delete_stream, stream_id)

        ingestion_config_id = self.get_ingestion_config()
        dataset_id = self.create_dataset(pdict_id)
        config = DotDict()

        self.ingestion_management.persist_data_stream(stream_id=stream_id, ingestion_configuration_id=ingestion_config_id, dataset_id=dataset_id, config=config)
        self.addCleanup(self.ingestion_management.unpersist_data_stream, stream_id, ingestion_config_id)

        publisher = StandaloneStreamPublisher(stream_id, route)
        rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
        rdt['time'] = np.arange(10)
        rdt['temp'] = np.arange(10) * 3

        verified = Event()
        def verification(event, *args, **kwargs):
            self.assertEquals(event.qc_parameter, 'temp_qc')
            self.assertEquals(event.temporal_value, 7)
            verified.set()

        es = EventSubscriber(event_type=OT.ParameterQCEvent, origin=dataset_id, callback=verification, auto_delete=True)
        es.start()
        self.addCleanup(es.stop)

        publisher.publish(rdt.to_granule())
        self.assertTrue(verified.wait(10))



    def test_lookup_values_ingest_replay(self):
        ph = ParameterHelper(self.dataset_management, self.addCleanup)
        pdict_id = ph.create_lookups()
        stream_def_id = self.pubsub_management.create_stream_definition('lookups', parameter_dictionary_id=pdict_id)
        self.addCleanup(self.pubsub_management.delete_stream_definition, stream_def_id)

        stream_id, route = self.pubsub_management.create_stream('example', exchange_point=self.exchange_point_name, stream_definition_id=stream_def_id)
        self.addCleanup(self.pubsub_management.delete_stream, stream_id)

        ingestion_config_id = self.get_ingestion_config()
        dataset_id = self.create_dataset(pdict_id)
        config = DotDict()
        config.process.lookup_docs = ['test1', 'test2']
        self.ingestion_management.persist_data_stream(stream_id=stream_id, ingestion_configuration_id=ingestion_config_id, dataset_id=dataset_id, config=config)
        self.addCleanup(self.ingestion_management.unpersist_data_stream, stream_id, ingestion_config_id)

        stored_value_manager = StoredValueManager(self.container)
        stored_value_manager.stored_value_cas('test1',{'offset_a':10.0, 'offset_b':13.1})
        
        publisher = StandaloneStreamPublisher(stream_id, route)
        rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
        rdt['time'] = np.arange(20)
        rdt['temp'] = [20.0] * 20

        granule = rdt.to_granule()

        dataset_monitor = DatasetMonitor(dataset_id)
        self.addCleanup(dataset_monitor.stop)

        publisher.publish(granule)
        self.assertTrue(dataset_monitor.event.wait(30))
        
        replay_granule = self.data_retriever.retrieve(dataset_id)
        rdt_out = RecordDictionaryTool.load_from_granule(replay_granule)

        np.testing.assert_array_almost_equal(rdt_out['time'], np.arange(20))
        np.testing.assert_array_almost_equal(rdt_out['temp'], np.array([20.] * 20))
        np.testing.assert_array_almost_equal(rdt_out['calibrated'], np.array([30.]*20))
        np.testing.assert_array_equal(rdt_out['offset_b'], np.array([rdt_out.fill_value('offset_b')] * 20))

        rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
        rdt['time'] = np.arange(20,40)
        rdt['temp'] = [20.0] * 20
        granule = rdt.to_granule()

        dataset_monitor.event.clear()

        stored_value_manager.stored_value_cas('test1',{'offset_a':20.0})
        stored_value_manager.stored_value_cas('coefficient_document',{'offset_b':10.0})
        gevent.sleep(2)

        publisher.publish(granule)
        self.assertTrue(dataset_monitor.event.wait(30))

        replay_granule = self.data_retriever.retrieve(dataset_id)
        rdt_out = RecordDictionaryTool.load_from_granule(replay_granule)

        np.testing.assert_array_almost_equal(rdt_out['time'], np.arange(40))
        np.testing.assert_array_almost_equal(rdt_out['temp'], np.array([20.] * 20 + [20.] * 20))
        np.testing.assert_array_equal(rdt_out['offset_b'], np.array([10.] * 40))
        np.testing.assert_array_almost_equal(rdt_out['calibrated'], np.array([30.]*20 + [40.]*20))
        np.testing.assert_array_almost_equal(rdt_out['calibrated_b'], np.array([40.] * 20 + [50.] * 20))



    @unittest.skip('Doesnt work')
    @attr('LOCOINT')
    @unittest.skipIf(os.getenv('CEI_LAUNCH_TEST', False), 'Skip test while in CEI LAUNCH mode')
    def test_replay_pause(self):
        # Get a precompiled parameter dictionary with basic ctd fields
        pdict_id = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict',id_only=True)
        context_ids = self.dataset_management.read_parameter_contexts(pdict_id, id_only=True)

        # Add a field that supports binary data input.
        bin_context = ParameterContext('binary',  param_type=ArrayType())
        context_ids.append(self.dataset_management.create_parameter_context('binary', bin_context.dump()))
        # Add another field that supports dictionary elements.
        rec_context = ParameterContext('records', param_type=RecordType())
        context_ids.append(self.dataset_management.create_parameter_context('records', rec_context.dump()))

        pdict_id = self.dataset_management.create_parameter_dictionary('replay_pdict', parameter_context_ids=context_ids, temporal_context='time')
        

        stream_def_id = self.pubsub_management.create_stream_definition('replay_stream', parameter_dictionary_id=pdict_id)
        replay_stream, replay_route = self.pubsub_management.create_stream('replay', 'xp1', stream_definition_id=stream_def_id)
        dataset_id = self.create_dataset(pdict_id)
        scov = DatasetManagementService._get_simplex_coverage(dataset_id)

        bb = CoverageCraft(scov)
        bb.rdt['time'] = np.arange(100)
        bb.rdt['temp'] = np.random.random(100) + 30
        bb.sync_with_granule()

        DatasetManagementService._persist_coverage(dataset_id, bb.coverage) # This invalidates it for multi-host configurations
        # Set up the subscriber to verify the data
        subscriber = StandaloneStreamSubscriber(self.exchange_space_name, self.validate_granule_subscription)
        xp = self.container.ex_manager.create_xp('xp1')
        self.queue_buffer.append(self.exchange_space_name)
        subscriber.start()
        subscriber.xn.bind(replay_route.routing_key, xp)

        # Set up the replay agent and the client wrapper

        # 1) Define the Replay (dataset and stream to publish on)
        self.replay_id, process_id = self.data_retriever.define_replay(dataset_id=dataset_id, stream_id=replay_stream)
        # 2) Make a client to interact with the process (optionally provide it a process to bind with)
        replay_client = ReplayClient(process_id)
        # 3) Start the agent (launch the process)
        self.data_retriever.start_replay_agent(self.replay_id)
        # 4) Start replaying...
        replay_client.start_replay()
        
        # Wait till we get some granules
        self.assertTrue(self.event.wait(5))
        
        # We got granules, pause the replay, clear the queue and allow the process to finish consuming
        replay_client.pause_replay()
        gevent.sleep(1)
        subscriber.xn.purge()
        self.event.clear()
        
        # Make sure there's no remaining messages being consumed
        self.assertFalse(self.event.wait(1))

        # Resume the replay and wait until we start getting granules again
        replay_client.resume_replay()
        self.assertTrue(self.event.wait(5))
    
        # Stop the replay, clear the queues
        replay_client.stop_replay()
        gevent.sleep(1)
        subscriber.xn.purge()
        self.event.clear()

        # Make sure that it did indeed stop
        self.assertFalse(self.event.wait(1))

        subscriber.stop()


    def test_retrieve_and_transform(self):
        # Make a simple dataset and start ingestion, pretty standard stuff.
        ctd_stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
        self.start_ingestion(ctd_stream_id, dataset_id)

        # Stream definition for the salinity data
        salinity_pdict_id = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict', id_only=True)
        sal_stream_def_id = self.pubsub_management.create_stream_definition('sal data', parameter_dictionary_id=salinity_pdict_id)


        rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
        rdt['time'] = np.arange(10)
        rdt['temp'] = np.random.randn(10) * 10 + 30
        rdt['conductivity'] = np.random.randn(10) * 2 + 10
        rdt['pressure'] = np.random.randn(10) * 1 + 12

        publisher = StandaloneStreamPublisher(ctd_stream_id, route)
        publisher.publish(rdt.to_granule())

        rdt['time'] = np.arange(10,20)

        publisher.publish(rdt.to_granule())


        self.wait_until_we_have_enough_granules(dataset_id, 20)

        granule = self.data_retriever.retrieve(dataset_id, 
                                             None,
                                             None, 
                                             'ion.processes.data.transforms.ctd.ctd_L2_salinity',
                                             'CTDL2SalinityTransformAlgorithm', 
                                             kwargs=dict(params=sal_stream_def_id))
        rdt = RecordDictionaryTool.load_from_granule(granule)
        for i in rdt['salinity']:
            self.assertNotEquals(i,0)
        self.streams.append(ctd_stream_id)
        self.stop_ingestion(ctd_stream_id)

    def test_last_granule(self):
        stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
        self.start_ingestion(stream_id, dataset_id)

        self.publish_hifi(stream_id,route, 0)
        self.publish_hifi(stream_id,route, 1)
        

        self.wait_until_we_have_enough_granules(dataset_id,20) # I just need two


        success = False
        def verifier():
            replay_granule = self.data_retriever.retrieve_last_data_points(dataset_id, 10)

            rdt = RecordDictionaryTool.load_from_granule(replay_granule)

            comp = rdt['time'] == np.arange(10) + 10
            if not isinstance(comp, bool):
                return comp.all()
            return False
        success = poll(verifier)

        self.assertTrue(success)

        success = False
        def verify_points():
            replay_granule = self.data_retriever.retrieve_last_data_points(dataset_id, 5)

            rdt = RecordDictionaryTool.load_from_granule(replay_granule)

            comp = rdt['time'] == np.arange(15, 20)
            if not isinstance(comp, bool):
                return comp.all()
            return False
        success = poll(verify_points)

        self.assertTrue(success)
        self.streams.append(stream_id)
        self.stop_ingestion(stream_id)

    def test_replay_with_parameters(self):
        #--------------------------------------------------------------------------------
        # Create the configurations and the dataset
        #--------------------------------------------------------------------------------
        # Get a precompiled parameter dictionary with basic ctd fields
        pdict_id = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict',id_only=True)
        context_ids = self.dataset_management.read_parameter_contexts(pdict_id, id_only=True)

        # Add a field that supports binary data input.
        bin_context = ParameterContext('binary',  param_type=ArrayType())
        context_ids.append(self.dataset_management.create_parameter_context('binary', bin_context.dump()))
        # Add another field that supports dictionary elements.
        rec_context = ParameterContext('records', param_type=RecordType())
        context_ids.append(self.dataset_management.create_parameter_context('records', rec_context.dump()))

        pdict_id = self.dataset_management.create_parameter_dictionary('replay_pdict', parameter_context_ids=context_ids, temporal_context='time')
        

        stream_def_id = self.pubsub_management.create_stream_definition('replay_stream', parameter_dictionary_id=pdict_id)
        
        stream_id, route  = self.pubsub_management.create_stream('replay_with_params', exchange_point=self.exchange_point_name, stream_definition_id=stream_def_id)
        config_id  = self.get_ingestion_config()
        dataset_id = self.create_dataset(pdict_id)
        self.ingestion_management.persist_data_stream(stream_id=stream_id, ingestion_configuration_id=config_id, dataset_id=dataset_id)

        dataset_monitor = DatasetMonitor(dataset_id)

        self.addCleanup(dataset_monitor.stop)

        self.publish_fake_data(stream_id, route)

        self.assertTrue(dataset_monitor.event.wait(30))

        query = {
            'start_time': 0 - 2208988800,
            'end_time':   20 - 2208988800,
            'stride_time' : 2,
            'parameters': ['time','temp']
        }
        retrieved_data = self.data_retriever.retrieve(dataset_id=dataset_id,query=query)

        rdt = RecordDictionaryTool.load_from_granule(retrieved_data)
        comp = np.arange(0,20,2) == rdt['time']
        self.assertTrue(comp.all(),'%s' % rdt.pretty_print())
        self.assertEquals(set(rdt.iterkeys()), set(['time','temp']))

        extents = self.dataset_management.dataset_extents(dataset_id=dataset_id, parameters=['time','temp'])
        self.assertTrue(extents['time']>=20)
        self.assertTrue(extents['temp']>=20)

        self.streams.append(stream_id)
        self.stop_ingestion(stream_id)
        

    def test_repersist_data(self):
        stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
        self.start_ingestion(stream_id, dataset_id)
        self.publish_hifi(stream_id,route,0)
        self.publish_hifi(stream_id,route,1)
        self.wait_until_we_have_enough_granules(dataset_id,20)
        config_id = self.get_ingestion_config()
        self.ingestion_management.unpersist_data_stream(stream_id=stream_id,ingestion_configuration_id=config_id)
        self.ingestion_management.persist_data_stream(stream_id=stream_id,ingestion_configuration_id=config_id,dataset_id=dataset_id)
        self.publish_hifi(stream_id,route,2)
        self.publish_hifi(stream_id,route,3)
        self.wait_until_we_have_enough_granules(dataset_id,40)
        success = False
        with gevent.timeout.Timeout(5):
            while not success:

                replay_granule = self.data_retriever.retrieve(dataset_id)

                rdt = RecordDictionaryTool.load_from_granule(replay_granule)

                comp = rdt['time'] == np.arange(0,40)
                if not isinstance(comp,bool):
                    success = comp.all()
                gevent.sleep(1)

        self.assertTrue(success)
        self.streams.append(stream_id)
        self.stop_ingestion(stream_id)


    @attr('LOCOINT')
    @unittest.skipIf(os.getenv('CEI_LAUNCH_TEST', False), 'Host requires file-system access to coverage files, CEI mode does not support.')
    def test_correct_time(self):

        # There are 2208988800 seconds between Jan 1 1900 and Jan 1 1970, i.e.
        #  the offset between Unix and NTP time
        unix_now = np.floor(time.time())
        ntp_now  = unix_now + 2208988800 

        unix_ago = unix_now - 20
        ntp_ago  = unix_ago + 2208988800

        stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
        coverage = DatasetManagementService._get_simplex_coverage(dataset_id)
        coverage.insert_timesteps(20)
        coverage.set_parameter_values('time', np.arange(ntp_ago,ntp_now))
        
        temporal_bounds = self.dataset_management.dataset_temporal_bounds(dataset_id)

        self.assertTrue( np.abs(temporal_bounds[0] - unix_ago) < 2)
        self.assertTrue( np.abs(temporal_bounds[1] - unix_now) < 2)


    @attr('LOCOINT')
    @unittest.skipIf(os.getenv('CEI_LAUNCH_TEST', False), 'Host requires file-system access to coverage files, CEI mode does not support.')
    def test_empty_coverage_time(self):

        stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
        coverage = DatasetManagementService._get_coverage(dataset_id)
        temporal_bounds = self.dataset_management.dataset_temporal_bounds(dataset_id)
        self.assertEquals([coverage.get_parameter_context('time').fill_value] *2, temporal_bounds)


    @attr('LOCOINT')
    @unittest.skipIf(os.getenv('CEI_LAUNCH_TEST', False), 'Host requires file-system access to coverage files, CEI mode does not support.')
    def test_out_of_band_retrieve(self):
        # Set up the environment
        stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
        self.start_ingestion(stream_id, dataset_id)
        
        # Fill the dataset
        self.publish_fake_data(stream_id, route)
        self.wait_until_we_have_enough_granules(dataset_id,40)

        # Retrieve the data
        granule = DataRetrieverService.retrieve_oob(dataset_id)
        rdt = RecordDictionaryTool.load_from_granule(granule)
        self.assertTrue((rdt['time'] == np.arange(40)).all())

    @attr('LOCOINT')
    @unittest.skipIf(os.getenv('CEI_LAUNCH_TEST', False), 'Host requires file-system access to coverage files, CEI mode does not support.')
    def test_retrieve_cache(self):
        DataRetrieverService._refresh_interval = 1
        datasets = [self.make_simple_dataset() for i in xrange(10)]
        for stream_id, route, stream_def_id, dataset_id in datasets:
            coverage = DatasetManagementService._get_simplex_coverage(dataset_id)
            coverage.insert_timesteps(10)
            coverage.set_parameter_values('time', np.arange(10))
            coverage.set_parameter_values('temp', np.arange(10))

        # Verify cache hit and refresh
        dataset_ids = [i[3] for i in datasets]
        self.assertTrue(dataset_ids[0] not in DataRetrieverService._retrieve_cache)
        DataRetrieverService._get_coverage(dataset_ids[0]) # Hit the cache
        cov, age = DataRetrieverService._retrieve_cache[dataset_ids[0]]
        # Verify that it was hit and it's now in there
        self.assertTrue(dataset_ids[0] in DataRetrieverService._retrieve_cache)

        gevent.sleep(DataRetrieverService._refresh_interval + 0.2)

        DataRetrieverService._get_coverage(dataset_ids[0]) # Hit the cache
        cov, age2 = DataRetrieverService._retrieve_cache[dataset_ids[0]]
        self.assertTrue(age2 != age)

        for dataset_id in dataset_ids:
            DataRetrieverService._get_coverage(dataset_id)
        
        self.assertTrue(dataset_ids[0] not in DataRetrieverService._retrieve_cache)

        stream_id, route, stream_def, dataset_id = datasets[0]
        self.start_ingestion(stream_id, dataset_id)
        DataRetrieverService._get_coverage(dataset_id)
        
        self.assertTrue(dataset_id in DataRetrieverService._retrieve_cache)

        DataRetrieverService._refresh_interval = 100
        self.publish_hifi(stream_id,route,1)
        self.wait_until_we_have_enough_granules(dataset_id, data_size=20)
            
 
        event = gevent.event.Event()
        with gevent.Timeout(20):
            while not event.wait(0.1):
                if dataset_id not in DataRetrieverService._retrieve_cache:
                    event.set()


        self.assertTrue(event.is_set())

        
    def publish_and_wait(self, dataset_id, granule):
        stream_ids, _ = self.resource_registry.find_objects(dataset_id, PRED.hasStream,id_only=True)
        stream_id=stream_ids[0]
        route = self.pubsub_management.read_stream_route(stream_id)
        publisher = StandaloneStreamPublisher(stream_id,route)
        dataset_monitor = DatasetMonitor(dataset_id)
        publisher.publish(granule)
        self.assertTrue(dataset_monitor.event.wait(10))

    @attr('LOCOINT')
    @unittest.skipIf(os.getenv('CEI_LAUNCH_TEST', False), 'Host requires file-system access to coverage files, CEI mode does not support.')
    def test_thorough_gap_analysis(self):
        dataset_id = self.test_ingestion_gap_analysis()
        vcov = DatasetManagementService._get_coverage(dataset_id)

        self.assertIsInstance(vcov,ViewCoverage)
        ccov = vcov.reference_coverage

        self.assertIsInstance(ccov, ComplexCoverage)
        self.assertEquals(len(ccov._reference_covs), 3)


    def test_ingestion_gap_analysis(self):
        stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
        self.start_ingestion(stream_id, dataset_id)
        self.addCleanup(self.stop_ingestion, stream_id)

        connection1 = uuid4().hex
        connection2 = uuid4().hex

        rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
        rdt['time'] = [0]
        rdt['temp'] = [0]
        self.publish_and_wait(dataset_id, rdt.to_granule(connection_id=connection1,connection_index='0'))
        rdt['time'] = [1]
        rdt['temp'] = [1]
        self.publish_and_wait(dataset_id, rdt.to_granule(connection_id=connection1,connection_index=1))
        rdt['time'] = [2]
        rdt['temp'] = [2]
        self.publish_and_wait(dataset_id, rdt.to_granule(connection_id=connection1,connection_index='3')) # Gap, missed message
        rdt['time'] = [3]
        rdt['temp'] = [3]
        self.publish_and_wait(dataset_id, rdt.to_granule(connection_id=connection2,connection_index='3')) # Gap, new connection
        rdt['time'] = [4]
        rdt['temp'] = [4]
        self.publish_and_wait(dataset_id, rdt.to_granule(connection_id=connection2,connection_index='4'))
        rdt['time'] = [5]
        rdt['temp'] = [5]
        self.publish_and_wait(dataset_id, rdt.to_granule(connection_id=connection2,connection_index=5))

        granule = self.data_retriever.retrieve(dataset_id)
        rdt = RecordDictionaryTool.load_from_granule(granule)
        np.testing.assert_array_equal(rdt['time'], np.arange(6))
        np.testing.assert_array_equal(rdt['temp'], np.arange(6))
        return dataset_id


    @unittest.skip('Outdated due to ingestion retry')
    @attr('LOCOINT')
    @unittest.skipIf(os.getenv('CEI_LAUNCH_TEST', False), 'Host requires file-system access to coverage files, CEI mode does not support.')
    def test_ingestion_failover(self):
        stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
        self.start_ingestion(stream_id, dataset_id)
        
        event = Event()

        def cb(*args, **kwargs):
            event.set()

        sub = EventSubscriber(event_type="ExceptionEvent", callback=cb, origin="stream_exception")
        sub.start()

        self.publish_fake_data(stream_id, route)
        self.wait_until_we_have_enough_granules(dataset_id, 40)
        
        file_path = DatasetManagementService._get_coverage_path(dataset_id)
        master_file = os.path.join(file_path, '%s_master.hdf5' % dataset_id)

        with open(master_file, 'w') as f:
            f.write('this will crash HDF')

        self.publish_hifi(stream_id, route, 5)


        self.assertTrue(event.wait(10))

        sub.stop()

    @attr('LOCOINT')
    @unittest.skipIf(os.getenv('CEI_LAUNCH_TEST', False), 'Host requires file-system access to coverage files, CEI mode does not support.')
    def test_coverage_types(self):
        # Make a simple dataset and start ingestion, pretty standard stuff.
        ctd_stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
        cov = DatasetManagementService._get_coverage(dataset_id=dataset_id)
        self.assertIsInstance(cov, ViewCoverage)

        cov = DatasetManagementService._get_simplex_coverage(dataset_id=dataset_id)
        self.assertIsInstance(cov, SimplexCoverage)
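
A note on the poll() helper used in test_last_granule above: it is imported from a shared test utility module that is not part of this listing. A minimal sketch of such a helper, assuming the same gevent-based environment as these examples (names and defaults here are illustrative only):

import gevent

def poll(callback, timeout=10, interval=0.5):
    '''
    Sketch only: call callback() until it returns a truthy value or the
    timeout expires. Returns True on success, False on timeout.
    '''
    with gevent.Timeout(timeout, False):
        while not callback():
            gevent.sleep(interval)
        return True
    return False
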
Example #48
0
class TestDMEnd2End(IonIntegrationTestCase):
    def setUp(self):  # Love the non pep-8 convention
        self._start_container()

        self.container.start_rel_from_url('res/deploy/r2deploy.yml')

        self.process_dispatcher = ProcessDispatcherServiceClient()
        self.pubsub_management = PubsubManagementServiceClient()
        self.resource_registry = ResourceRegistryServiceClient()
        self.dataset_management = DatasetManagementServiceClient()
        self.ingestion_management = IngestionManagementServiceClient()
        self.data_retriever = DataRetrieverServiceClient()
        self.pids = []
        self.event = Event()
        self.exchange_space_name = 'test_granules'
        self.exchange_point_name = 'science_data'
        self.i = 0

        self.purge_queues()
        self.queue_buffer = []
        self.streams = []
        self.addCleanup(self.stop_all_ingestion)

    def purge_queues(self):
        xn = self.container.ex_manager.create_xn_queue(
            'science_granule_ingestion')
        xn.purge()

    def tearDown(self):
        self.purge_queues()
        for pid in self.pids:
            self.container.proc_manager.terminate_process(pid)
        IngestionManagementIntTest.clean_subscriptions()
        for queue in self.queue_buffer:
            if isinstance(queue, ExchangeNameQueue):
                queue.delete()
            elif isinstance(queue, str):
                xn = self.container.ex_manager.create_xn_queue(queue)
                xn.delete()

    #--------------------------------------------------------------------------------
    # Helper/Utility methods
    #--------------------------------------------------------------------------------

    def create_dataset(self, parameter_dict_id=''):
        '''
        Creates a time-series dataset
        '''
        tdom, sdom = time_series_domain()
        sdom = sdom.dump()
        tdom = tdom.dump()
        if not parameter_dict_id:
            parameter_dict_id = self.dataset_management.read_parameter_dictionary_by_name(
                'ctd_parsed_param_dict', id_only=True)

        dataset_id = self.dataset_management.create_dataset(
            'test_dataset_%i' % self.i,
            parameter_dictionary_id=parameter_dict_id,
            spatial_domain=sdom,
            temporal_domain=tdom)
        return dataset_id

    def get_datastore(self, dataset_id):
        '''
        Gets an instance of the datastore
            This method is primarily used to defeat a bug where integration tests in multiple containers may sometimes 
            delete a CouchDB datastore and the other containers are unaware of the new state of the datastore.
        '''
        dataset = self.dataset_management.read_dataset(dataset_id)
        datastore_name = dataset.datastore_name
        datastore = self.container.datastore_manager.get_datastore(
            datastore_name, DataStore.DS_PROFILE.SCIDATA)
        return datastore

    def get_ingestion_config(self):
        '''
        Grab the ingestion configuration from the resource registry
        '''
        # The ingestion configuration should have been created by the bootstrap service
        # which is configured through r2deploy.yml

        ingest_configs, _ = self.resource_registry.find_resources(
            restype=RT.IngestionConfiguration, id_only=True)
        return ingest_configs[0]

    def launch_producer(self, stream_id=''):
        '''
        Launch the producer
        '''

        pid = self.container.spawn_process(
            'better_data_producer', 'ion.processes.data.example_data_producer',
            'BetterDataProducer', {'process': {
                'stream_id': stream_id
            }})

        self.pids.append(pid)

    def make_simple_dataset(self):
        '''
        Makes a stream, a stream definition and a dataset, the essentials for most of these tests
        '''
        pdict_id = self.dataset_management.read_parameter_dictionary_by_name(
            'ctd_parsed_param_dict', id_only=True)
        stream_def_id = self.pubsub_management.create_stream_definition(
            'ctd data', parameter_dictionary_id=pdict_id)
        stream_id, route = self.pubsub_management.create_stream(
            'ctd stream %i' % self.i,
            'xp1',
            stream_definition_id=stream_def_id)

        dataset_id = self.create_dataset(pdict_id)

        self.get_datastore(dataset_id)
        self.i += 1
        return stream_id, route, stream_def_id, dataset_id

    def publish_hifi(self, stream_id, stream_route, offset=0):
        '''
        Publish deterministic data
        '''

        pub = StandaloneStreamPublisher(stream_id, stream_route)

        stream_def = self.pubsub_management.read_stream_definition(
            stream_id=stream_id)
        stream_def_id = stream_def._id
        rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
        rdt['time'] = np.arange(10) + (offset * 10)
        rdt['temp'] = np.arange(10) + (offset * 10)
        pub.publish(rdt.to_granule())

    def publish_fake_data(self, stream_id, route):
        '''
        Make four granules
        '''
        for i in xrange(4):
            self.publish_hifi(stream_id, route, i)

    def start_ingestion(self, stream_id, dataset_id):
        '''
        Starts ingestion/persistence for a given dataset
        '''
        ingest_config_id = self.get_ingestion_config()
        self.ingestion_management.persist_data_stream(
            stream_id=stream_id,
            ingestion_configuration_id=ingest_config_id,
            dataset_id=dataset_id)

    def stop_ingestion(self, stream_id):
        ingest_config_id = self.get_ingestion_config()
        self.ingestion_management.unpersist_data_stream(
            stream_id=stream_id, ingestion_configuration_id=ingest_config_id)

    def stop_all_ingestion(self):
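        # Best-effort cleanup for tearDown: errors are ignored so a failed
        # unpersist does not mask the real test failure.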
        try:
            [self.stop_ingestion(sid) for sid in self.streams]
        except:
            pass

    def validate_granule_subscription(self, msg, route, stream_id):
        '''
        Validation for granule format
        '''
        if msg == {}:
            return
        rdt = RecordDictionaryTool.load_from_granule(msg)
        log.info('%s', rdt.pretty_print())
        self.assertIsInstance(
            msg, Granule, 'Message is improperly formatted. (%s)' % type(msg))
        self.event.set()

    def wait_until_we_have_enough_granules(self, dataset_id='', data_size=40):
        '''
        Loops until there is a sufficient amount of data in the dataset
        '''
        done = False
        with gevent.Timeout(40):
            while not done:
                extents = self.dataset_management.dataset_extents(
                    dataset_id, 'time')[0]
                granule = self.data_retriever.retrieve_last_data_points(
                    dataset_id, 1)
                rdt = RecordDictionaryTool.load_from_granule(granule)
                if rdt['time'] and rdt['time'][0] != rdt._pdict.get_context(
                        'time').fill_value and extents >= data_size:
                    done = True
                else:
                    gevent.sleep(0.2)

    #--------------------------------------------------------------------------------
    # Test Methods
    #--------------------------------------------------------------------------------

    @attr('SMOKE')
    def test_dm_end_2_end(self):
        #--------------------------------------------------------------------------------
        # Set up a stream and have a mock instrument (producer) send data
        #--------------------------------------------------------------------------------
        self.event.clear()

        # Get a precompiled parameter dictionary with basic ctd fields
        pdict_id = self.dataset_management.read_parameter_dictionary_by_name(
            'ctd_parsed_param_dict', id_only=True)
        context_ids = self.dataset_management.read_parameter_contexts(
            pdict_id, id_only=True)

        # Add a field that supports binary data input.
        bin_context = ParameterContext('binary', param_type=ArrayType())
        context_ids.append(
            self.dataset_management.create_parameter_context(
                'binary', bin_context.dump()))
        # Add another field that supports dictionary elements.
        rec_context = ParameterContext('records', param_type=RecordType())
        context_ids.append(
            self.dataset_management.create_parameter_context(
                'records', rec_context.dump()))

        pdict_id = self.dataset_management.create_parameter_dictionary(
            'replay_pdict',
            parameter_context_ids=context_ids,
            temporal_context='time')

        stream_definition = self.pubsub_management.create_stream_definition(
            'ctd data', parameter_dictionary_id=pdict_id)

        stream_id, route = self.pubsub_management.create_stream(
            'producer',
            exchange_point=self.exchange_point_name,
            stream_definition_id=stream_definition)

        #--------------------------------------------------------------------------------
        # Start persisting the data on the stream
        # - Get the ingestion configuration from the resource registry
        # - Create the dataset
        # - call persist_data_stream to setup the subscription for the ingestion workers
        #   on the stream that you specify which causes the data to be persisted
        #--------------------------------------------------------------------------------

        ingest_config_id = self.get_ingestion_config()
        dataset_id = self.create_dataset(pdict_id)
        self.ingestion_management.persist_data_stream(
            stream_id=stream_id,
            ingestion_configuration_id=ingest_config_id,
            dataset_id=dataset_id)

        #--------------------------------------------------------------------------------
        # Now the granules are ingesting and persisted
        #--------------------------------------------------------------------------------

        self.launch_producer(stream_id)
        self.wait_until_we_have_enough_granules(dataset_id, 40)

        #--------------------------------------------------------------------------------
        # Now get the data in one chunk using an RPC call to retrieve
        #--------------------------------------------------------------------------------

        replay_data = self.data_retriever.retrieve(dataset_id)
        self.assertIsInstance(replay_data, Granule)
        rdt = RecordDictionaryTool.load_from_granule(replay_data)
        self.assertTrue((rdt['time'][:10] == np.arange(10)).all(),
                        '%s' % rdt['time'][:])
        self.assertTrue((rdt['binary'][:10] == np.array(['hi'] * 10,
                                                        dtype='object')).all())

        #--------------------------------------------------------------------------------
        # Now to try the streamed approach
        #--------------------------------------------------------------------------------
        replay_stream_id, replay_route = self.pubsub_management.create_stream(
            'replay_out',
            exchange_point=self.exchange_point_name,
            stream_definition_id=stream_definition)
        self.replay_id, process_id = self.data_retriever.define_replay(
            dataset_id=dataset_id, stream_id=replay_stream_id)
        log.info('Process ID: %s', process_id)

        replay_client = ReplayClient(process_id)

        #--------------------------------------------------------------------------------
        # Create the listening endpoint for the retriever to talk to
        #--------------------------------------------------------------------------------
        xp = self.container.ex_manager.create_xp(self.exchange_point_name)
        subscriber = StandaloneStreamSubscriber(
            self.exchange_space_name, self.validate_granule_subscription)
        self.queue_buffer.append(self.exchange_space_name)
        subscriber.start()
        subscriber.xn.bind(replay_route.routing_key, xp)

        self.data_retriever.start_replay_agent(self.replay_id)

        self.assertTrue(replay_client.await_agent_ready(5),
                        'The process never launched')
        replay_client.start_replay()

        self.assertTrue(self.event.wait(10))
        subscriber.stop()

        self.data_retriever.cancel_replay_agent(self.replay_id)

        #--------------------------------------------------------------------------------
        # Test the slicing capabilities
        #--------------------------------------------------------------------------------

        granule = self.data_retriever.retrieve(dataset_id=dataset_id,
                                               query={'tdoa': slice(0, 5)})
        rdt = RecordDictionaryTool.load_from_granule(granule)
        b = rdt['time'] == np.arange(5)
        self.assertTrue(b.all() if not isinstance(b, bool) else b)
        self.streams.append(stream_id)
        self.stop_ingestion(stream_id)

    @unittest.skip("Doesn't work")
    @attr('LOCOINT')
    @unittest.skipIf(os.getenv('CEI_LAUNCH_TEST', False),
                     'Skip test while in CEI LAUNCH mode')
    def test_replay_pause(self):
        # Get a precompiled parameter dictionary with basic ctd fields
        pdict_id = self.dataset_management.read_parameter_dictionary_by_name(
            'ctd_parsed_param_dict', id_only=True)
        context_ids = self.dataset_management.read_parameter_contexts(
            pdict_id, id_only=True)

        # Add a field that supports binary data input.
        bin_context = ParameterContext('binary', param_type=ArrayType())
        context_ids.append(
            self.dataset_management.create_parameter_context(
                'binary', bin_context.dump()))
        # Add another field that supports dictionary elements.
        rec_context = ParameterContext('records', param_type=RecordType())
        context_ids.append(
            self.dataset_management.create_parameter_context(
                'records', rec_context.dump()))

        pdict_id = self.dataset_management.create_parameter_dictionary(
            'replay_pdict',
            parameter_context_ids=context_ids,
            temporal_context='time')

        stream_def_id = self.pubsub_management.create_stream_definition(
            'replay_stream', parameter_dictionary_id=pdict_id)
        replay_stream, replay_route = self.pubsub_management.create_stream(
            'replay', 'xp1', stream_definition_id=stream_def_id)
        dataset_id = self.create_dataset(pdict_id)
        scov = DatasetManagementService._get_coverage(dataset_id)

        bb = CoverageCraft(scov)
        bb.rdt['time'] = np.arange(100)
        bb.rdt['temp'] = np.random.random(100) + 30
        bb.sync_with_granule()

        DatasetManagementService._persist_coverage(
            dataset_id,
            bb.coverage)  # This invalidates it for multi-host configurations
        # Set up the subscriber to verify the data
        subscriber = StandaloneStreamSubscriber(
            self.exchange_space_name, self.validate_granule_subscription)
        xp = self.container.ex_manager.create_xp('xp1')
        self.queue_buffer.append(self.exchange_space_name)
        subscriber.start()
        subscriber.xn.bind(replay_route.routing_key, xp)

        # Set up the replay agent and the client wrapper

        # 1) Define the Replay (dataset and stream to publish on)
        self.replay_id, process_id = self.data_retriever.define_replay(
            dataset_id=dataset_id, stream_id=replay_stream)
        # 2) Make a client to interact with the process (optionally provide it a process to bind with)
        replay_client = ReplayClient(process_id)
        # 3) Start the agent (launch the process)
        self.data_retriever.start_replay_agent(self.replay_id)
        # 4) Start replaying...
        replay_client.start_replay()

        # Wait till we get some granules
        self.assertTrue(self.event.wait(5))

        # We got granules, pause the replay, clear the queue and allow the process to finish consuming
        replay_client.pause_replay()
        gevent.sleep(1)
        subscriber.xn.purge()
        self.event.clear()

        # Make sure there's no remaining messages being consumed
        self.assertFalse(self.event.wait(1))

        # Resume the replay and wait until we start getting granules again
        replay_client.resume_replay()
        self.assertTrue(self.event.wait(5))

        # Stop the replay, clear the queues
        replay_client.stop_replay()
        gevent.sleep(1)
        subscriber.xn.purge()
        self.event.clear()

        # Make sure that it did indeed stop
        self.assertFalse(self.event.wait(1))

        subscriber.stop()

    def test_retrieve_and_transform(self):
        # Make a simple dataset and start ingestion, pretty standard stuff.
        ctd_stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset(
        )
        self.start_ingestion(ctd_stream_id, dataset_id)

        # Stream definition for the salinity data
        salinity_pdict_id = self.dataset_management.read_parameter_dictionary_by_name(
            'ctd_parsed_param_dict', id_only=True)
        sal_stream_def_id = self.pubsub_management.create_stream_definition(
            'sal data', parameter_dictionary_id=salinity_pdict_id)

        rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
        rdt['time'] = np.arange(10)
        rdt['temp'] = np.random.randn(10) * 10 + 30
        rdt['conductivity'] = np.random.randn(10) * 2 + 10
        rdt['pressure'] = np.random.randn(10) * 1 + 12

        publisher = StandaloneStreamPublisher(ctd_stream_id, route)
        publisher.publish(rdt.to_granule())

        rdt['time'] = np.arange(10, 20)

        publisher.publish(rdt.to_granule())

        self.wait_until_we_have_enough_granules(dataset_id, 20)

        granule = self.data_retriever.retrieve(
            dataset_id,
            None,
            None,
            'ion.processes.data.transforms.ctd.ctd_L2_salinity',
            'CTDL2SalinityTransformAlgorithm',
            kwargs=dict(params=sal_stream_def_id))
        rdt = RecordDictionaryTool.load_from_granule(granule)
        for i in rdt['salinity']:
            self.assertNotEquals(i, 0)
        self.streams.append(ctd_stream_id)
        self.stop_ingestion(ctd_stream_id)

    def test_last_granule(self):
        stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset(
        )
        self.start_ingestion(stream_id, dataset_id)

        self.publish_hifi(stream_id, route, 0)
        self.publish_hifi(stream_id, route, 1)

        self.wait_until_we_have_enough_granules(dataset_id,
                                                20)  # I just need two

        success = False

        def verifier():
            replay_granule = self.data_retriever.retrieve_last_data_points(
                dataset_id, 10)

            rdt = RecordDictionaryTool.load_from_granule(replay_granule)

            comp = rdt['time'] == np.arange(10) + 10
            if not isinstance(comp, bool):
                return comp.all()
            return False

        success = poll(verifier)

        self.assertTrue(success)

        success = False

        def verify_points():
            replay_granule = self.data_retriever.retrieve_last_data_points(
                dataset_id, 5)

            rdt = RecordDictionaryTool.load_from_granule(replay_granule)

            comp = rdt['time'] == np.arange(15, 20)
            if not isinstance(comp, bool):
                return comp.all()
            return False

        success = poll(verify_points)

        self.assertTrue(success)
        self.streams.append(stream_id)
        self.stop_ingestion(stream_id)

    def test_replay_with_parameters(self):
        #--------------------------------------------------------------------------------
        # Create the configurations and the dataset
        #--------------------------------------------------------------------------------
        # Get a precompiled parameter dictionary with basic ctd fields
        pdict_id = self.dataset_management.read_parameter_dictionary_by_name(
            'ctd_parsed_param_dict', id_only=True)
        context_ids = self.dataset_management.read_parameter_contexts(
            pdict_id, id_only=True)

        # Add a field that supports binary data input.
        bin_context = ParameterContext('binary', param_type=ArrayType())
        context_ids.append(
            self.dataset_management.create_parameter_context(
                'binary', bin_context.dump()))
        # Add another field that supports dictionary elements.
        rec_context = ParameterContext('records', param_type=RecordType())
        context_ids.append(
            self.dataset_management.create_parameter_context(
                'records', rec_context.dump()))

        pdict_id = self.dataset_management.create_parameter_dictionary(
            'replay_pdict',
            parameter_context_ids=context_ids,
            temporal_context='time')

        stream_def_id = self.pubsub_management.create_stream_definition(
            'replay_stream', parameter_dictionary_id=pdict_id)

        stream_id, route = self.pubsub_management.create_stream(
            'replay_with_params',
            exchange_point=self.exchange_point_name,
            stream_definition_id=stream_def_id)
        config_id = self.get_ingestion_config()
        dataset_id = self.create_dataset(pdict_id)
        self.ingestion_management.persist_data_stream(
            stream_id=stream_id,
            ingestion_configuration_id=config_id,
            dataset_id=dataset_id)

        dataset_modified = Event()

        def cb(*args, **kwargs):
            dataset_modified.set()

        es = EventSubscriber(event_type=OT.DatasetModified,
                             callback=cb,
                             origin=dataset_id)
        es.start()

        self.addCleanup(es.stop)

        self.publish_fake_data(stream_id, route)

        self.assertTrue(dataset_modified.wait(30))

        query = {
            'start_time': 0 - 2208988800,
            'end_time': 20 - 2208988800,
            'stride_time': 2,
            'parameters': ['time', 'temp']
        }
        retrieved_data = self.data_retriever.retrieve(dataset_id=dataset_id,
                                                      query=query)

        rdt = RecordDictionaryTool.load_from_granule(retrieved_data)
        comp = np.arange(0, 20, 2) == rdt['time']
        self.assertTrue(comp.all(), '%s' % rdt.pretty_print())
        self.assertEquals(set(rdt.iterkeys()), set(['time', 'temp']))

        extents = self.dataset_management.dataset_extents(
            dataset_id=dataset_id, parameters=['time', 'temp'])
        self.assertTrue(extents['time'] >= 20)
        self.assertTrue(extents['temp'] >= 20)

        self.streams.append(stream_id)
        self.stop_ingestion(stream_id)

    def test_repersist_data(self):
        stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset(
        )
        self.start_ingestion(stream_id, dataset_id)
        self.publish_hifi(stream_id, route, 0)
        self.publish_hifi(stream_id, route, 1)
        self.wait_until_we_have_enough_granules(dataset_id, 20)
        config_id = self.get_ingestion_config()
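        # Stop persistence and then re-activate it for the same dataset; the
        # granules published afterwards should land in the same coverage,
        # giving time values 0-39 in total.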
        self.ingestion_management.unpersist_data_stream(
            stream_id=stream_id, ingestion_configuration_id=config_id)
        self.ingestion_management.persist_data_stream(
            stream_id=stream_id,
            ingestion_configuration_id=config_id,
            dataset_id=dataset_id)
        self.publish_hifi(stream_id, route, 2)
        self.publish_hifi(stream_id, route, 3)
        self.wait_until_we_have_enough_granules(dataset_id, 40)
        success = False
        with gevent.timeout.Timeout(5):
            while not success:

                replay_granule = self.data_retriever.retrieve(dataset_id)

                rdt = RecordDictionaryTool.load_from_granule(replay_granule)

                comp = rdt['time'] == np.arange(0, 40)
                if not isinstance(comp, bool):
                    success = comp.all()
                gevent.sleep(1)

        self.assertTrue(success)
        self.streams.append(stream_id)
        self.stop_ingestion(stream_id)

    @attr('LOCOINT')
    @unittest.skipIf(os.getenv(
        'CEI_LAUNCH_TEST', False
    ), 'Host requires file-system access to coverage files, CEI mode does not support.'
                     )
    def test_correct_time(self):

        # There are 2208988800 seconds between Jan 1 1900 and Jan 1 1970, i.e.
        #  the offset between Unix and NTP time
        unix_now = np.floor(time.time())
        ntp_now = unix_now + 2208988800

        unix_ago = unix_now - 20
        ntp_ago = unix_ago + 2208988800

        stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset(
        )
        coverage = DatasetManagementService._get_coverage(dataset_id)
        coverage.insert_timesteps(20)
        coverage.set_parameter_values('time', np.arange(ntp_ago, ntp_now))
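        # The coverage's time axis now holds 20 NTP timestamps spanning the last
        # 20 seconds; dataset_temporal_bounds is expected to report those bounds
        # back in Unix time, which the assertions below check to within 2 seconds.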

        temporal_bounds = self.dataset_management.dataset_temporal_bounds(
            dataset_id)

        self.assertTrue(np.abs(temporal_bounds[0] - unix_ago) < 2)
        self.assertTrue(np.abs(temporal_bounds[1] - unix_now) < 2)

    @attr('LOCOINT')
    @unittest.skipIf(os.getenv(
        'CEI_LAUNCH_TEST', False
    ), 'Host requires file-system access to coverage files, CEI mode does not support.'
                     )
    def test_empty_coverage_time(self):

        stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset(
        )
        coverage = DatasetManagementService._get_coverage(dataset_id)
        temporal_bounds = self.dataset_management.dataset_temporal_bounds(
            dataset_id)
        self.assertEquals([coverage.get_parameter_context('time').fill_value] *
                          2, temporal_bounds)

    @attr('LOCOINT')
    @unittest.skipIf(os.getenv(
        'CEI_LAUNCH_TEST', False
    ), 'Host requires file-system access to coverage files, CEI mode does not support.'
                     )
    def test_out_of_band_retrieve(self):
        # Set up the environment
        stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset(
        )
        self.start_ingestion(stream_id, dataset_id)

        # Fill the dataset
        self.publish_fake_data(stream_id, route)
        self.wait_until_we_have_enough_granules(dataset_id, 40)

        # Retrieve the data
        granule = DataRetrieverService.retrieve_oob(dataset_id)
        rdt = RecordDictionaryTool.load_from_granule(granule)
        self.assertTrue((rdt['time'] == np.arange(40)).all())

    @attr('LOCOINT')
    @unittest.skipIf(os.getenv(
        'CEI_LAUNCH_TEST', False
    ), 'Host requires file-system access to coverage files, CEI mode does not support.'
                     )
    def test_retrieve_cache(self):
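        # Shrink the cache refresh interval so this test can observe a refresh
        # (a changed age) after a short sleep.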
        DataRetrieverService._refresh_interval = 1
        datasets = [self.make_simple_dataset() for i in xrange(10)]
        for stream_id, route, stream_def_id, dataset_id in datasets:
            coverage = DatasetManagementService._get_coverage(dataset_id)
            coverage.insert_timesteps(10)
            coverage.set_parameter_values('time', np.arange(10))
            coverage.set_parameter_values('temp', np.arange(10))

        # Verify cache hit and refresh
        dataset_ids = [i[3] for i in datasets]
        self.assertTrue(
            dataset_ids[0] not in DataRetrieverService._retrieve_cache)
        DataRetrieverService._get_coverage(dataset_ids[0])  # Hit the cache
        cov, age = DataRetrieverService._retrieve_cache[dataset_ids[0]]
        # Verify that it was hit and it's now in there
        self.assertTrue(dataset_ids[0] in DataRetrieverService._retrieve_cache)

        gevent.sleep(DataRetrieverService._refresh_interval + 0.2)

        DataRetrieverService._get_coverage(dataset_ids[0])  # Hit the cache
        cov, age2 = DataRetrieverService._retrieve_cache[dataset_ids[0]]
        self.assertTrue(age2 != age)

        for dataset_id in dataset_ids:
            DataRetrieverService._get_coverage(dataset_id)

        self.assertTrue(
            dataset_ids[0] not in DataRetrieverService._retrieve_cache)

        stream_id, route, stream_def, dataset_id = datasets[0]
        self.start_ingestion(stream_id, dataset_id)
        DataRetrieverService._get_coverage(dataset_id)

        self.assertTrue(dataset_id in DataRetrieverService._retrieve_cache)

        DataRetrieverService._refresh_interval = 100
        self.publish_hifi(stream_id, route, 1)
        self.wait_until_we_have_enough_granules(dataset_id, data_size=20)

        event = gevent.event.Event()
        with gevent.Timeout(20):
            while not event.wait(0.1):
                if dataset_id not in DataRetrieverService._retrieve_cache:
                    event.set()

        self.assertTrue(event.is_set())

    @unittest.skip('Outdated due to ingestion retry')
    @attr('LOCOINT')
    @unittest.skipIf(os.getenv(
        'CEI_LAUNCH_TEST', False
    ), 'Host requires file-system access to coverage files, CEI mode does not support.'
                     )
    def test_ingestion_failover(self):
        stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset(
        )
        self.start_ingestion(stream_id, dataset_id)

        event = Event()

        def cb(*args, **kwargs):
            event.set()

        sub = EventSubscriber(event_type="ExceptionEvent",
                              callback=cb,
                              origin="stream_exception")
        sub.start()

        self.publish_fake_data(stream_id, route)
        self.wait_until_we_have_enough_granules(dataset_id, 40)

        file_path = DatasetManagementService._get_coverage_path(dataset_id)
        master_file = os.path.join(file_path, '%s_master.hdf5' % dataset_id)

        with open(master_file, 'w') as f:
            f.write('this will crash HDF')

        self.publish_hifi(stream_id, route, 5)

        self.assertTrue(event.wait(10))

        sub.stop()
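
A note on DatasetMonitor, which the earlier variant of test_replay_with_parameters uses instead of wiring up the EventSubscriber by hand: it lives in a test utility module that is not shown in this listing. A rough sketch of what it presumably wraps, reusing the EventSubscriber and OT names the examples above already rely on (imports omitted here, as in the surrounding code):

from gevent.event import Event

class DatasetMonitor(object):
    '''
    Sketch only: bundles the OT.DatasetModified subscription pattern from
    test_replay_with_parameters into a small reusable object.
    '''
    def __init__(self, dataset_id):
        self.event = Event()
        self.es = EventSubscriber(event_type=OT.DatasetModified,
                                  callback=self._callback,
                                  origin=dataset_id)
        self.es.start()

    def _callback(self, *args, **kwargs):
        self.event.set()

    def stop(self):
        self.es.stop()
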
class TestDataProductManagementServiceIntegration(IonIntegrationTestCase):

    def setUp(self):
        # Start container
        #print 'instantiating container'
        self._start_container()

        self.container.start_rel_from_url('res/deploy/r2deploy.yml')

        self.dpsc_cli = DataProductManagementServiceClient(node=self.container.node)
        self.rrclient = ResourceRegistryServiceClient(node=self.container.node)
        self.damsclient = DataAcquisitionManagementServiceClient(node=self.container.node)
        self.pubsubcli =  PubsubManagementServiceClient(node=self.container.node)
        self.ingestclient = IngestionManagementServiceClient(node=self.container.node)
        self.process_dispatcher   = ProcessDispatcherServiceClient()
        self.dataset_management = DatasetManagementServiceClient()
        self.unsc = UserNotificationServiceClient()
        self.data_retriever = DataRetrieverServiceClient()

        #------------------------------------------
        # Create the environment
        #------------------------------------------

        datastore_name = CACHE_DATASTORE_NAME
        self.db = self.container.datastore_manager.get_datastore(datastore_name)
        self.stream_def_id = self.pubsubcli.create_stream_definition(name='SBE37_CDM')

        self.process_definitions  = {}
        ingestion_worker_definition = ProcessDefinition(name='ingestion worker')
        ingestion_worker_definition.executable = {
            'module':'ion.processes.data.ingestion.science_granule_ingestion_worker',
            'class' :'ScienceGranuleIngestionWorker'
        }
        process_definition_id = self.process_dispatcher.create_process_definition(process_definition=ingestion_worker_definition)
        self.process_definitions['ingestion_worker'] = process_definition_id

        self.pids = []
        self.exchange_points = []
        self.exchange_names = []

        #------------------------------------------------------------------------------------------------
        # First launch the ingestors
        #------------------------------------------------------------------------------------------------
        self.exchange_space       = 'science_granule_ingestion'
        self.exchange_point       = 'science_data'
        config = DotDict()
        config.process.datastore_name = 'datasets'
        config.process.queue_name = self.exchange_space

        self.exchange_names.append(self.exchange_space)
        self.exchange_points.append(self.exchange_point)

        pid = self.process_dispatcher.schedule_process(self.process_definitions['ingestion_worker'],configuration=config)
        log.debug("the ingestion worker process id: %s", pid)
        self.pids.append(pid)

        self.addCleanup(self.cleaning_up)

    def cleaning_up(self):
        for pid in self.pids:
            log.debug("number of pids to be terminated: %s", len(self.pids))
            try:
                self.process_dispatcher.cancel_process(pid)
                log.debug("Terminated the process: %s", pid)
            except:
                log.debug("could not terminate the process id: %s" % pid)
        IngestionManagementIntTest.clean_subscriptions()

        for xn in self.exchange_names:
            xni = self.container.ex_manager.create_xn_queue(xn)
            xni.delete()
        for xp in self.exchange_points:
            xpi = self.container.ex_manager.create_xp(xp)
            xpi.delete()

    def get_datastore(self, dataset_id):
        dataset = self.dataset_management.read_dataset(dataset_id)
        datastore_name = dataset.datastore_name
        datastore = self.container.datastore_manager.get_datastore(datastore_name, DataStore.DS_PROFILE.SCIDATA)
        return datastore


    def test_create_data_product(self):

        #------------------------------------------------------------------------------------------------
        # create a stream definition for the data from the ctd simulator
        #------------------------------------------------------------------------------------------------
        parameter_dictionary_id = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict')
        ctd_stream_def_id = self.pubsubcli.create_stream_definition(name='Simulated CTD data', parameter_dictionary_id=parameter_dictionary_id)
        log.debug("Created stream def id %s" % ctd_stream_def_id)

        #------------------------------------------------------------------------------------------------
        # test creating a new data product w/o a stream definition
        #------------------------------------------------------------------------------------------------

        # Generic time-series data domain creation
        tdom, sdom = time_series_domain()



        dp_obj = IonObject(RT.DataProduct,
            name='DP1',
            description='some new dp',
            temporal_domain = tdom.dump(), 
            spatial_domain = sdom.dump())

        dp_obj.geospatial_bounds.geospatial_latitude_limit_north = 200.0
        dp_obj.geospatial_bounds.geospatial_latitude_limit_south = 100.0
        dp_obj.geospatial_bounds.geospatial_longitude_limit_east = 50.0
        dp_obj.geospatial_bounds.geospatial_longitude_limit_west = 100.0

        #------------------------------------------------------------------------------------------------
        # Create a set of ParameterContext objects to define the parameters in the coverage, add each to the ParameterDictionary
        #------------------------------------------------------------------------------------------------

        dp_id = self.dpsc_cli.create_data_product( data_product= dp_obj,
                                            stream_definition_id=ctd_stream_def_id)
        self.dpsc_cli.activate_data_product_persistence(dp_id)

        dp_obj = self.dpsc_cli.read_data_product(dp_id)
        self.assertIsNotNone(dp_obj)
        self.assertEquals(dp_obj.geospatial_point_center.lat, 150.0)
        log.debug('Created data product %s', dp_obj)
        #------------------------------------------------------------------------------------------------
        # test creating a new data product with  a stream definition
        #------------------------------------------------------------------------------------------------
        log.debug('Creating new data product with a stream definition')
        dp_obj = IonObject(RT.DataProduct,
            name='DP2',
            description='some new dp',
            temporal_domain = tdom.dump(),
            spatial_domain = sdom.dump())

        dp_id2 = self.dpsc_cli.create_data_product(dp_obj, ctd_stream_def_id)
        self.dpsc_cli.activate_data_product_persistence(dp_id2)
        log.debug('new dp_id = %s' % dp_id2)

        #------------------------------------------------------------------------------------------------
        #make sure data product is associated with stream def
        #------------------------------------------------------------------------------------------------
        streamdefs = []
        streams, _ = self.rrclient.find_objects(dp_id2, PRED.hasStream, RT.Stream, True)
        for s in streams:
            log.debug("Checking stream %s" % s)
            sdefs, _ = self.rrclient.find_objects(s, PRED.hasStreamDefinition, RT.StreamDefinition, True)
            for sd in sdefs:
                log.debug("Checking streamdef %s" % sd)
                streamdefs.append(sd)
        self.assertIn(ctd_stream_def_id, streamdefs)


        # test reading a non-existent data product
        log.debug('reading non-existent data product')

        with self.assertRaises(NotFound):
            dp_obj = self.dpsc_cli.read_data_product('some_fake_id')

        # update a data product (tests read also)
        log.debug('Updating data product')
        # first get the existing dp object
        dp_obj = self.dpsc_cli.read_data_product(dp_id)

        # now tweak the object
        dp_obj.description = 'the very first dp'
        dp_obj.geospatial_bounds.geospatial_latitude_limit_north = 300.0
        dp_obj.geospatial_bounds.geospatial_latitude_limit_south = 200.0
        dp_obj.geospatial_bounds.geospatial_longitude_limit_east = 150.0
        dp_obj.geospatial_bounds.geospatial_longitude_limit_west = 200.0
        # now write the dp back to the registry
        update_result = self.dpsc_cli.update_data_product(dp_obj)


        # now get the dp back to see if it was updated
        dp_obj = self.dpsc_cli.read_data_product(dp_id)
        self.assertEquals(dp_obj.description,'the very first dp')
        self.assertEquals(dp_obj.geospatial_point_center.lat, 250.0)
        log.debug('Updated data product %s', dp_obj)

        #test extension
        extended_product = self.dpsc_cli.get_data_product_extension(dp_id)
        self.assertEqual(dp_id, extended_product._id)
        self.assertEqual(ComputedValueAvailability.PROVIDED,
                         extended_product.computed.product_download_size_estimated.status)
        self.assertEqual(0, extended_product.computed.product_download_size_estimated.value)

        self.assertEqual(ComputedValueAvailability.PROVIDED,
                         extended_product.computed.parameters.status)
        #log.debug("test_create_data_product: parameters %s" % extended_product.computed.parameters.value)

        # now 'delete' the data product
        log.debug("deleting data product: %s" % dp_id)
        self.dpsc_cli.delete_data_product(dp_id)
        self.dpsc_cli.force_delete_data_product(dp_id)

        # now try to get the deleted dp object
        with self.assertRaises(NotFound):
            dp_obj = self.dpsc_cli.read_data_product(dp_id)

        # Get the events corresponding to the data product
        ret = self.unsc.get_recent_events(resource_id=dp_id)
        events = ret.value

        for event in events:
            log.debug("event time: %s" % event.ts_created)

        self.assertTrue(len(events) > 0)

    def test_data_product_stream_def(self):
        pdict_id = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict', id_only=True)
        ctd_stream_def_id = self.pubsubcli.create_stream_definition(name='Simulated CTD data', parameter_dictionary_id=pdict_id)

        tdom, sdom = time_series_domain()

        sdom = sdom.dump()
        tdom = tdom.dump()



        dp_obj = IonObject(RT.DataProduct,
            name='DP1',
            description='some new dp',
            temporal_domain = tdom,
            spatial_domain = sdom)
        dp_id = self.dpsc_cli.create_data_product(data_product= dp_obj,
            stream_definition_id=ctd_stream_def_id)

        stream_def_id = self.dpsc_cli.get_data_product_stream_definition(dp_id)
        self.assertEquals(ctd_stream_def_id, stream_def_id)



    def test_activate_suspend_data_product(self):

        #------------------------------------------------------------------------------------------------
        # create a stream definition for the data from the ctd simulator
        #------------------------------------------------------------------------------------------------
        pdict_id = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict', id_only=True)
        ctd_stream_def_id = self.pubsubcli.create_stream_definition(name='Simulated CTD data', parameter_dictionary_id=pdict_id)
        log.debug("Created stream def id %s" % ctd_stream_def_id)

        #------------------------------------------------------------------------------------------------
        # test creating a new data product w/o a stream definition
        #------------------------------------------------------------------------------------------------
        # Construct temporal and spatial Coordinate Reference System objects
        tdom, sdom = time_series_domain()

        sdom = sdom.dump()
        tdom = tdom.dump()



        dp_obj = IonObject(RT.DataProduct,
            name='DP1',
            description='some new dp',
            temporal_domain = tdom,
            spatial_domain = sdom)

        log.debug("Created an IonObject for a data product: %s" % dp_obj)

        #------------------------------------------------------------------------------------------------
        # Create a set of ParameterContext objects to define the parameters in the coverage, add each to the ParameterDictionary
        #------------------------------------------------------------------------------------------------

        dp_id = self.dpsc_cli.create_data_product(data_product= dp_obj,
            stream_definition_id=ctd_stream_def_id)

        #------------------------------------------------------------------------------------------------
        # test activate and suspend data product persistence
        #------------------------------------------------------------------------------------------------
        self.dpsc_cli.activate_data_product_persistence(dp_id)
        
        dp_obj = self.dpsc_cli.read_data_product(dp_id)
        self.assertIsNotNone(dp_obj)

        dataset_ids, _ = self.rrclient.find_objects(subject=dp_id, predicate=PRED.hasDataset, id_only=True)
        if not dataset_ids:
            raise NotFound("Data Product %s dataset  does not exist" % str(dp_id))
        self.get_datastore(dataset_ids[0])


        # Check that the streams associated with the data product are being persisted
        stream_ids, _ =  self.rrclient.find_objects(dp_id,PRED.hasStream,RT.Stream,True)
        for stream_id in stream_ids:
            self.assertTrue(self.ingestclient.is_persisted(stream_id))

        #--------------------------------------------------------------------------------
        # Now get the data in one chunk using an RPC call to retrieve
        #--------------------------------------------------------------------------------

        replay_data = self.data_retriever.retrieve(dataset_ids[0])
        self.assertIsInstance(replay_data, Granule)

        log.debug("The data retriever was able to replay the dataset that was attached to the data product "
                  "we wanted to be persisted. Therefore the data product was indeed persisted with "
                  "otherwise we could not have retrieved its dataset using the data retriever. Therefore "
                  "this demonstration shows that L4-CI-SA-RQ-267 is satisfied: 'Data product management shall persist data products'")

        data_product_object = self.rrclient.read(dp_id)
        self.assertEquals(data_product_object.name,'DP1')
        self.assertEquals(data_product_object.description,'some new dp')

        log.debug("Towards L4-CI-SA-RQ-308: 'Data product management shall persist data product metadata'. "
                  " Attributes in create for the data product obj, name= '%s', description='%s', match those of object from the "
                  "resource registry, name='%s', desc='%s'" % (dp_obj.name, dp_obj.description,data_product_object.name,
                                                           data_product_object.description))

        #------------------------------------------------------------------------------------------------
        # test suspend data product persistence
        #------------------------------------------------------------------------------------------------
        self.dpsc_cli.suspend_data_product_persistence(dp_id)

        self.dpsc_cli.force_delete_data_product(dp_id)
        # now try to get the deleted dp object

        with self.assertRaises(NotFound):
            dp_obj = self.rrclient.read(dp_id)
Example #50
    def test_raw_stream_integration(self):
        cc = self.container
        assertions = self.assertTrue

        #-----------------------------
        # Copy below here to run as a script (don't forget the imports of course!)
        #-----------------------------

        # Create some service clients...
        pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
        ingestion_management_service = IngestionManagementServiceClient(
            node=cc.node)
        dataset_management_service = DatasetManagementServiceClient(
            node=cc.node)
        process_dispatcher = ProcessDispatcherServiceClient(node=cc.node)

        # declare some handy variables

        datastore_name = 'test_dm_integration'

        ###
        ### In the beginning there was one stream definition...
        ###
        # create a stream definition for the data from the ctd simulator
        raw_ctd_stream_def = SBE37_RAW_stream_definition()
        raw_ctd_stream_def_id = pubsub_management_service.create_stream_definition(
            container=raw_ctd_stream_def, name='Simulated RAW CTD data')

        ###
        ### And two process definitions...
        ###
        # one for the ctd simulator...
        producer_definition = ProcessDefinition()
        producer_definition.executable = {
            'module': 'ion.processes.data.raw_stream_publisher',
            'class': 'RawStreamPublisher'
        }

        raw_ctd_sim_procdef_id = process_dispatcher.create_process_definition(
            process_definition=producer_definition)

        #---------------------------
        # Set up ingestion - this is an operator concern - not done by SA in a deployed system
        #---------------------------
        # Configure ingestion with one worker, ingesting to the test_dm_integration datastore with the SCIDATA profile
        log.debug('Calling create_ingestion_configuration')
        ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
            exchange_point_id='science_data',
            couch_storage=CouchStorage(datastore_name=datastore_name,
                                       datastore_profile='SCIDATA'),
            number_of_workers=1)
        #
        ingestion_management_service.activate_ingestion_configuration(
            ingestion_configuration_id=ingestion_configuration_id)

        #---------------------------
        # Set up the producer (CTD Simulator)
        #---------------------------

        # Create the stream
        raw_ctd_stream_id = pubsub_management_service.create_stream(
            stream_definition_id=raw_ctd_stream_def_id)

        # Set up the datasets
        raw_ctd_dataset_id = dataset_management_service.create_dataset(
            stream_id=raw_ctd_stream_id,
            datastore_name=datastore_name,
            view_name='datasets/stream_join_granule')

        # Configure ingestion of this dataset
        raw_ctd_dataset_config_id = ingestion_management_service.create_dataset_configuration(
            dataset_id=raw_ctd_dataset_id,
            archive_data=True,
            archive_metadata=True,
            ingestion_configuration_id=ingestion_configuration_id,  # you need to know the ingestion configuration id!
        )
        # Hold onto ctd_dataset_config_id if you want to stop/start ingestion of that dataset by the ingestion service

        # Start the ctd simulator to produce some data
        configuration = {
            'process': {
                'stream_id': raw_ctd_stream_id,
            }
        }
        raw_sim_pid = process_dispatcher.schedule_process(
            process_definition_id=raw_ctd_sim_procdef_id,
            configuration=configuration)

        ###
        ### Make a subscriber in the test to listen for the raw data
        ###
        raw_subscription_id = pubsub_management_service.create_subscription(
            query=StreamQuery([
                raw_ctd_stream_id,
            ]),
            exchange_name='raw_test',
            name="test raw subscription",
        )

        # this is okay - even in cei mode!
        pid = cc.spawn_process(name='dummy_process_for_test',
                               module='pyon.ion.process',
                               cls='SimpleProcess',
                               config={})
        dummy_process = cc.proc_manager.procs[pid]

        subscriber_registrar = StreamSubscriberRegistrar(process=dummy_process,
                                                         node=cc.node)

        result = gevent.event.AsyncResult()
        results = []

        def message_received(message, headers):
            # collect raw messages; signal once more than three granules have arrived
            log.warn('Raw data received!')
            results.append(message)
            if len(results) > 3:
                result.set(True)

        subscriber = subscriber_registrar.create_subscriber(
            exchange_name='raw_test', callback=message_received)
        subscriber.start()

        # after the queue has been created it is safe to activate the subscription
        pubsub_management_service.activate_subscription(
            subscription_id=raw_subscription_id)

        # Assert that we have received data
        assertions(result.get(timeout=10))

        # stop the flow and parse the messages...
        process_dispatcher.cancel_process(raw_sim_pid)  # kill the ctd simulator process - that is enough data

        gevent.sleep(1)

        for message in results:

            sha1 = message.identifiables['stream_encoding'].sha1

            data = message.identifiables['data_stream'].values

            filename = FileSystem.get_hierarchical_url(FS.CACHE, sha1, ".raw")

            with open(filename, 'r') as f:

                assertions(data == f.read())
Example #51
class TestDMEnd2End(IonIntegrationTestCase):
    def setUp(self): # Love the non pep-8 convention
        self._start_container()

        self.container.start_rel_from_url('res/deploy/r2deploy.yml')

        self.process_dispatcher   = ProcessDispatcherServiceClient()
        self.pubsub_management    = PubsubManagementServiceClient()
        self.resource_registry    = ResourceRegistryServiceClient()
        self.dataset_management   = DatasetManagementServiceClient()
        self.ingestion_management = IngestionManagementServiceClient()
        self.data_retriever       = DataRetrieverServiceClient()
        self.event                = Event()
        self.exchange_space_name  = 'test_granules'
        self.exchange_point_name  = 'science_data'       
        self.i                    = 0
        self.cci                  = 0

    #--------------------------------------------------------------------------------
    # Helper/Utility methods
    #--------------------------------------------------------------------------------
        
    def create_dataset(self, parameter_dict_id=''):
        '''
        Creates a time-series dataset
        '''
        if not parameter_dict_id:
            parameter_dict_id = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict', id_only=True)

        dataset = Dataset('test_dataset_%i'%self.i)
        dataset_id = self.dataset_management.create_dataset(dataset, parameter_dictionary_id=parameter_dict_id)
        self.addCleanup(self.dataset_management.delete_dataset, dataset_id)
        return dataset_id
    
    def get_datastore(self, dataset_id):
        '''
        Gets an instance of the datastore
            This method is primarily used to defeat a bug where integration tests in multiple containers may sometimes 
            delete a CouchDB datastore and the other containers are unaware of the new state of the datastore.
        '''
        dataset = self.dataset_management.read_dataset(dataset_id)
        datastore_name = dataset.datastore_name
        datastore = self.container.datastore_manager.get_datastore(datastore_name, DataStore.DS_PROFILE.SCIDATA)
        return datastore
    
    def get_ingestion_config(self):
        '''
        Grab the ingestion configuration from the resource registry
        '''
        # The ingestion configuration should have been created by the bootstrap service 
        # which is configured through r2deploy.yml

        ingest_configs, _  = self.resource_registry.find_resources(restype=RT.IngestionConfiguration,id_only=True)
        return ingest_configs[0]

    def launch_producer(self, stream_id=''):
        '''
        Launch the producer
        '''
        pid = self.container.spawn_process('better_data_producer', 'ion.processes.data.example_data_producer', 'BetterDataProducer', {'process':{'stream_id':stream_id}})
        self.addCleanup(self.container.terminate_process, pid)

    def make_simple_dataset(self):
        '''
        Makes a stream, a stream definition and a dataset, the essentials for most of these tests
        '''
        pdict_id             = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict', id_only=True)
        stream_def_id        = self.pubsub_management.create_stream_definition('ctd data %i' % self.i, parameter_dictionary_id=pdict_id)
        self.addCleanup(self.pubsub_management.delete_stream_definition, stream_def_id)
        stream_id, route     = self.pubsub_management.create_stream('ctd stream %i' % self.i, 'xp1', stream_definition_id=stream_def_id)
        self.addCleanup(self.pubsub_management.delete_stream, stream_id)

        dataset_id = self.create_dataset(pdict_id)

        # self.get_datastore(dataset_id)
        self.i += 1
        return stream_id, route, stream_def_id, dataset_id

    def publish_hifi(self,stream_id,stream_route,offset=0):
        '''
        Publish deterministic data
        '''

        pub = StandaloneStreamPublisher(stream_id, stream_route)

        stream_def = self.pubsub_management.read_stream_definition(stream_id=stream_id)
        stream_def_id = stream_def._id
        rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
        rdt['time'] = np.arange(10) + (offset * 10)
        rdt['temp'] = np.arange(10) + (offset * 10)
        pub.publish(rdt.to_granule())

    def publish_fake_data(self,stream_id, route):
        '''
        Make four granules
        '''
        for i in xrange(4):
            self.publish_hifi(stream_id,route,i)

    def start_ingestion(self, stream_id, dataset_id):
        '''
        Starts ingestion/persistence for a given dataset
        '''
        ingest_config_id = self.get_ingestion_config()
        self.ingestion_management.persist_data_stream(stream_id=stream_id, ingestion_configuration_id=ingest_config_id, dataset_id=dataset_id)
    
    def stop_ingestion(self, stream_id):
        ingest_config_id = self.get_ingestion_config()
        self.ingestion_management.unpersist_data_stream(stream_id=stream_id, ingestion_configuration_id=ingest_config_id)

    def validate_granule_subscription(self, msg, route, stream_id):
        '''
        Validation for granule format
        '''
        if msg == {}:
            return
        rdt = RecordDictionaryTool.load_from_granule(msg)
        log.info('%s', rdt.pretty_print())
        self.assertIsInstance(msg,Granule,'Message is improperly formatted. (%s)' % type(msg))
        self.event.set()

    def wait_until_we_have_enough_granules(self, dataset_id='',data_size=40):
        '''
        Loops until there is a sufficient amount of data in the dataset
        '''
        done = False
        with gevent.Timeout(40):
            while not done:
                extents = self.dataset_management.dataset_extents(dataset_id, 'time')
                granule = self.data_retriever.retrieve_last_data_points(dataset_id, 1)
                rdt     = RecordDictionaryTool.load_from_granule(granule)
                if rdt['time'] and rdt['time'][0] != rdt._pdict.get_context('time').fill_value and extents >= data_size:
                    done = True
                else:
                    gevent.sleep(0.2)


    #--------------------------------------------------------------------------------
    # Test Methods
    #--------------------------------------------------------------------------------

    def test_dm_end_2_end(self):
        #--------------------------------------------------------------------------------
        # Set up a stream and have a mock instrument (producer) send data
        #--------------------------------------------------------------------------------
        self.event.clear()

        # Get a precompiled parameter dictionary with basic ctd fields
        pdict_id = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict',id_only=True)
        context_ids = self.dataset_management.read_parameter_contexts(pdict_id, id_only=True)

        # Add a field that supports binary data input.
        bin_context = ParameterContext('binary',  param_type=ArrayType())
        context_ids.append(self.dataset_management.create_parameter_context('binary', bin_context.dump()))
        # Add another field that supports dictionary elements.
        rec_context = ParameterContext('records', param_type=RecordType())
        context_ids.append(self.dataset_management.create_parameter_context('records', rec_context.dump()))

        pdict_id = self.dataset_management.create_parameter_dictionary('replay_pdict', parameter_context_ids=context_ids, temporal_context='time')
        
        stream_definition = self.pubsub_management.create_stream_definition('ctd data', parameter_dictionary_id=pdict_id)


        stream_id, route = self.pubsub_management.create_stream('producer', exchange_point=self.exchange_point_name, stream_definition_id=stream_definition)

        #--------------------------------------------------------------------------------
        # Start persisting the data on the stream 
        # - Get the ingestion configuration from the resource registry
        # - Create the dataset
        # - call persist_data_stream to setup the subscription for the ingestion workers
        #   on the stream that you specify which causes the data to be persisted
        #--------------------------------------------------------------------------------

        ingest_config_id = self.get_ingestion_config()
        dataset_id = self.create_dataset(pdict_id)
        self.ingestion_management.persist_data_stream(stream_id=stream_id, ingestion_configuration_id=ingest_config_id, dataset_id=dataset_id)
        self.addCleanup(self.stop_ingestion, stream_id)

        #--------------------------------------------------------------------------------
        # Now the granules are ingesting and persisted
        #--------------------------------------------------------------------------------

        self.launch_producer(stream_id)
        self.wait_until_we_have_enough_granules(dataset_id,40)
        
        #--------------------------------------------------------------------------------
        # Now get the data in one chunk using an RPC call to the data retriever
        #--------------------------------------------------------------------------------
        
        replay_data = self.data_retriever.retrieve(dataset_id)
        self.assertIsInstance(replay_data, Granule)
        rdt = RecordDictionaryTool.load_from_granule(replay_data)
        self.assertTrue((rdt['time'][:10] == np.arange(10)).all(),'%s' % rdt['time'][:])
        self.assertTrue((rdt['binary'][:10] == np.array(['hi']*10, dtype='object')).all())

        
        #--------------------------------------------------------------------------------
        # Now to try the streamed approach
        #--------------------------------------------------------------------------------
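        # A note on the flow below (inferred from this test's own calls): define_replay binds the dataset
        # to the replay stream and returns both a replay id and the id of the replay agent process;
        # ReplayClient (further down) wraps calls to that process.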
        replay_stream_id, replay_route = self.pubsub_management.create_stream('replay_out', exchange_point=self.exchange_point_name, stream_definition_id=stream_definition)
        self.replay_id, process_id =  self.data_retriever.define_replay(dataset_id=dataset_id, stream_id=replay_stream_id)
        log.info('Process ID: %s', process_id)

        replay_client = ReplayClient(process_id)

    
        #--------------------------------------------------------------------------------
        # Create the listening endpoint for the retriever to talk to
        #--------------------------------------------------------------------------------
        sub_id = self.pubsub_management.create_subscription(self.exchange_space_name,stream_ids=[replay_stream_id])
        self.addCleanup(self.pubsub_management.delete_subscription, sub_id)
        self.pubsub_management.activate_subscription(sub_id)
        self.addCleanup(self.pubsub_management.deactivate_subscription, sub_id)
        subscriber = StandaloneStreamSubscriber(self.exchange_space_name, self.validate_granule_subscription)
        subscriber.start()
        self.addCleanup(subscriber.stop)

        self.data_retriever.start_replay_agent(self.replay_id)

        self.assertTrue(replay_client.await_agent_ready(5), 'The process never launched')
        replay_client.start_replay()
        
        self.assertTrue(self.event.wait(10))

        self.data_retriever.cancel_replay_agent(self.replay_id)


        #--------------------------------------------------------------------------------
        # Test the slicing capabilities
        #--------------------------------------------------------------------------------

        granule = self.data_retriever.retrieve(dataset_id=dataset_id, query={'tdoa':slice(0,5)})
        rdt = RecordDictionaryTool.load_from_granule(granule)
        b = rdt['time'] == np.arange(5)
        self.assertTrue(b.all() if not isinstance(b,bool) else b)


    def test_coverage_transform(self):
        ph = ParameterHelper(self.dataset_management, self.addCleanup)
        pdict_id = ph.create_parsed()
        stream_def_id = self.pubsub_management.create_stream_definition('ctd parsed', parameter_dictionary_id=pdict_id)
        self.addCleanup(self.pubsub_management.delete_stream_definition, stream_def_id)

        stream_id, route = self.pubsub_management.create_stream('example', exchange_point=self.exchange_point_name, stream_definition_id=stream_def_id)
        self.addCleanup(self.pubsub_management.delete_stream, stream_id)

        ingestion_config_id = self.get_ingestion_config()
        dataset_id = self.create_dataset(pdict_id)

        self.ingestion_management.persist_data_stream(stream_id=stream_id, ingestion_configuration_id=ingestion_config_id, dataset_id=dataset_id)
        self.addCleanup(self.ingestion_management.unpersist_data_stream, stream_id, ingestion_config_id)
        publisher = StandaloneStreamPublisher(stream_id, route)
        
        rdt = ph.get_rdt(stream_def_id)
        ph.fill_parsed_rdt(rdt)

        dataset_monitor = DatasetMonitor(dataset_id)
        self.addCleanup(dataset_monitor.stop)
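        # dataset_monitor.wait() blocks until the published granule has been ingested and persisted,
        # so the retrieve further down sees the data (this is what the test relies on).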

        publisher.publish(rdt.to_granule())
        self.assertTrue(dataset_monitor.wait())

        replay_granule = self.data_retriever.retrieve(dataset_id)
        rdt_out = RecordDictionaryTool.load_from_granule(replay_granule)

        np.testing.assert_array_almost_equal(rdt_out['time'], rdt['time'])
        np.testing.assert_array_almost_equal(rdt_out['temp'], rdt['temp'])

        np.testing.assert_allclose(rdt_out['conductivity_L1'], np.array([42.914]))
        np.testing.assert_allclose(rdt_out['temp_L1'], np.array([20.]))
        np.testing.assert_allclose(rdt_out['pressure_L1'], np.array([3.068]))
        np.testing.assert_allclose(rdt_out['density'], np.array([1021.7144739593881], dtype='float32'))
        np.testing.assert_allclose(rdt_out['salinity'], np.array([30.935132729668283], dtype='float32'))


    def test_ingestion_pause(self):
        ctd_stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
        ingestion_config_id = self.get_ingestion_config()
        self.start_ingestion(ctd_stream_id, dataset_id)
        self.addCleanup(self.stop_ingestion, ctd_stream_id)

        rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
        rdt['time'] = np.arange(10)

        publisher = StandaloneStreamPublisher(ctd_stream_id, route)
        monitor = DatasetMonitor(dataset_id)
        self.addCleanup(monitor.stop)
        publisher.publish(rdt.to_granule())
        self.assertTrue(monitor.wait())
        granule = self.data_retriever.retrieve(dataset_id)


        self.ingestion_management.pause_data_stream(ctd_stream_id, ingestion_config_id)

        monitor.event.clear()
        rdt['time'] = np.arange(10,20)
        publisher.publish(rdt.to_granule())
        self.assertFalse(monitor.event.wait(1))
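        # While the stream is paused the second granule is not persisted (hence the failed wait above);
        # after resuming, the full 0..19 range shows up in the retrieve below.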

        self.ingestion_management.resume_data_stream(ctd_stream_id, ingestion_config_id)

        self.assertTrue(monitor.wait())

        granule = self.data_retriever.retrieve(dataset_id)
        rdt2 = RecordDictionaryTool.load_from_granule(granule)
        np.testing.assert_array_almost_equal(rdt2['time'], np.arange(20))

    def test_last_granule(self):
        stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
        self.start_ingestion(stream_id, dataset_id)
        self.addCleanup(self.stop_ingestion, stream_id)

        self.publish_hifi(stream_id,route, 0)
        self.publish_hifi(stream_id,route, 1)
        

        self.wait_until_we_have_enough_granules(dataset_id,20) # I just need two


        success = False
        def verifier():
            replay_granule = self.data_retriever.retrieve_last_data_points(dataset_id, 10)

            rdt = RecordDictionaryTool.load_from_granule(replay_granule)

            comp = rdt['time'] == np.arange(10) + 10
            if not isinstance(comp, bool):
                return comp.all()
            return False
        success = poll(verifier)

        self.assertTrue(success)

        success = False
        def verify_points():
            replay_granule = self.data_retriever.retrieve_last_data_points(dataset_id, 5)

            rdt = RecordDictionaryTool.load_from_granule(replay_granule)

            comp = rdt['time'] == np.arange(15, 20)
            if not isinstance(comp, bool):
                return comp.all()
            return False
        success = poll(verify_points)

        self.assertTrue(success)

    def test_replay_with_parameters(self):
        #--------------------------------------------------------------------------------
        # Create the configurations and the dataset
        #--------------------------------------------------------------------------------
        # Get a precompiled parameter dictionary with basic ctd fields
        pdict_id = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict',id_only=True)
        context_ids = self.dataset_management.read_parameter_contexts(pdict_id, id_only=True)

        # Add a field that supports binary data input.
        bin_context = ParameterContext('binary',  param_type=ArrayType())
        context_ids.append(self.dataset_management.create_parameter_context('binary', bin_context.dump()))
        # Add another field that supports dictionary elements.
        rec_context = ParameterContext('records', param_type=RecordType())
        context_ids.append(self.dataset_management.create_parameter_context('records', rec_context.dump()))

        pdict_id = self.dataset_management.create_parameter_dictionary('replay_pdict', parameter_context_ids=context_ids, temporal_context='time')
        

        stream_def_id = self.pubsub_management.create_stream_definition('replay_stream', parameter_dictionary_id=pdict_id)
        
        stream_id, route  = self.pubsub_management.create_stream('replay_with_params', exchange_point=self.exchange_point_name, stream_definition_id=stream_def_id)
        config_id  = self.get_ingestion_config()
        dataset_id = self.create_dataset(pdict_id)
        self.ingestion_management.persist_data_stream(stream_id=stream_id, ingestion_configuration_id=config_id, dataset_id=dataset_id)
        self.addCleanup(self.stop_ingestion, stream_id)

        dataset_monitor = DatasetMonitor(dataset_id)
        self.addCleanup(dataset_monitor.stop)

        self.publish_fake_data(stream_id, route)

        self.assertTrue(dataset_monitor.wait())

        query = {
            'start_time': 0 - 2208988800,
            'end_time':   19 - 2208988800,
            'stride_time' : 2,
            'parameters': ['time','temp']
        }
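        # 2208988800 s is the offset between the Unix (1970) and NTP (1900) epochs; the bounds above
        # are Unix-style values shifted by that amount (cf. test_correct_time below).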
        retrieved_data = self.data_retriever.retrieve(dataset_id=dataset_id,query=query)

        rdt = RecordDictionaryTool.load_from_granule(retrieved_data)
        np.testing.assert_array_equal(rdt['time'], np.arange(0,20,2))
        self.assertEquals(set(rdt.iterkeys()), set(['time','temp']))

        extents = self.dataset_management.dataset_extents(dataset_id=dataset_id, parameters=['time','temp'])
        self.assertTrue(extents['time']>=20)
        self.assertTrue(extents['temp']>=20)

    def test_repersist_data(self):
        stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
        self.start_ingestion(stream_id, dataset_id)
        self.publish_hifi(stream_id,route,0)
        self.publish_hifi(stream_id,route,1)
        self.wait_until_we_have_enough_granules(dataset_id,20)
        config_id = self.get_ingestion_config()
        self.ingestion_management.unpersist_data_stream(stream_id=stream_id,ingestion_configuration_id=config_id)
        self.ingestion_management.persist_data_stream(stream_id=stream_id,ingestion_configuration_id=config_id,dataset_id=dataset_id)
        self.addCleanup(self.stop_ingestion, stream_id)
        self.publish_hifi(stream_id,route,2)
        self.publish_hifi(stream_id,route,3)
        self.wait_until_we_have_enough_granules(dataset_id,40)
        success = False
        with gevent.timeout.Timeout(5):
            while not success:

                replay_granule = self.data_retriever.retrieve(dataset_id)

                rdt = RecordDictionaryTool.load_from_granule(replay_granule)

                comp = rdt['time'] == np.arange(0,40)
                if not isinstance(comp,bool):
                    success = comp.all()
                gevent.sleep(1)

        self.assertTrue(success)


    @unittest.skip('deprecated')
    def test_correct_time(self):

        # There are 2208988800 seconds between Jan 1 1900 and Jan 1 1970, i.e. 
        #  the conversion factor between unix and NTP time
        unix_now = np.floor(time.time())
        ntp_now  = unix_now + 2208988800 

        unix_ago = unix_now - 20
        ntp_ago  = unix_ago + 2208988800

        stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
        coverage = DatasetManagementService._get_simplex_coverage(dataset_id, mode='a')
        coverage.insert_timesteps(20)
        coverage.set_parameter_values('time', np.arange(ntp_ago,ntp_now))
        
        temporal_bounds = self.dataset_management.dataset_temporal_bounds(dataset_id)

        self.assertTrue( np.abs(temporal_bounds[0] - unix_ago) < 2)
        self.assertTrue( np.abs(temporal_bounds[1] - unix_now) < 2)


    @attr('LOCOINT')
    @unittest.skipIf(os.getenv('CEI_LAUNCH_TEST', False), 'Host requires file-system access to coverage files, CEI mode does not support.')
    def test_out_of_band_retrieve(self):
        # Set up the environment
        stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
        self.start_ingestion(stream_id, dataset_id)
        
        # Fill the dataset
        self.publish_fake_data(stream_id, route)
        self.wait_until_we_have_enough_granules(dataset_id,40)

        # Retrieve the data
        granule = DataRetrieverService.retrieve_oob(dataset_id)
        rdt = RecordDictionaryTool.load_from_granule(granule)
        self.assertTrue((rdt['time'] == np.arange(40)).all())

    def publish_and_wait(self, dataset_id, granule):
        stream_ids, _ = self.resource_registry.find_objects(dataset_id, PRED.hasStream,id_only=True)
        stream_id=stream_ids[0]
        route = self.pubsub_management.read_stream_route(stream_id)
        publisher = StandaloneStreamPublisher(stream_id,route)
        dataset_monitor = DatasetMonitor(dataset_id)
        self.addCleanup(dataset_monitor.stop)
        publisher.publish(granule)
        self.assertTrue(dataset_monitor.wait())


    def test_sparse_values(self):
        ph = ParameterHelper(self.dataset_management, self.addCleanup)
        pdict_id = ph.create_sparse()
        stream_def_id = self.pubsub_management.create_stream_definition('sparse', parameter_dictionary_id=pdict_id)
        self.addCleanup(self.pubsub_management.delete_stream_definition, stream_def_id)
        stream_id, route = self.pubsub_management.create_stream('example', exchange_point=self.exchange_point_name, stream_definition_id=stream_def_id)
        self.addCleanup(self.pubsub_management.delete_stream, stream_id)
        dataset_id = self.create_dataset(pdict_id)
        self.start_ingestion(stream_id,dataset_id)
        self.addCleanup(self.stop_ingestion, stream_id)

        # Publish initial granule
        # the first granule carries the sparse values, setting lat to 45 and lon to -71
        ntp_now = time.time() + 2208988800
        rdt = ph.get_rdt(stream_def_id)
        rdt['time'] = [ntp_now]
        rdt['internal_timestamp'] = [ntp_now]
        rdt['temp'] = [300000]
        rdt['preferred_timestamp'] = ['driver_timestamp']
        rdt['port_timestamp'] = [ntp_now]
        rdt['quality_flag'] = ['']
        rdt['lat'] = [45]
        rdt['conductivity'] = [4341400]
        rdt['driver_timestamp'] = [ntp_now]
        rdt['lon'] = [-71]
        rdt['pressure'] = [256.8]

        publisher = StandaloneStreamPublisher(stream_id, route)
        dataset_monitor = DatasetMonitor(dataset_id)
        self.addCleanup(dataset_monitor.stop)
        publisher.publish(rdt.to_granule())
        self.assertTrue(dataset_monitor.wait())
        dataset_monitor.reset()

        replay_granule = self.data_retriever.retrieve(dataset_id)
        rdt_out = RecordDictionaryTool.load_from_granule(replay_granule)

        # Check the values and make sure they're correct
        np.testing.assert_allclose(rdt_out['time'], rdt['time'])
        np.testing.assert_allclose(rdt_out['temp'], rdt['temp'])
        np.testing.assert_allclose(rdt_out['lat'], np.array([45]))
        np.testing.assert_allclose(rdt_out['lon'], np.array([-71]))

        np.testing.assert_allclose(rdt_out['conductivity_L1'], np.array([42.914]))
        np.testing.assert_allclose(rdt_out['temp_L1'], np.array([20.]))
        np.testing.assert_allclose(rdt_out['pressure_L1'], np.array([3.068]))
        np.testing.assert_allclose(rdt_out['density'], np.array([1021.7144739593881], dtype='float32'))
        np.testing.assert_allclose(rdt_out['salinity'], np.array([30.935132729668283], dtype='float32'))


        # We're going to change the lat/lon
        rdt = ph.get_rdt(stream_def_id)
        rdt['time'] = time.time() + 2208988800
        rdt['lat'] = [46]
        rdt['lon'] = [-73]
        
        publisher.publish(rdt.to_granule())
        self.assertTrue(dataset_monitor.wait())
        dataset_monitor.reset()


        replay_granule = self.data_retriever.retrieve(dataset_id)
        rdt_out = RecordDictionaryTool.load_from_granule(replay_granule)

        np.testing.assert_allclose(rdt_out['time'], rdt['time'])
        
        for i in xrange(9):
            ntp_now = time.time() + 2208988800
            rdt['time'] = [ntp_now]
            rdt['internal_timestamp'] = [ntp_now]
            rdt['temp'] = [300000]
            rdt['preferred_timestamp'] = ['driver_timestamp']
            rdt['port_timestamp'] = [ntp_now]
            rdt['quality_flag'] = [None]
            rdt['conductivity'] = [4341400]
            rdt['driver_timestamp'] = [ntp_now]
            rdt['pressure'] = [256.8]

            publisher.publish(rdt.to_granule())
            self.assertTrue(dataset_monitor.wait())
            dataset_monitor.reset()

        replay_granule = self.data_retriever.retrieve(dataset_id)
        rdt_out = RecordDictionaryTool.load_from_granule(replay_granule)
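
        # Sparse parameters retain their most recent value: the first sample keeps lat=45/lon=-71,
        # while the nine samples published after the update carry lat=46/lon=-73 (asserted below).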

        np.testing.assert_allclose(rdt_out['pressure'], np.array([256.8] * 10))
        np.testing.assert_allclose(rdt_out['lat'], np.array([45] + [46] * 9))
        np.testing.assert_allclose(rdt_out['lon'], np.array([-71] + [-73] * 9))
Example #52
class TestDMEnd2End(IonIntegrationTestCase):
    def setUp(self):  # Love the non pep-8 convention
        self._start_container()

        self.container.start_rel_from_url("res/deploy/r2deploy.yml")

        self.process_dispatcher = ProcessDispatcherServiceClient()
        self.pubsub_management = PubsubManagementServiceClient()
        self.resource_registry = ResourceRegistryServiceClient()
        self.dataset_management = DatasetManagementServiceClient()
        self.ingestion_management = IngestionManagementServiceClient()
        self.data_retriever = DataRetrieverServiceClient()
        self.pids = []
        self.event = Event()
        self.exchange_space_name = "test_granules"
        self.exchange_point_name = "science_data"
        self.i = 0

        self.purge_queues()
        self.queue_buffer = []
        self.streams = []
        self.addCleanup(self.stop_all_ingestion)

    def purge_queues(self):
        xn = self.container.ex_manager.create_xn_queue("science_granule_ingestion")
        xn.purge()

    def tearDown(self):
        self.purge_queues()
        for pid in self.pids:
            self.container.proc_manager.terminate_process(pid)
        IngestionManagementIntTest.clean_subscriptions()
        for queue in self.queue_buffer:
            if isinstance(queue, ExchangeNameQueue):
                queue.delete()
            elif isinstance(queue, str):
                xn = self.container.ex_manager.create_xn_queue(queue)
                xn.delete()

    # --------------------------------------------------------------------------------
    # Helper/Utility methods
    # --------------------------------------------------------------------------------

    def create_dataset(self, parameter_dict_id=""):
        """
        Creates a time-series dataset
        """
        tdom, sdom = time_series_domain()
        sdom = sdom.dump()
        tdom = tdom.dump()
        if not parameter_dict_id:
            parameter_dict_id = self.dataset_management.read_parameter_dictionary_by_name(
                "ctd_parsed_param_dict", id_only=True
            )

        dataset_id = self.dataset_management.create_dataset(
            "test_dataset_%i" % self.i,
            parameter_dictionary_id=parameter_dict_id,
            spatial_domain=sdom,
            temporal_domain=tdom,
        )
        return dataset_id

    def get_datastore(self, dataset_id):
        """
        Gets an instance of the datastore
            This method is primarily used to defeat a bug where integration tests in multiple containers may sometimes 
            delete a CouchDB datastore and the other containers are unaware of the new state of the datastore.
        """
        dataset = self.dataset_management.read_dataset(dataset_id)
        datastore_name = dataset.datastore_name
        datastore = self.container.datastore_manager.get_datastore(datastore_name, DataStore.DS_PROFILE.SCIDATA)
        return datastore

    def get_ingestion_config(self):
        """
        Grab the ingestion configuration from the resource registry
        """
        # The ingestion configuration should have been created by the bootstrap service
        # which is configured through r2deploy.yml

        ingest_configs, _ = self.resource_registry.find_resources(restype=RT.IngestionConfiguration, id_only=True)
        return ingest_configs[0]

    def launch_producer(self, stream_id=""):
        """
        Launch the producer
        """

        pid = self.container.spawn_process(
            "better_data_producer",
            "ion.processes.data.example_data_producer",
            "BetterDataProducer",
            {"process": {"stream_id": stream_id}},
        )

        self.pids.append(pid)

    def make_simple_dataset(self):
        """
        Makes a stream, a stream definition and a dataset, the essentials for most of these tests
        """
        pdict_id = self.dataset_management.read_parameter_dictionary_by_name("ctd_parsed_param_dict", id_only=True)
        stream_def_id = self.pubsub_management.create_stream_definition("ctd data", parameter_dictionary_id=pdict_id)
        stream_id, route = self.pubsub_management.create_stream(
            "ctd stream %i" % self.i, "xp1", stream_definition_id=stream_def_id
        )

        dataset_id = self.create_dataset(pdict_id)

        self.get_datastore(dataset_id)
        self.i += 1
        return stream_id, route, stream_def_id, dataset_id

    def publish_hifi(self, stream_id, stream_route, offset=0):
        """
        Publish deterministic data
        """

        pub = StandaloneStreamPublisher(stream_id, stream_route)

        stream_def = self.pubsub_management.read_stream_definition(stream_id=stream_id)
        stream_def_id = stream_def._id
        rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
        rdt["time"] = np.arange(10) + (offset * 10)
        rdt["temp"] = np.arange(10) + (offset * 10)
        pub.publish(rdt.to_granule())

    def publish_fake_data(self, stream_id, route):
        """
        Make four granules
        """
        for i in xrange(4):
            self.publish_hifi(stream_id, route, i)

    def start_ingestion(self, stream_id, dataset_id):
        """
        Starts ingestion/persistence for a given dataset
        """
        ingest_config_id = self.get_ingestion_config()
        self.ingestion_management.persist_data_stream(
            stream_id=stream_id, ingestion_configuration_id=ingest_config_id, dataset_id=dataset_id
        )

    def stop_ingestion(self, stream_id):
        ingest_config_id = self.get_ingestion_config()
        self.ingestion_management.unpersist_data_stream(
            stream_id=stream_id, ingestion_configuration_id=ingest_config_id
        )

    def stop_all_ingestion(self):
        try:
            [self.stop_ingestion(sid) for sid in self.streams]
        except:
            pass

    def validate_granule_subscription(self, msg, route, stream_id):
        """
        Validation for granule format
        """
        if msg == {}:
            return
        rdt = RecordDictionaryTool.load_from_granule(msg)
        log.info("%s", rdt.pretty_print())
        self.assertIsInstance(msg, Granule, "Message is improperly formatted. (%s)" % type(msg))
        self.event.set()

    def wait_until_we_have_enough_granules(self, dataset_id="", data_size=40):
        """
        Loops until there is a sufficient amount of data in the dataset
        """
        done = False
        with gevent.Timeout(40):
            while not done:
                extents = self.dataset_management.dataset_extents(dataset_id, "time")[0]
                granule = self.data_retriever.retrieve_last_data_points(dataset_id, 1)
                rdt = RecordDictionaryTool.load_from_granule(granule)
                if rdt["time"] and rdt["time"][0] != rdt._pdict.get_context("time").fill_value and extents >= data_size:
                    done = True
                else:
                    gevent.sleep(0.2)

    # --------------------------------------------------------------------------------
    # Test Methods
    # --------------------------------------------------------------------------------

    @attr("SMOKE")
    def test_dm_end_2_end(self):
        # --------------------------------------------------------------------------------
        # Set up a stream and have a mock instrument (producer) send data
        # --------------------------------------------------------------------------------
        self.event.clear()

        # Get a precompiled parameter dictionary with basic ctd fields
        pdict_id = self.dataset_management.read_parameter_dictionary_by_name("ctd_parsed_param_dict", id_only=True)
        context_ids = self.dataset_management.read_parameter_contexts(pdict_id, id_only=True)

        # Add a field that supports binary data input.
        bin_context = ParameterContext("binary", param_type=ArrayType())
        context_ids.append(self.dataset_management.create_parameter_context("binary", bin_context.dump()))
        # Add another field that supports dictionary elements.
        rec_context = ParameterContext("records", param_type=RecordType())
        context_ids.append(self.dataset_management.create_parameter_context("records", rec_context.dump()))

        pdict_id = self.dataset_management.create_parameter_dictionary(
            "replay_pdict", parameter_context_ids=context_ids, temporal_context="time"
        )

        stream_definition = self.pubsub_management.create_stream_definition(
            "ctd data", parameter_dictionary_id=pdict_id
        )

        stream_id, route = self.pubsub_management.create_stream(
            "producer", exchange_point=self.exchange_point_name, stream_definition_id=stream_definition
        )

        # --------------------------------------------------------------------------------
        # Start persisting the data on the stream
        # - Get the ingestion configuration from the resource registry
        # - Create the dataset
        # - call persist_data_stream to setup the subscription for the ingestion workers
        #   on the stream that you specify which causes the data to be persisted
        # --------------------------------------------------------------------------------

        ingest_config_id = self.get_ingestion_config()
        dataset_id = self.create_dataset(pdict_id)
        self.ingestion_management.persist_data_stream(
            stream_id=stream_id, ingestion_configuration_id=ingest_config_id, dataset_id=dataset_id
        )

        # --------------------------------------------------------------------------------
        # Now the granules are ingesting and persisted
        # --------------------------------------------------------------------------------

        self.launch_producer(stream_id)
        self.wait_until_we_have_enough_granules(dataset_id, 40)

        # --------------------------------------------------------------------------------
        # Now get the data in one chunk using an RPC call to the data retriever
        # --------------------------------------------------------------------------------

        replay_data = self.data_retriever.retrieve(dataset_id)
        self.assertIsInstance(replay_data, Granule)
        rdt = RecordDictionaryTool.load_from_granule(replay_data)
        self.assertTrue((rdt["time"][:10] == np.arange(10)).all(), "%s" % rdt["time"][:])
        self.assertTrue((rdt["binary"][:10] == np.array(["hi"] * 10, dtype="object")).all())

        # --------------------------------------------------------------------------------
        # Now to try the streamed approach
        # --------------------------------------------------------------------------------
        replay_stream_id, replay_route = self.pubsub_management.create_stream(
            "replay_out", exchange_point=self.exchange_point_name, stream_definition_id=stream_definition
        )
        self.replay_id, process_id = self.data_retriever.define_replay(
            dataset_id=dataset_id, stream_id=replay_stream_id
        )
        log.info("Process ID: %s", process_id)

        replay_client = ReplayClient(process_id)

        # --------------------------------------------------------------------------------
        # Create the listening endpoint for the retriever to talk to
        # --------------------------------------------------------------------------------
        xp = self.container.ex_manager.create_xp(self.exchange_point_name)
        subscriber = StandaloneStreamSubscriber(self.exchange_space_name, self.validate_granule_subscription)
        self.queue_buffer.append(self.exchange_space_name)
        subscriber.start()
        subscriber.xn.bind(replay_route.routing_key, xp)

        self.data_retriever.start_replay_agent(self.replay_id)

        self.assertTrue(replay_client.await_agent_ready(5), "The process never launched")
        replay_client.start_replay()

        self.assertTrue(self.event.wait(10))
        subscriber.stop()

        self.data_retriever.cancel_replay_agent(self.replay_id)

        # --------------------------------------------------------------------------------
        # Test the slicing capabilities
        # --------------------------------------------------------------------------------

        granule = self.data_retriever.retrieve(dataset_id=dataset_id, query={"tdoa": slice(0, 5)})
        rdt = RecordDictionaryTool.load_from_granule(granule)
        b = rdt["time"] == np.arange(5)
        self.assertTrue(b.all() if not isinstance(b, bool) else b)
        self.streams.append(stream_id)
        self.stop_ingestion(stream_id)

    @unittest.skip("Doesnt work")
    @attr("LOCOINT")
    @unittest.skipIf(os.getenv("CEI_LAUNCH_TEST", False), "Skip test while in CEI LAUNCH mode")
    def test_replay_pause(self):
        # Get a precompiled parameter dictionary with basic ctd fields
        pdict_id = self.dataset_management.read_parameter_dictionary_by_name("ctd_parsed_param_dict", id_only=True)
        context_ids = self.dataset_management.read_parameter_contexts(pdict_id, id_only=True)

        # Add a field that supports binary data input.
        bin_context = ParameterContext("binary", param_type=ArrayType())
        context_ids.append(self.dataset_management.create_parameter_context("binary", bin_context.dump()))
        # Add another field that supports dictionary elements.
        rec_context = ParameterContext("records", param_type=RecordType())
        context_ids.append(self.dataset_management.create_parameter_context("records", rec_context.dump()))

        pdict_id = self.dataset_management.create_parameter_dictionary(
            "replay_pdict", parameter_context_ids=context_ids, temporal_context="time"
        )

        stream_def_id = self.pubsub_management.create_stream_definition(
            "replay_stream", parameter_dictionary_id=pdict_id
        )
        replay_stream, replay_route = self.pubsub_management.create_stream(
            "replay", "xp1", stream_definition_id=stream_def_id
        )
        dataset_id = self.create_dataset(pdict_id)
        scov = DatasetManagementService._get_coverage(dataset_id)

        bb = CoverageCraft(scov)
        bb.rdt["time"] = np.arange(100)
        bb.rdt["temp"] = np.random.random(100) + 30
        bb.sync_with_granule()

        DatasetManagementService._persist_coverage(
            dataset_id, bb.coverage
        )  # This invalidates it for multi-host configurations
        # Set up the subscriber to verify the data
        subscriber = StandaloneStreamSubscriber(self.exchange_space_name, self.validate_granule_subscription)
        xp = self.container.ex_manager.create_xp("xp1")
        self.queue_buffer.append(self.exchange_space_name)
        subscriber.start()
        subscriber.xn.bind(replay_route.routing_key, xp)

        # Set up the replay agent and the client wrapper

        # 1) Define the Replay (dataset and stream to publish on)
        self.replay_id, process_id = self.data_retriever.define_replay(dataset_id=dataset_id, stream_id=replay_stream)
        # 2) Make a client to interact with the process (optionally provide it a process to bind with)
        replay_client = ReplayClient(process_id)
        # 3) Start the agent (launch the process)
        self.data_retriever.start_replay_agent(self.replay_id)
        # 4) Start replaying...
        replay_client.start_replay()

        # Wait till we get some granules
        self.assertTrue(self.event.wait(5))

        # We got granules, pause the replay, clear the queue and allow the process to finish consuming
        replay_client.pause_replay()
        gevent.sleep(1)
        subscriber.xn.purge()
        self.event.clear()

        # Make sure there's no remaining messages being consumed
        self.assertFalse(self.event.wait(1))

        # Resume the replay and wait until we start getting granules again
        replay_client.resume_replay()
        self.assertTrue(self.event.wait(5))

        # Stop the replay, clear the queues
        replay_client.stop_replay()
        gevent.sleep(1)
        subscriber.xn.purge()
        self.event.clear()

        # Make sure that it did indeed stop
        self.assertFalse(self.event.wait(1))

        subscriber.stop()

    def test_retrieve_and_transform(self):
        # Make a simple dataset and start ingestion, pretty standard stuff.
        ctd_stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
        self.start_ingestion(ctd_stream_id, dataset_id)

        # Stream definition for the salinity data
        salinity_pdict_id = self.dataset_management.read_parameter_dictionary_by_name(
            "ctd_parsed_param_dict", id_only=True
        )
        sal_stream_def_id = self.pubsub_management.create_stream_definition(
            "sal data", parameter_dictionary_id=salinity_pdict_id
        )

        rdt = RecordDictionaryTool(stream_definition_id=stream_def_id)
        rdt["time"] = np.arange(10)
        rdt["temp"] = np.random.randn(10) * 10 + 30
        rdt["conductivity"] = np.random.randn(10) * 2 + 10
        rdt["pressure"] = np.random.randn(10) * 1 + 12

        publisher = StandaloneStreamPublisher(ctd_stream_id, route)
        publisher.publish(rdt.to_granule())

        rdt["time"] = np.arange(10, 20)

        publisher.publish(rdt.to_granule())

        self.wait_until_we_have_enough_granules(dataset_id, 20)

        granule = self.data_retriever.retrieve(
            dataset_id,
            None,
            None,
            "ion.processes.data.transforms.ctd.ctd_L2_salinity",
            "CTDL2SalinityTransformAlgorithm",
            kwargs=dict(params=sal_stream_def_id),
        )
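        # retrieve() here also applies a transform to the replayed data: the module/class above name the
        # salinity algorithm run over the granule before it is returned (the two None positional arguments
        # are assumed, from the call pattern in this test, to be the query and delivery format).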
        rdt = RecordDictionaryTool.load_from_granule(granule)
        for i in rdt["salinity"]:
            self.assertNotEquals(i, 0)
        self.streams.append(ctd_stream_id)
        self.stop_ingestion(ctd_stream_id)

    def test_last_granule(self):
        stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
        self.start_ingestion(stream_id, dataset_id)

        self.publish_hifi(stream_id, route, 0)
        self.publish_hifi(stream_id, route, 1)

        self.wait_until_we_have_enough_granules(dataset_id, 20)  # I just need two

        success = False

        def verifier():
            replay_granule = self.data_retriever.retrieve_last_data_points(dataset_id, 10)

            rdt = RecordDictionaryTool.load_from_granule(replay_granule)

            comp = rdt["time"] == np.arange(10) + 10
            if not isinstance(comp, bool):
                return comp.all()
            return False

        success = poll(verifier)

        self.assertTrue(success)

        success = False

        def verify_points():
            replay_granule = self.data_retriever.retrieve_last_data_points(dataset_id, 5)

            rdt = RecordDictionaryTool.load_from_granule(replay_granule)

            comp = rdt["time"] == np.arange(15, 20)
            if not isinstance(comp, bool):
                return comp.all()
            return False

        success = poll(verify_points)

        self.assertTrue(success)
        self.streams.append(stream_id)
        self.stop_ingestion(stream_id)

    def test_replay_with_parameters(self):
        # --------------------------------------------------------------------------------
        # Create the configurations and the dataset
        # --------------------------------------------------------------------------------
        # Get a precompiled parameter dictionary with basic ctd fields
        pdict_id = self.dataset_management.read_parameter_dictionary_by_name("ctd_parsed_param_dict", id_only=True)
        context_ids = self.dataset_management.read_parameter_contexts(pdict_id, id_only=True)

        # Add a field that supports binary data input.
        bin_context = ParameterContext("binary", param_type=ArrayType())
        context_ids.append(self.dataset_management.create_parameter_context("binary", bin_context.dump()))
        # Add another field that supports dictionary elements.
        rec_context = ParameterContext("records", param_type=RecordType())
        context_ids.append(self.dataset_management.create_parameter_context("records", rec_context.dump()))

        pdict_id = self.dataset_management.create_parameter_dictionary(
            "replay_pdict", parameter_context_ids=context_ids, temporal_context="time"
        )

        stream_def_id = self.pubsub_management.create_stream_definition(
            "replay_stream", parameter_dictionary_id=pdict_id
        )

        stream_id, route = self.pubsub_management.create_stream(
            "replay_with_params", exchange_point=self.exchange_point_name, stream_definition_id=stream_def_id
        )
        config_id = self.get_ingestion_config()
        dataset_id = self.create_dataset(pdict_id)
        self.ingestion_management.persist_data_stream(
            stream_id=stream_id, ingestion_configuration_id=config_id, dataset_id=dataset_id
        )

        # --------------------------------------------------------------------------------
        # Coerce the datastore into existence (avoids a race condition)
        # --------------------------------------------------------------------------------
        self.get_datastore(dataset_id)

        self.launch_producer(stream_id)

        self.wait_until_we_have_enough_granules(dataset_id, 40)

        query = {
            "start_time": 0 - 2208988800,
            "end_time": 20 - 2208988800,
            "stride_time": 2,
            "parameters": ["time", "temp"],
        }
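        # As in the earlier example, the 2208988800 s subtraction is the offset between the Unix (1970)
        # and NTP (1900) epochs, hence the shifted bounds.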
        retrieved_data = self.data_retriever.retrieve(dataset_id=dataset_id, query=query)

        rdt = RecordDictionaryTool.load_from_granule(retrieved_data)
        comp = np.arange(0, 20, 2) == rdt["time"]
        self.assertTrue(comp.all(), "%s" % rdt.pretty_print())
        self.assertEquals(set(rdt.iterkeys()), set(["time", "temp"]))

        extents = self.dataset_management.dataset_extents(dataset_id=dataset_id, parameters=["time", "temp"])
        self.assertTrue(extents["time"] >= 20)
        self.assertTrue(extents["temp"] >= 20)

        self.streams.append(stream_id)
        self.stop_ingestion(stream_id)

    def test_repersist_data(self):
        stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
        self.start_ingestion(stream_id, dataset_id)
        self.publish_hifi(stream_id, route, 0)
        self.publish_hifi(stream_id, route, 1)
        self.wait_until_we_have_enough_granules(dataset_id, 20)
        config_id = self.get_ingestion_config()
        self.ingestion_management.unpersist_data_stream(stream_id=stream_id, ingestion_configuration_id=config_id)
        self.ingestion_management.persist_data_stream(
            stream_id=stream_id, ingestion_configuration_id=config_id, dataset_id=dataset_id
        )
        self.publish_hifi(stream_id, route, 2)
        self.publish_hifi(stream_id, route, 3)
        self.wait_until_we_have_enough_granules(dataset_id, 40)
        success = False
        with gevent.timeout.Timeout(5):
            while not success:
                replay_granule = self.data_retriever.retrieve(dataset_id)
                rdt = RecordDictionaryTool.load_from_granule(replay_granule)
                comp = rdt["time"] == np.arange(0, 40)
                if not isinstance(comp, bool):
                    success = comp.all()
                gevent.sleep(1)

        self.assertTrue(success)
        self.streams.append(stream_id)
        self.stop_ingestion(stream_id)

    @attr("LOCOINT")
    @unittest.skipIf(
        os.getenv("CEI_LAUNCH_TEST", False),
        "Host requires file-system access to coverage files, CEI mode does not support.",
    )
    def test_correct_time(self):

        # There are 2208988800 seconds between Jan 1 1900 and Jan 1 1970, i.e.
        #  the offset between Unix and NTP time
        unix_now = np.floor(time.time())
        ntp_now = unix_now + 2208988800

        unix_ago = unix_now - 20
        ntp_ago = unix_ago + 2208988800

        stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
        coverage = DatasetManagementService._get_coverage(dataset_id)
        coverage.insert_timesteps(20)
        coverage.set_parameter_values("time", np.arange(ntp_ago, ntp_now))

        temporal_bounds = self.dataset_management.dataset_temporal_bounds(dataset_id)

        self.assertTrue(np.abs(temporal_bounds[0] - unix_ago) < 2)
        self.assertTrue(np.abs(temporal_bounds[1] - unix_now) < 2)
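
    # Minimal helper sketch (not used by the tests above) making the Unix <-> NTP
    # conversion explicit; the helper names are illustrative assumptions, not part
    # of the service API.
    @staticmethod
    def _unix_to_ntp(unix_secs):
        # NTP's epoch (1900-01-01) precedes the Unix epoch (1970-01-01) by 2208988800 s,
        # e.g. Unix 1356998400 (2013-01-01T00:00:00Z) -> NTP 3565987200
        return unix_secs + 2208988800

    @staticmethod
    def _ntp_to_unix(ntp_secs):
        return ntp_secs - 2208988800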

    @attr("LOCOINT")
    @unittest.skipIf(
        os.getenv("CEI_LAUNCH_TEST", False),
        "Host requires file-system access to coverage files, CEI mode does not support.",
    )
    def test_empty_coverage_time(self):

        stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
        coverage = DatasetManagementService._get_coverage(dataset_id)
        temporal_bounds = self.dataset_management.dataset_temporal_bounds(dataset_id)
        self.assertEquals([coverage.get_parameter_context("time").fill_value] * 2, temporal_bounds)

    @attr("LOCOINT")
    @unittest.skipIf(
        os.getenv("CEI_LAUNCH_TEST", False),
        "Host requires file-system access to coverage files, CEI mode does not support.",
    )
    def test_out_of_band_retrieve(self):
        # Set up the environment
        stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
        self.start_ingestion(stream_id, dataset_id)

        # Fill the dataset
        self.publish_fake_data(stream_id, route)
        self.wait_until_we_have_enough_granules(dataset_id, 40)

        # Retrieve the data
        granule = DataRetrieverService.retrieve_oob(dataset_id)
        rdt = RecordDictionaryTool.load_from_granule(granule)
        self.assertTrue((rdt["time"] == np.arange(40)).all())

    @attr("LOCOINT")
    @unittest.skipIf(
        os.getenv("CEI_LAUNCH_TEST", False),
        "Host requires file-system access to coverage files, CEI mode does not support.",
    )
    def test_retrieve_cache(self):
        DataRetrieverService._refresh_interval = 1
        datasets = [self.make_simple_dataset() for i in xrange(10)]
        for stream_id, route, stream_def_id, dataset_id in datasets:
            coverage = DatasetManagementService._get_coverage(dataset_id)
            coverage.insert_timesteps(10)
            coverage.set_parameter_values("time", np.arange(10))
            coverage.set_parameter_values("temp", np.arange(10))

        # Verify cache hit and refresh
        dataset_ids = [i[3] for i in datasets]
        self.assertTrue(dataset_ids[0] not in DataRetrieverService._retrieve_cache)
        DataRetrieverService._get_coverage(dataset_ids[0])  # Hit the cache
        cov, age = DataRetrieverService._retrieve_cache[dataset_ids[0]]
        # Verify that the coverage is now cached
        self.assertTrue(dataset_ids[0] in DataRetrieverService._retrieve_cache)

        gevent.sleep(DataRetrieverService._refresh_interval + 0.2)

        DataRetrieverService._get_coverage(dataset_ids[0])  # Hit the cache again after the refresh interval
        cov, age2 = DataRetrieverService._retrieve_cache[dataset_ids[0]]
        self.assertTrue(age2 != age)

        for dataset_id in dataset_ids:
            DataRetrieverService._get_coverage(dataset_id)

        self.assertTrue(dataset_ids[0] not in DataRetrieverService._retrieve_cache)

        stream_id, route, stream_def, dataset_id = datasets[0]
        self.start_ingestion(stream_id, dataset_id)
        DataRetrieverService._get_coverage(dataset_id)

        self.assertTrue(dataset_id in DataRetrieverService._retrieve_cache)

        DataRetrieverService._refresh_interval = 100
        self.publish_hifi(stream_id, route, 1)
        self.wait_until_we_have_enough_granules(dataset_id, data_size=20)

        event = gevent.event.Event()
        with gevent.Timeout(20):
            while not event.wait(0.1):
                if dataset_id not in DataRetrieverService._retrieve_cache:
                    event.set()

        self.assertTrue(event.is_set())

    @unittest.skip("Outdated due to ingestion retry")
    @attr("LOCOINT")
    @unittest.skipIf(
        os.getenv("CEI_LAUNCH_TEST", False),
        "Host requires file-system access to coverage files, CEI mode does not support.",
    )
    def test_ingestion_failover(self):
        stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
        self.start_ingestion(stream_id, dataset_id)

        event = Event()

        def cb(*args, **kwargs):
            event.set()

        sub = EventSubscriber(event_type="ExceptionEvent", callback=cb, origin="stream_exception")
        sub.start()

        self.publish_fake_data(stream_id, route)
        self.wait_until_we_have_enough_granules(dataset_id, 40)

        file_path = DatasetManagementService._get_coverage_path(dataset_id)
        master_file = os.path.join(file_path, "%s_master.hdf5" % dataset_id)

        with open(master_file, "w") as f:
            f.write("this will crash HDF")

        self.publish_hifi(stream_id, route, 5)

        self.assertTrue(event.wait(10))

        sub.stop()
Example #53
0
    def test_usgs_integration(self):
        '''
        test_usgs_integration
        Test full DM Services Integration using usgs
        '''
        cc = self.container
        assertions = self.assertTrue

        #-----------------------------
        # Copy below here
        #-----------------------------
        pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
        ingestion_management_service = IngestionManagementServiceClient(node=cc.node)
        dataset_management_service = DatasetManagementServiceClient(node=cc.node)
        data_retriever_service = DataRetrieverServiceClient(node=cc.node)
        transform_management_service = TransformManagementServiceClient(node=cc.node)
        process_dispatcher = ProcessDispatcherServiceClient(node=cc.node)

        process_list = []
        datasets = []

        datastore_name = 'test_usgs_integration'


        #---------------------------
        # Set up ingestion
        #---------------------------
        # Configure ingestion using eight workers, ingesting into the test_usgs_integration datastore with the SCIDATA profile
        log.debug('Calling create_ingestion_configuration')
        ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
            exchange_point_id='science_data',
            couch_storage=CouchStorage(datastore_name=datastore_name,datastore_profile='SCIDATA'),
            number_of_workers=8
        )
        ingestion_management_service.activate_ingestion_configuration(
            ingestion_configuration_id=ingestion_configuration_id)

        usgs_stream_def = USGS_stream_definition()

        stream_def_id = pubsub_management_service.create_stream_definition(container=usgs_stream_def, name='Junk definition')


        #---------------------------
        # Set up the producers (USGS publishers)
        #---------------------------
        # Launch two simulated USGS data producers
        for iteration in xrange(2):
            # Make a stream to output on

            stream_id = pubsub_management_service.create_stream(stream_definition_id=stream_def_id)

            #---------------------------
            # Set up the datasets
            #---------------------------
            dataset_id = dataset_management_service.create_dataset(
                stream_id=stream_id,
                datastore_name=datastore_name,
                view_name='datasets/stream_join_granule'
            )
            # Keep track of the datasets
            datasets.append(dataset_id)

            stream_policy_id = ingestion_management_service.create_dataset_configuration(
                dataset_id = dataset_id,
                archive_data = True,
                archive_metadata = True,
                ingestion_configuration_id = ingestion_configuration_id
            )


            producer_definition = ProcessDefinition()
            producer_definition.executable = {
                'module':'ion.agents.eoi.handler.usgs_stream_publisher',
                'class':'UsgsPublisher'
            }
            configuration = {
                'process':{
                    'stream_id':stream_id,
                    }
            }
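            # Descriptive note: this configuration is presumably surfaced to the launched
            # process as CFG.process.stream_id, telling the publisher which stream to use.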
            procdef_id = process_dispatcher.create_process_definition(process_definition=producer_definition)
            log.debug('procdef_id: %s', procdef_id)
            pid = process_dispatcher.schedule_process(process_definition_id=procdef_id, configuration=configuration)


            # Keep track of the processes so we can cancel them later.
            process_list.append(pid)
        # Let the producers generate about 4 seconds of data
        time.sleep(4)

        #---------------------------
        # Stop producing data
        #---------------------------

        for process in process_list:
            process_dispatcher.cancel_process(process)

        #----------------------------------------------
        # Set up the replay and the capture transform
        #----------------------------------------------

        transform_definition = ProcessDefinition()
        transform_definition.executable = {
            'module':'ion.processes.data.transforms.transform_example',
            'class':'TransformCapture'
        }
        transform_definition_id = process_dispatcher.create_process_definition(process_definition=transform_definition)

        dataset_id = datasets.pop() # Just need one for now
        replay_id, stream_id = data_retriever_service.define_replay(dataset_id=dataset_id)

        #--------------------------------------------
        # Create a subscription for the capture transform
        #--------------------------------------------

        subscription = pubsub_management_service.create_subscription(query=StreamQuery(stream_ids=[stream_id]),
            exchange_name='transform_capture_point')

        #--------------------------------------------
        # Start the transform (capture)
        #--------------------------------------------
        transform_id = transform_management_service.create_transform(
            name='capture_transform',
            in_subscription_id=subscription,
            process_definition_id=transform_definition_id
        )

        transform_management_service.activate_transform(transform_id=transform_id)

        #--------------------------------------------
        # BEGIN REPLAY!
        #--------------------------------------------

        data_retriever_service.start_replay(replay_id=replay_id)

        #--------------------------------------------
        # Get the dataset bounds
        #--------------------------------------------

        bounds = dataset_management_service.get_dataset_bounds(dataset_id=dataset_id)
Example #54
0
class TestDataProductManagementServiceIntegration(IonIntegrationTestCase):

    def setUp(self):
        # Start container
        #print 'instantiating container'
        self._start_container()

        self.container.start_rel_from_url('res/deploy/r2deploy.yml')

        self.dpsc_cli = DataProductManagementServiceClient(node=self.container.node)
        self.rrclient = ResourceRegistryServiceClient(node=self.container.node)
        self.damsclient = DataAcquisitionManagementServiceClient(node=self.container.node)
        self.pubsubcli =  PubsubManagementServiceClient(node=self.container.node)
        self.ingestclient = IngestionManagementServiceClient(node=self.container.node)
        self.process_dispatcher   = ProcessDispatcherServiceClient()
        self.dataset_management = DatasetManagementServiceClient()
        self.unsc = UserNotificationServiceClient()
        self.data_retriever = DataRetrieverServiceClient()

        #------------------------------------------
        # Create the environment
        #------------------------------------------

        datastore_name = CACHE_DATASTORE_NAME
        self.db = self.container.datastore_manager.get_datastore(datastore_name)
        self.stream_def_id = self.pubsubcli.create_stream_definition(name='SBE37_CDM')

        self.process_definitions  = {}
        ingestion_worker_definition = ProcessDefinition(name='ingestion worker')
        ingestion_worker_definition.executable = {
            'module':'ion.processes.data.ingestion.science_granule_ingestion_worker',
            'class' :'ScienceGranuleIngestionWorker'
        }
        process_definition_id = self.process_dispatcher.create_process_definition(process_definition=ingestion_worker_definition)
        self.process_definitions['ingestion_worker'] = process_definition_id

        self.pids = []
        self.exchange_points = []
        self.exchange_names = []

        #------------------------------------------------------------------------------------------------
        # First launch the ingestors
        #------------------------------------------------------------------------------------------------
        self.exchange_space       = 'science_granule_ingestion'
        self.exchange_point       = 'science_data'
        config = DotDict()
        config.process.datastore_name = 'datasets'
        config.process.queue_name = self.exchange_space
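        # Descriptive note: DotDict allows attribute-style assignment of nested keys, so
        # the two lines above are presumably equivalent to
        #   {'process': {'datastore_name': 'datasets', 'queue_name': self.exchange_space}}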

        self.exchange_names.append(self.exchange_space)
        self.exchange_points.append(self.exchange_point)

        pid = self.process_dispatcher.schedule_process(self.process_definitions['ingestion_worker'],configuration=config)
        log.debug("the ingestion worker process id: %s", pid)
        self.pids.append(pid)

        self.addCleanup(self.cleaning_up)

    def cleaning_up(self):
        log.debug("number of pids to be terminated: %s", len(self.pids))
        for pid in self.pids:
            try:
                self.process_dispatcher.cancel_process(pid)
                log.debug("Terminated the process: %s", pid)
            except Exception:
                log.debug("could not terminate the process id: %s", pid)
        IngestionManagementIntTest.clean_subscriptions()

        for xn in self.exchange_names:
            xni = self.container.ex_manager.create_xn_queue(xn)
            xni.delete()
        for xp in self.exchange_points:
            xpi = self.container.ex_manager.create_xp(xp)
            xpi.delete()

    def get_datastore(self, dataset_id):
        dataset = self.dataset_management.read_dataset(dataset_id)
        datastore_name = dataset.datastore_name
        datastore = self.container.datastore_manager.get_datastore(datastore_name, DataStore.DS_PROFILE.SCIDATA)
        return datastore


    def test_create_data_product(self):

        #------------------------------------------------------------------------------------------------
        # create a stream definition for the data from the ctd simulator
        #------------------------------------------------------------------------------------------------
        parameter_dictionary_id = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict')
        ctd_stream_def_id = self.pubsubcli.create_stream_definition(name='Simulated CTD data', parameter_dictionary_id=parameter_dictionary_id)
        log.debug("Created stream def id %s" % ctd_stream_def_id)

        #------------------------------------------------------------------------------------------------
        # test creating a new data product (DP1) with a stream definition
        #------------------------------------------------------------------------------------------------

        # Generic time-series data domain creation
        tdom, sdom = time_series_domain()



        dp_obj = IonObject(RT.DataProduct,
            name='DP1',
            description='some new dp',
            temporal_domain = tdom.dump(), 
            spatial_domain = sdom.dump())

        log.debug("Created an IonObject for a data product: %s" % dp_obj)

        #------------------------------------------------------------------------------------------------
        # Create the data product and activate its persistence
        #------------------------------------------------------------------------------------------------

        dp_id = self.dpsc_cli.create_data_product( data_product= dp_obj,
                                            stream_definition_id=ctd_stream_def_id)
        self.dpsc_cli.activate_data_product_persistence(dp_id)

        dp_obj = self.dpsc_cli.read_data_product(dp_id)
        self.assertIsNotNone(dp_obj)

        #------------------------------------------------------------------------------------------------
        # test creating a new data product (DP2) with a stream definition
        #------------------------------------------------------------------------------------------------
        log.debug('Creating new data product with a stream definition')
        dp_obj = IonObject(RT.DataProduct,
            name='DP2',
            description='some new dp',
            temporal_domain = tdom.dump(),
            spatial_domain = sdom.dump())

        dp_id2 = self.dpsc_cli.create_data_product(dp_obj, ctd_stream_def_id)
        self.dpsc_cli.activate_data_product_persistence(dp_id2)
        log.debug('new dp_id = %s' % dp_id2)

        #------------------------------------------------------------------------------------------------
        # make sure the data product is associated with the stream definition
        #------------------------------------------------------------------------------------------------
        streamdefs = []
        streams, _ = self.rrclient.find_objects(dp_id2, PRED.hasStream, RT.Stream, True)
        for s in streams:
            log.debug("Checking stream %s" % s)
            sdefs, _ = self.rrclient.find_objects(s, PRED.hasStreamDefinition, RT.StreamDefinition, True)
            for sd in sdefs:
                log.debug("Checking streamdef %s" % sd)
                streamdefs.append(sd)
        self.assertIn(ctd_stream_def_id, streamdefs)
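        # The association chain being verified above:
        #   DataProduct --hasStream--> Stream --hasStreamDefinition--> StreamDefinition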


        # test reading a non-existent data product
        log.debug('reading non-existent data product')

        with self.assertRaises(NotFound):
            dp_obj = self.dpsc_cli.read_data_product('some_fake_id')

        # update a data product (tests read also)
        log.debug('Updating data product')
        # first get the existing dp object
        dp_obj = self.dpsc_cli.read_data_product(dp_id)

        # now tweak the object
        dp_obj.description = 'the very first dp'
        # now write the dp back to the registry
        update_result = self.dpsc_cli.update_data_product(dp_obj)

        # now get the dp back to see if it was updated
        dp_obj = self.dpsc_cli.read_data_product(dp_id)
        self.assertEquals(dp_obj.description,'the very first dp')

        # test extension
        extended_product = self.dpsc_cli.get_data_product_extension(dp_id)
        self.assertEqual(dp_id, extended_product._id)
        self.assertEqual(ComputedValueAvailability.PROVIDED,
                         extended_product.computed.product_download_size_estimated.status)
        self.assertEqual(0, extended_product.computed.product_download_size_estimated.value)

        self.assertEqual(ComputedValueAvailability.PROVIDED,
                         extended_product.computed.parameters.status)
        #log.debug("test_create_data_product: parameters %s" % extended_product.computed.parameters.value)

        # now 'delete' the data product
        log.debug("deleting data product: %s" % dp_id)
        self.dpsc_cli.delete_data_product(dp_id)
        self.dpsc_cli.force_delete_data_product(dp_id)

        # now try to get the deleted dp object
        with self.assertRaises(NotFound):
            dp_obj = self.dpsc_cli.read_data_product(dp_id)

        # Get the events corresponding to the data product
        ret = self.unsc.get_recent_events(resource_id=dp_id)
        events = ret.value

        for event in events:
            log.debug("event time: %s" % event.ts_created)

#        self.assertTrue(len(events) > 0)

    def test_data_product_stream_def(self):
        pdict_id = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict', id_only=True)
        ctd_stream_def_id = self.pubsubcli.create_stream_definition(name='Simulated CTD data', parameter_dictionary_id=pdict_id)

        tdom, sdom = time_series_domain()

        sdom = sdom.dump()
        tdom = tdom.dump()



        dp_obj = IonObject(RT.DataProduct,
            name='DP1',
            description='some new dp',
            temporal_domain = tdom,
            spatial_domain = sdom)
        dp_id = self.dpsc_cli.create_data_product(data_product= dp_obj,
            stream_definition_id=ctd_stream_def_id)

        stream_def_id = self.dpsc_cli.get_data_product_stream_definition(dp_id)
        self.assertEquals(ctd_stream_def_id, stream_def_id)



    def test_activate_suspend_data_product(self):

        #------------------------------------------------------------------------------------------------
        # create a stream definition for the data from the ctd simulator
        #------------------------------------------------------------------------------------------------
        pdict_id = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict', id_only=True)
        ctd_stream_def_id = self.pubsubcli.create_stream_definition(name='Simulated CTD data', parameter_dictionary_id=pdict_id)
        log.debug("Created stream def id %s" % ctd_stream_def_id)

        #------------------------------------------------------------------------------------------------
        # test creating a new data product with a stream definition
        #------------------------------------------------------------------------------------------------
        # Construct temporal and spatial Coordinate Reference System objects
        tdom, sdom = time_series_domain()

        sdom = sdom.dump()
        tdom = tdom.dump()



        dp_obj = IonObject(RT.DataProduct,
            name='DP1',
            description='some new dp',
            temporal_domain = tdom,
            spatial_domain = sdom)

        log.debug("Created an IonObject for a data product: %s" % dp_obj)

        #------------------------------------------------------------------------------------------------
        # Create the data product
        #------------------------------------------------------------------------------------------------

        dp_id = self.dpsc_cli.create_data_product(data_product= dp_obj,
            stream_definition_id=ctd_stream_def_id)

        #------------------------------------------------------------------------------------------------
        # test activate and suspend data product persistence
        #------------------------------------------------------------------------------------------------
        self.dpsc_cli.activate_data_product_persistence(dp_id)
        
        dp_obj = self.dpsc_cli.read_data_product(dp_id)
        self.assertIsNotNone(dp_obj)

        dataset_ids, _ = self.rrclient.find_objects(subject=dp_id, predicate=PRED.hasDataset, id_only=True)
        if not dataset_ids:
            raise NotFound("Data Product %s dataset does not exist" % str(dp_id))
        self.get_datastore(dataset_ids[0])


        # Check that the streams associated with the data product are being persisted
        stream_ids, _ =  self.rrclient.find_objects(dp_id,PRED.hasStream,RT.Stream,True)
        for stream_id in stream_ids:
            self.assertTrue(self.ingestclient.is_persisted(stream_id))

        #--------------------------------------------------------------------------------
        # Now get the data in one chunk using an RPC call to retrieve
        #--------------------------------------------------------------------------------

        replay_data = self.data_retriever.retrieve(dataset_ids[0])
        self.assertIsInstance(replay_data, Granule)

        log.debug("The data retriever was able to replay the dataset that was attached to the data product "
                  "we wanted to be persisted. Therefore the data product was indeed persisted with "
                  "otherwise we could not have retrieved its dataset using the data retriever. Therefore "
                  "this demonstration shows that L4-CI-SA-RQ-267 is satisfied: 'Data product management shall persist data products'")

        data_product_object = self.rrclient.read(dp_id)
        self.assertEquals(data_product_object.name,'DP1')
        self.assertEquals(data_product_object.description,'some new dp')

        log.debug("Towards L4-CI-SA-RQ-308: 'Data product management shall persist data product metadata'. "
                  " Attributes in create for the data product obj, name= '%s', description='%s', match those of object from the "
                  "resource registry, name='%s', desc='%s'" % (dp_obj.name, dp_obj.description,data_product_object.name,
                                                           data_product_object.description))

        #------------------------------------------------------------------------------------------------
        # test suspend data product persistence
        #------------------------------------------------------------------------------------------------
        self.dpsc_cli.suspend_data_product_persistence(dp_id)

        self.dpsc_cli.force_delete_data_product(dp_id)
        # now try to get the deleted dp object

        with self.assertRaises(NotFound):
            dp_obj = self.rrclient.read(dp_id)