Example #1
    def get_editable_coverage(self, dataset_id):
        sid = self.get_stream_id(dataset_id)

        # Check if we already have the coverage
        if sid in self._paused_streams:
            cov = self._w_covs[sid]
            # If it's not closed, return it
            if not cov.closed:
                return cov
            # Otherwise, remove it from self._w_covs and carry on
            del self._w_covs[sid]

        self.pause_ingestion(sid)
        if not self._context_managed:
            warn_user(
                'Warning: Coverages will remain open until they are closed or go out of scope - '
                'be sure to close coverage instances when you are finished working with them or call self.clean_up(w_covs=True)'
            )
        try:
            self._w_covs[sid] = DatasetManagementService._get_simplex_coverage(
                dataset_id, mode='w')
            return self._w_covs[sid]
        except:
            self.resume_ingestion(sid)
            raise
    def test_retrieve_cache(self):
        DataRetrieverService._refresh_interval = 1
        datasets = [self.make_simple_dataset() for i in xrange(10)]
        for stream_id, route, stream_def_id, dataset_id in datasets:
            coverage = DatasetManagementService._get_simplex_coverage(dataset_id, mode='a')
            coverage.insert_timesteps(10)
            coverage.set_parameter_values('time', np.arange(10))
            coverage.set_parameter_values('temp', np.arange(10))

        # Verify cache hit and refresh
        dataset_ids = [i[3] for i in datasets]
        self.assertTrue(dataset_ids[0] not in DataRetrieverService._retrieve_cache)
        DataRetrieverService._get_coverage(dataset_ids[0]) # Hit the cache
        cov, age = DataRetrieverService._retrieve_cache[dataset_ids[0]]
        # Verify that it was hit and it's now in there
        self.assertTrue(dataset_ids[0] in DataRetrieverService._retrieve_cache)

        gevent.sleep(DataRetrieverService._refresh_interval + 0.2)

        DataRetrieverService._get_coverage(dataset_ids[0]) # Hit the cache
        cov, age2 = DataRetrieverService._retrieve_cache[dataset_ids[0]]
        self.assertTrue(age2 != age)

        for dataset_id in dataset_ids:
            DataRetrieverService._get_coverage(dataset_id)
        
        self.assertTrue(dataset_ids[0] not in DataRetrieverService._retrieve_cache)

        stream_id, route, stream_def, dataset_id = datasets[0]
        self.start_ingestion(stream_id, dataset_id)
        DataRetrieverService._get_coverage(dataset_id)
        
        self.assertTrue(dataset_id in DataRetrieverService._retrieve_cache)
    def test_coverage_recovery(self):
        # Create the coverage
        dp_id, stream_id, route, stream_def_id, dataset_id = self.load_data_product()
        self.populate_dataset(dataset_id, 36)
        dset = self.dataset_management.read_dataset(dataset_id)
        dprod = self.dpsc_cli.read_data_product(dp_id)
        cov = DatasetManagementService._get_simplex_coverage(dataset_id)
        cov_pth = cov.persistence_dir
        cov.close()

        # Analyze the valid coverage
        dr = CoverageDoctor(cov_pth, dprod, dset)
        dr_result = dr.analyze()

        # Get original values (mock)
        orig_cov = AbstractCoverage.load(cov_pth)
        time_vals_orig = orig_cov.get_time_values()

        # TODO: Destroy the metadata files

        # TODO: RE-analyze coverage

        # TODO: Should be corrupt, take action to repair if so

        # Repair the metadata files
        dr.repair_metadata()

        # TODO: Re-analyze fixed coverage

        fixed_cov = AbstractCoverage.load(cov_pth)
        self.assertIsInstance(fixed_cov, AbstractCoverage)

        time_vals_fixed = fixed_cov.get_time_values()
        self.assertTrue(np.array_equiv(time_vals_orig, time_vals_fixed))
    def test_coverage_recovery(self):
        # Create the coverage
        dp_id, stream_id, route, stream_def_id, dataset_id = self.load_data_product()
        self.populate_dataset(dataset_id, 36)
        dset = self.dataset_management.read_dataset(dataset_id)
        dprod = self.dpsc_cli.read_data_product(dp_id)
        cov = DatasetManagementService._get_simplex_coverage(dataset_id)
        cov_pth = cov.persistence_dir
        cov.close()

        # Analyze the valid coverage
        dr = CoverageDoctor(cov_pth, dprod, dset)

        dr_result = dr.analyze()

        # TODO: Turn these into meaningful Asserts
        self.assertEqual(len(dr_result.get_brick_corruptions()), 0)
        self.assertEqual(len(dr_result.get_brick_size_ratios()), 8)
        self.assertEqual(len(dr_result.get_corruptions()), 0)
        self.assertEqual(len(dr_result.get_master_corruption()), 0)
        self.assertEqual(len(dr_result.get_param_corruptions()), 0)
        self.assertEqual(len(dr_result.get_param_size_ratios()), 64)
        self.assertEqual(len(dr_result.get_master_size_ratio()), 1)
        self.assertEqual(len(dr_result.get_size_ratios()), 73)
        self.assertEqual(dr_result.master_status[1], 'NORMAL')

        self.assertFalse(dr_result.is_corrupt)
        self.assertEqual(dr_result.param_file_count, 64)
        self.assertEqual(dr_result.brick_file_count, 8)
        self.assertEqual(dr_result.total_file_count, 73)

        # Get original values (mock)
        orig_cov = AbstractCoverage.load(cov_pth)
        time_vals_orig = orig_cov.get_time_values()
        orig_cov.close()

        # Corrupt the Master File
        fo = open(cov._persistence_layer.master_manager.file_path, "wb")
        fo.write('Junk')
        fo.close()
        # Corrupt the lon Parameter file
        fo = open(cov._persistence_layer.parameter_metadata['lon'].file_path, "wb")
        fo.write('Junk')
        fo.close()

        corrupt_res = dr.analyze(reanalyze=True)
        self.assertTrue(corrupt_res.is_corrupt)

        # Repair the metadata files
        dr.repair(reanalyze=True)

        fixed_res = dr.analyze(reanalyze=True)
        self.assertFalse(fixed_res.is_corrupt)

        fixed_cov = AbstractCoverage.load(cov_pth)
        self.assertIsInstance(fixed_cov, AbstractCoverage)

        time_vals_fixed = fixed_cov.get_time_values()
        fixed_cov.close()
        self.assertTrue(np.array_equiv(time_vals_orig, time_vals_fixed))
    def test_coverage_types(self):
        # Make a simple dataset and start ingestion, pretty standard stuff.
        ctd_stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
        cov = DatasetManagementService._get_coverage(dataset_id=dataset_id)
        self.assertIsInstance(cov, ViewCoverage)

        cov = DatasetManagementService._get_simplex_coverage(dataset_id=dataset_id)
        self.assertIsInstance(cov, SimplexCoverage)
Example #6
    def get_editable_coverage(self, dataset_id):
        sid = self.get_stream_id(dataset_id)
        if sid in self._paused_streams:
            return self._w_covs[sid]

        self.pause_ingestion(sid)
        self._w_covs[sid] = DatasetManagementService._get_simplex_coverage(dataset_id, mode='w')
        return self._w_covs[sid]
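Unlike Example #1, this variant of get_editable_coverage does not resume ingestion if opening the write coverage fails, so the stream can be left paused on error. A minimal sketch of the safer pattern, assuming hypothetical pause/resume/open callables rather than the real service API:

def open_while_paused(resource_id, pause, resume, open_resource):
    # Illustrative only: pause/resume/open_resource stand in for
    # pause_ingestion, resume_ingestion, and _get_simplex_coverage.
    pause(resource_id)
    try:
        return open_resource(resource_id)
    except Exception:
        # Undo the pause so ingestion is not left stuck when the open fails.
        resume(resource_id)
        raise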
Example #7
    def test_retrieve_cache(self):
        DataRetrieverService._refresh_interval = 1
        datasets = [self.make_simple_dataset() for i in xrange(10)]
        for stream_id, route, stream_def_id, dataset_id in datasets:
            coverage = DatasetManagementService._get_simplex_coverage(
                dataset_id)
            coverage.insert_timesteps(10)
            coverage.set_parameter_values('time', np.arange(10))
            coverage.set_parameter_values('temp', np.arange(10))

        # Verify cache hit and refresh
        dataset_ids = [i[3] for i in datasets]
        self.assertTrue(
            dataset_ids[0] not in DataRetrieverService._retrieve_cache)
        DataRetrieverService._get_coverage(dataset_ids[0])  # Hit the cache
        cov, age = DataRetrieverService._retrieve_cache[dataset_ids[0]]
        # Verify that it was hit and it's now in there
        self.assertTrue(dataset_ids[0] in DataRetrieverService._retrieve_cache)

        gevent.sleep(DataRetrieverService._refresh_interval + 0.2)

        DataRetrieverService._get_coverage(dataset_ids[0])  # Hit the cache
        cov, age2 = DataRetrieverService._retrieve_cache[dataset_ids[0]]
        self.assertTrue(age2 != age)

        for dataset_id in dataset_ids:
            DataRetrieverService._get_coverage(dataset_id)

        self.assertTrue(
            dataset_ids[0] not in DataRetrieverService._retrieve_cache)

        stream_id, route, stream_def, dataset_id = datasets[0]
        self.start_ingestion(stream_id, dataset_id)
        DataRetrieverService._get_coverage(dataset_id)

        self.assertTrue(dataset_id in DataRetrieverService._retrieve_cache)

        DataRetrieverService._refresh_interval = 100
        self.publish_hifi(stream_id, route, 1)
        self.wait_until_we_have_enough_granules(dataset_id, data_size=20)

        event = gevent.event.Event()
        with gevent.Timeout(20):
            while not event.wait(0.1):
                if dataset_id not in DataRetrieverService._retrieve_cache:
                    event.set()

        self.assertTrue(event.is_set())
    def populate_dataset(self, dataset_id, hours):
        import time
        cov = DatasetManagementService._get_simplex_coverage(dataset_id=dataset_id, mode='w')
        # rcov = vcov.reference_coverage
        # cov = AbstractCoverage.load(rcov.persistence_dir, mode='a')
        dt = hours * 3600

        cov.insert_timesteps(dt)
        now = time.time()
        cov.set_parameter_values('time', np.arange(now - dt, now) + 2208988800)
        cov.set_parameter_values('temp', np.sin(np.arange(dt) * 2 * np.pi / 60))
        cov.set_parameter_values('lat', np.zeros(dt))
        cov.set_parameter_values('lon', np.zeros(dt))

        cov.close()
        gevent.sleep(1)
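populate_dataset treats every hour as 3600 one-second timesteps, so the 36-hour fill used by test_coverage_recovery above inserts 129,600 samples; the time values are then shifted by 2208988800 to convert Unix seconds to the NTP epoch. A quick check of that arithmetic:

hours = 36
dt = hours * 3600          # one timestep per second
assert dt == 129600        # samples inserted for a 36-hour fill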
Example #9
    def populate_dataset(self, dataset_id, hours):
        import time
        cov = DatasetManagementService._get_simplex_coverage(dataset_id=dataset_id, mode='w')
        # rcov = vcov.reference_coverage
        # cov = AbstractCoverage.load(rcov.persistence_dir, mode='a')
        dt = hours * 3600

        cov.insert_timesteps(dt)
        now = time.time()
        cov.set_parameter_values('time', np.arange(now - dt, now) + 2208988800)
        cov.set_parameter_values('temp', np.sin(np.arange(dt) * 2 * np.pi / 60))
        cov.set_parameter_values('lat', np.zeros(dt))
        cov.set_parameter_values('lon', np.zeros(dt))

        cov.close()
        gevent.sleep(1)
Example #10
    def test_retrieve_cache(self):
        DataRetrieverService._refresh_interval = 1
        datasets = [self.make_simple_dataset() for i in xrange(10)]
        for stream_id, route, stream_def_id, dataset_id in datasets:
            coverage = DatasetManagementService._get_simplex_coverage(dataset_id)
            coverage.insert_timesteps(10)
            coverage.set_parameter_values('time', np.arange(10))
            coverage.set_parameter_values('temp', np.arange(10))

        # Verify cache hit and refresh
        dataset_ids = [i[3] for i in datasets]
        self.assertTrue(dataset_ids[0] not in DataRetrieverService._retrieve_cache)
        DataRetrieverService._get_coverage(dataset_ids[0]) # Hit the cache
        cov, age = DataRetrieverService._retrieve_cache[dataset_ids[0]]
        # Verify that it was hit and it's now in there
        self.assertTrue(dataset_ids[0] in DataRetrieverService._retrieve_cache)

        gevent.sleep(DataRetrieverService._refresh_interval + 0.2)

        DataRetrieverService._get_coverage(dataset_ids[0]) # Hit the cache
        cov, age2 = DataRetrieverService._retrieve_cache[dataset_ids[0]]
        self.assertTrue(age2 != age)

        for dataset_id in dataset_ids:
            DataRetrieverService._get_coverage(dataset_id)
        
        self.assertTrue(dataset_ids[0] not in DataRetrieverService._retrieve_cache)

        stream_id, route, stream_def, dataset_id = datasets[0]
        self.start_ingestion(stream_id, dataset_id)
        DataRetrieverService._get_coverage(dataset_id)
        
        self.assertTrue(dataset_id in DataRetrieverService._retrieve_cache)

        DataRetrieverService._refresh_interval = 100
        self.publish_hifi(stream_id,route,1)
        self.wait_until_we_have_enough_granules(dataset_id, data_size=20)

        event = gevent.event.Event()
        with gevent.Timeout(20):
            while not event.wait(0.1):
                if dataset_id not in DataRetrieverService._retrieve_cache:
                    event.set()


        self.assertTrue(event.is_set())
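The final block above polls the retrieve cache with gevent until the entry is evicted or a 20-second timeout fires. A standalone sketch of that polling pattern, assuming a hypothetical condition callable:

import gevent
import gevent.event

def wait_for(condition, timeout=20, interval=0.1):
    # Poll `condition` until it returns True or `timeout` seconds elapse.
    # Returns True if the condition was met before the timeout, else False.
    done = gevent.event.Event()
    try:
        with gevent.Timeout(timeout):
            while not done.wait(interval):
                if condition():
                    done.set()
    except gevent.Timeout:
        return False
    return done.is_set()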
    def get_coverage(self, stream_id):
        '''
        Memoization (LRU) of _get_coverage
        '''
        try:
            result = self._coverages.pop(stream_id)
        except KeyError:
            dataset_id = self.get_dataset(stream_id)
            if dataset_id is None:
                return None
            result = DatasetManagementService._get_simplex_coverage(dataset_id, mode='a')
            if result is None:
                return None
            if len(self._coverages) >= self.CACHE_LIMIT:
                k, coverage = self._coverages.popitem(0)
                coverage.close(timeout=5)
        self._coverages[stream_id] = result
        return result
Example #12
    def get_coverage(self, stream_id):
        '''
        Memoization (LRU) of _get_coverage
        '''
        try:
            result = self._coverages.pop(stream_id)
        except KeyError:
            dataset_id = self.get_dataset(stream_id)
            if dataset_id is None:
                return None
            result = DatasetManagementService._get_simplex_coverage(dataset_id, mode='a')
            if result is None:
                return None
            if len(self._coverages) >= self.CACHE_LIMIT:
                k, coverage = self._coverages.popitem(0)
                coverage.close(timeout=5)
        self._coverages[stream_id] = result
        return result
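get_coverage above uses an OrderedDict as a small LRU cache: popping a key and re-inserting it marks it most-recently-used, and popitem(0) evicts the oldest entry once CACHE_LIMIT is reached. A generic sketch of the same idea, with an illustrative limit and a hypothetical loader in place of the real service call:

from collections import OrderedDict

CACHE_LIMIT = 5
_cache = OrderedDict()

def memoized_get(key, load):
    try:
        value = _cache.pop(key)            # hit: remove so the re-insert below marks it MRU
    except KeyError:
        value = load(key)                  # miss: build the value
        if len(_cache) >= CACHE_LIMIT:
            _cache.popitem(last=False)     # evict the least-recently-used entry
    _cache[key] = value                    # (re-)insert as most-recently-used
    return value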
Example #13
    def test_correct_time(self):

        # There are 2208988800 seconds between Jan 1 1900 and Jan 1 1970, i.e. 
        #  the conversion factor between unix and NTP time
        unix_now = np.floor(time.time())
        ntp_now  = unix_now + 2208988800 

        unix_ago = unix_now - 20
        ntp_ago  = unix_ago + 2208988800

        stream_id, route, stream_def_id, dataset_id = self.make_simple_dataset()
        coverage = DatasetManagementService._get_simplex_coverage(dataset_id, mode='a')
        coverage.insert_timesteps(20)
        coverage.set_parameter_values('time', np.arange(ntp_ago,ntp_now))
        
        temporal_bounds = self.dataset_management.dataset_temporal_bounds(dataset_id)

        self.assertTrue( np.abs(temporal_bounds[0] - unix_ago) < 2)
        self.assertTrue( np.abs(temporal_bounds[1] - unix_now) < 2)
    def get_editable_coverage(self, dataset_id):
        sid = self.get_stream_id(dataset_id)

        # Check if we already have the coverage
        if sid in self._paused_streams:
            cov = self._w_covs[sid]
            # If it's not closed, return it
            if not cov.closed:
                return cov
            # Otherwise, remove it from self._w_covs and carry on
            del self._w_covs[sid]

        self.pause_ingestion(sid)
        if not self._context_managed:
            warn_user('Warning: Coverages will remain open until they are closed or go out of scope - '
                           'be sure to close coverage instances when you are finished working with them or call self.clean_up(w_covs=True)')
        try:
            self._w_covs[sid] = DatasetManagementService._get_simplex_coverage(dataset_id, mode='w')
            return self._w_covs[sid]
        except:
            self.resume_ingestion(sid)
            raise
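test_correct_time in this example relies on the fixed offset of 2208988800 seconds between the NTP epoch (1900-01-01) and the Unix epoch (1970-01-01). A quick round-trip check of that conversion:

import time
import numpy as np

NTP_UNIX_OFFSET = 2208988800   # seconds from 1900-01-01 to 1970-01-01

unix_now = np.floor(time.time())
ntp_now = unix_now + NTP_UNIX_OFFSET            # Unix -> NTP
assert ntp_now - NTP_UNIX_OFFSET == unix_now    # NTP -> Unix round-trip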
Example #15
    def test_replay_pause(self):
        # Get a precompiled parameter dictionary with basic ctd fields
        pdict_id = self.dataset_management.read_parameter_dictionary_by_name('ctd_parsed_param_dict',id_only=True)
        context_ids = self.dataset_management.read_parameter_contexts(pdict_id, id_only=True)

        # Add a field that supports binary data input.
        bin_context = ParameterContext('binary',  param_type=ArrayType())
        context_ids.append(self.dataset_management.create_parameter_context('binary', bin_context.dump()))
        # Add another field that supports dictionary elements.
        rec_context = ParameterContext('records', param_type=RecordType())
        context_ids.append(self.dataset_management.create_parameter_context('records', rec_context.dump()))

        pdict_id = self.dataset_management.create_parameter_dictionary('replay_pdict', parameter_context_ids=context_ids, temporal_context='time')
        

        stream_def_id = self.pubsub_management.create_stream_definition('replay_stream', parameter_dictionary_id=pdict_id)
        replay_stream, replay_route = self.pubsub_management.create_stream('replay', 'xp1', stream_definition_id=stream_def_id)
        dataset_id = self.create_dataset(pdict_id)
        scov = DatasetManagementService._get_simplex_coverage(dataset_id)

        bb = CoverageCraft(scov)
        bb.rdt['time'] = np.arange(100)
        bb.rdt['temp'] = np.random.random(100) + 30
        bb.sync_with_granule()

        DatasetManagementService._persist_coverage(dataset_id, bb.coverage) # This invalidates it for multi-host configurations
        # Set up the subscriber to verify the data
        subscriber = StandaloneStreamSubscriber(self.exchange_space_name, self.validate_granule_subscription)
        xp = self.container.ex_manager.create_xp('xp1')
        self.queue_buffer.append(self.exchange_space_name)
        subscriber.start()
        subscriber.xn.bind(replay_route.routing_key, xp)

        # Set up the replay agent and the client wrapper

        # 1) Define the Replay (dataset and stream to publish on)
        self.replay_id, process_id = self.data_retriever.define_replay(dataset_id=dataset_id, stream_id=replay_stream)
        # 2) Make a client to interact with the process (optionally provide it a process to bind with)
        replay_client = ReplayClient(process_id)
        # 3) Start the agent (launch the process)
        self.data_retriever.start_replay_agent(self.replay_id)
        # 4) Start replaying...
        replay_client.start_replay()
        
        # Wait till we get some granules
        self.assertTrue(self.event.wait(5))
        
        # We got granules, pause the replay, clear the queue and allow the process to finish consuming
        replay_client.pause_replay()
        gevent.sleep(1)
        subscriber.xn.purge()
        self.event.clear()
        
        # Make sure there's no remaining messages being consumed
        self.assertFalse(self.event.wait(1))

        # Resume the replay and wait until we start getting granules again
        replay_client.resume_replay()
        self.assertTrue(self.event.wait(5))
    
        # Stop the replay, clear the queues
        replay_client.stop_replay()
        gevent.sleep(1)
        subscriber.xn.purge()
        self.event.clear()

        # Make sure that it did indeed stop
        self.assertFalse(self.event.wait(1))

        subscriber.stop()
Example #16
    def test_coverage_recovery(self):
        # Create the coverage
        dp_id, stream_id, route, stream_def_id, dataset_id = self.load_data_product()
        self.populate_dataset(dataset_id, 36)
        dset = self.dataset_management.read_dataset(dataset_id)
        dprod = self.dpsc_cli.read_data_product(dp_id)
        cov = DatasetManagementService._get_simplex_coverage(dataset_id)
        cov_pth = cov.persistence_dir
        cov.close()

        num_params = len(cov.list_parameters())
        num_bricks = 8
        total = num_params + num_bricks + 1

        # Analyze the valid coverage
        dr = CoverageDoctor(cov_pth, dprod, dset)

        dr_result = dr.analyze()

        # TODO: Turn these into meaningful Asserts
        self.assertEqual(len(dr_result.get_brick_corruptions()), 0)
        self.assertEqual(len(dr_result.get_brick_size_ratios()), num_bricks)
        self.assertEqual(len(dr_result.get_corruptions()), 0)
        self.assertEqual(len(dr_result.get_master_corruption()), 0)
        self.assertEqual(len(dr_result.get_param_corruptions()), 0)
        self.assertEqual(len(dr_result.get_param_size_ratios()), num_params)
        self.assertEqual(len(dr_result.get_master_size_ratio()), 1)
        self.assertEqual(len(dr_result.get_size_ratios()), total)
        self.assertEqual(dr_result.master_status[1], 'NORMAL')

        self.assertFalse(dr_result.is_corrupt)
        self.assertEqual(dr_result.param_file_count, num_params)
        self.assertEqual(dr_result.brick_file_count, num_bricks)
        self.assertEqual(dr_result.total_file_count, total)

        # Get original values (mock)
        orig_cov = AbstractCoverage.load(cov_pth)
        time_vals_orig = orig_cov.get_time_values()
        orig_cov.close()

        # Corrupt the Master File
        fo = open(cov._persistence_layer.master_manager.file_path, "wb")
        fo.write('Junk')
        fo.close()
        # Corrupt the lon Parameter file
        fo = open(cov._persistence_layer.parameter_metadata['lon'].file_path,
                  "wb")
        fo.write('Junk')
        fo.close()

        corrupt_res = dr.analyze(reanalyze=True)
        self.assertTrue(corrupt_res.is_corrupt)

        # Repair the metadata files
        dr.repair(reanalyze=True)

        fixed_res = dr.analyze(reanalyze=True)
        self.assertFalse(fixed_res.is_corrupt)

        fixed_cov = AbstractCoverage.load(cov_pth)
        self.assertIsInstance(fixed_cov, AbstractCoverage)

        time_vals_fixed = fixed_cov.get_time_values()
        fixed_cov.close()
        self.assertTrue(np.array_equiv(time_vals_orig, time_vals_fixed))