Example #1
    def tearDown(self):
        """
        Cleanup. Delete Subscription, Stream, Process Definition
        """

        for fname in self.fnames:
            FileSystem.unlink(fname)
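
This tearDown walks the temporary file names the test recorded in self.fnames and removes each one with FileSystem.unlink. A minimal standalone sketch of the same cleanup pattern, assuming the pyon helper behaves like os.remove and that already-missing files should simply be ignored:

import os

class CleanupExample(object):
    # hypothetical stand-in for the test case above
    fnames = []

    def tearDown(self):
        for fname in self.fnames:
            try:
                os.remove(fname)   # FileSystem.unlink presumably does something similar
            except OSError:
                pass               # file was never created or is already gone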
Example #3
        def create_known(dataset_name, rootgrp_name, grp_name):
            """
            A known array to compare against during tests
            """

            known_array = numpy.ones((10,20))

            filename = FileSystem.get_url(FS.TEMP,random_name(), ".hdf5")

            # Write an hdf file with known values to compare against
            h5pyfile = h5py.File(filename, mode = 'w', driver='core')
            grp = h5pyfile.create_group(rootgrp_name)
            subgrp = grp.create_group(grp_name)
            dataset = subgrp.create_dataset(dataset_name, known_array.shape, known_array.dtype.str, maxshape=(None,None))
            dataset.write_direct(known_array)
            h5pyfile.close()

            # convert the hdf file into a binary string
            f = open(filename, mode='rb')
            # read the binary string representation of the file
            known_hdf_as_string = f.read() # this is a known string to compare against during tests
            f.close()
            # cleaning up
            FileSystem.unlink(f.name)

            return known_array, known_hdf_as_string
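
This helper writes a known array to a throwaway HDF5 file, reads the file back as a byte string for later comparisons, and deletes it. A short sketch of the same idea using only h5py, numpy, and the standard library (tempfile/os stand in here for the FileSystem/FS helpers, which is an assumption about their role):

import os
import tempfile

import h5py
import numpy

def create_known_bytes(dataset_name, rootgrp_name, grp_name):
    known_array = numpy.ones((10, 20))

    # temporary file path in place of FileSystem.get_url(FS.TEMP, ...)
    fd, filename = tempfile.mkstemp(suffix='.hdf5')
    os.close(fd)

    # write the known values
    h5pyfile = h5py.File(filename, mode='w', driver='core')
    try:
        subgrp = h5pyfile.create_group(rootgrp_name).create_group(grp_name)
        dataset = subgrp.create_dataset(dataset_name, known_array.shape,
                                        known_array.dtype.str,
                                        maxshape=(None, None))
        dataset.write_direct(known_array)
    finally:
        h5pyfile.close()

    # read the bytes back and clean up
    with open(filename, 'rb') as f:
        known_hdf_as_string = f.read()
    os.remove(filename)
    return known_array, known_hdf_as_string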
Example #4
        def create_known(dataset_name, rootgrp_name, grp_name):
            """
            A known array to compare against during tests
            """

            known_array = numpy.random.rand(10, 20)

            filename = FileSystem.get_url(FS.TEMP, random_name(), ".hdf5")

            # Write an hdf file with known values to compare against
            h5pyfile = h5py.File(filename, mode='w', driver='core')
            grp = h5pyfile.create_group(rootgrp_name)
            subgrp = grp.create_group(grp_name)
            dataset = subgrp.create_dataset(dataset_name,
                                            known_array.shape,
                                            known_array.dtype.str,
                                            compression='gzip',
                                            compression_opts=4,
                                            maxshape=(None, None))

            dataset.write_direct(known_array)
            h5pyfile.close()

            # convert the hdf file into a binary string
            f = open(filename, mode='rb')
            # read the binary string representation of the file
            known_hdf_as_string = f.read()  # this is a known string to compare against during tests
            f.close()
            # cleaning up
            FileSystem.unlink(f.name)

            return known_array, known_hdf_as_string
Example #5
        def sub_listen(msg, headers):

            assertions(isinstance(msg, StreamGranuleContainer),
                       'replayed message is not a granule.')
            hdf_string = msg.identifiables[data_stream_id].values
            sha1 = hashlib.sha1(hdf_string).hexdigest().upper()
            assertions(sha1 == msg.identifiables[encoding_id].sha1,
                       'Checksum failed.')
            assertions(
                msg.identifiables[element_count_id].value == 1,
                'record replay count is incorrect %d.' %
                msg.identifiables[element_count_id].value)
            output_file = FileSystem.mktemp()
            output_file.write(msg.identifiables[data_stream_id].values)
            output_file_path = output_file.name
            output_file.close()
            output_vectors = acquire_data([output_file_path], fields, 2).next()
            for field in fields:
                comparison = (input_vectors[field]['values'] ==
                              output_vectors[field]['values'])
                assertions(
                    comparison.all(), 'vector mismatch: %s vs %s' %
                    (input_vectors[field]['values'],
                     output_vectors[field]['values']))
            FileSystem.unlink(output_file_path)
            ar.set(True)
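
The listener above checks the replayed granule two ways: it recomputes the SHA-1 of the HDF byte string and compares it to the checksum stored on the encoding identifiable, then writes the bytes to a temporary file and compares the decoded vectors field by field. A small sketch of just the checksum step (the helper name is illustrative only):

import hashlib

def checksum_matches(hdf_string, expected_sha1):
    # SHA-1 of the replayed HDF bytes, uppercased to match the stored value
    return hashlib.sha1(hdf_string).hexdigest().upper() == expected_sha1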
Example #6
    def _get_time_index(self, granule, timeval):
        '''
        @brief Obtains the index where a time's value is
        @param granule must be a complete dataset (hdf_string provided)
        @param timeval the vector value
        @return Index value for timeval or closest approx such that timeval is IN the subset
        '''
        assert isinstance(granule,
                          StreamGranuleContainer), 'object is not a granule.'
        assert granule.identifiables[
            self.data_stream_id].values, 'hdf_string is not provided.'

        hdf_string = granule.identifiables[self.data_stream_id].values
        file_path = self._get_hdf_from_string(hdf_string)

        #-------------------------------------------------------------------------------------
        # Determine the field_id for the temporal coordinate vector (aka time)
        #-------------------------------------------------------------------------------------

        time_field = self.definition.identifiables[
            self.time_id].coordinate_ids[0]
        value_path = granule.identifiables[
            time_field].values_path or self.definition.identifiables[
                time_field].values_path
        record_count = granule.identifiables[self.element_count_id].value

        #-------------------------------------------------------------------------------------
        # Go through the time vector and get the indexes that correspond to the timeval
        # It will find the index i such that
        # time_vector[i] <= timeval < time_vector[i+1]
        #-------------------------------------------------------------------------------------

        var_name = value_path.split('/').pop()
        res = acquire_data([file_path], [var_name], record_count).next()
        time_vector = res[var_name]['values']
        retval = 0
        for i in xrange(len(time_vector)):
            if time_vector[i] == timeval:
                retval = i
                break
            elif i == 0 and time_vector[i] > timeval:
                retval = i
                break
            elif (i + 1) < len(time_vector):  # not last val
                if time_vector[i] < timeval and time_vector[i + 1] > timeval:
                    retval = i
                    break
            else:  # last val
                retval = i
                break
        FileSystem.unlink(file_path)
        return retval
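
The loop walks the time vector linearly and returns the index i for which time_vector[i] <= timeval < time_vector[i+1], clamping to the first or last index at the boundaries. Assuming the time vector is sorted in ascending order, the same lookup can be sketched with bisect:

from bisect import bisect_right

def time_index(time_vector, timeval):
    # rightmost index whose value is <= timeval, clamped to [0, len - 1]
    if len(time_vector) == 0:
        return 0
    i = bisect_right(list(time_vector), timeval) - 1
    return max(0, min(i, len(time_vector) - 1))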
Example #7
    def _slice(self, granule, slice_):
        '''
        @brief Creates a granule which is a slice of the granule parameter
        @param granule the superset
        @param slice_ The slice values for which to create the granule
        @return Crafted subset granule of the parameter granule.
        '''
        retval = copy.deepcopy(granule)
        fields = self._list_data(self.definition, granule)
        record_count = slice_.stop - slice_.start
        assert record_count > 0, 'slice is malformed'
        pairs = self._pair_up(granule)
        var_names = list([i[0]
                          for i in pairs])  # Get the var_names from the pairs
        log.debug('var_names: %s', var_names)
        file_path = self._get_hdf_from_string(
            granule.identifiables[self.data_stream_id].values)
        codec = HDFEncoder()
        vectors = acquire_data([file_path], var_names, record_count,
                               slice_).next()

        for row, value in vectors.iteritems():
            vp = self._find_vp(pairs, row)
            # Determine the range_id reverse dictionary lookup
            #@todo: improve this pattern
            for field, path in fields.iteritems():
                if vp == path:
                    range_id = field
                    break
            bounds_id = retval.identifiables[range_id].bounds_id
            # Recalculate the bounds for this field and update the granule
            range = value['range']
            retval.identifiables[bounds_id].value_pair[0] = float(range[0])
            retval.identifiables[bounds_id].value_pair[1] = float(range[1])
            codec.add_hdf_dataset(vp, value['values'])
            record_count = len(value['values'])
            #----- DEBUGGING ---------
            log.debug('slice- row: %s', row)
            log.debug('slice- value_path: %s', vp)
            log.debug('slice- range_id: %s', range_id)
            log.debug('slice- bounds_id: %s', bounds_id)
            log.debug('slice- limits: %s', value['range'])
            #-------------------------

        retval.identifiables[self.element_count_id].value = record_count
        hdf_string = codec.encoder_close()
        self._patch_granule(retval, hdf_string)
        FileSystem.unlink(file_path)
        return retval
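
For every variable in the slice, the code re-encodes the sliced values and overwrites the corresponding bounds value_pair with a new (min, max), which acquire_data appears to report as value['range']. A tiny sketch of recomputing such bounds directly from a sliced array (illustrative only):

import numpy

def sliced_bounds(values, slice_):
    # bounds of one field restricted to the requested slice
    window = numpy.asarray(values)[slice_]
    return float(window.min()), float(window.max())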
Example #8
    def _get_time_index(self, granule, timeval):
        '''
        @brief Obtains the index where a time's value is
        @param granule must be a complete dataset (hdf_string provided)
        @param timeval the vector value
        @return Index value for timeval or closest approx such that timeval is IN the subset
        '''
        assert isinstance(granule, StreamGranuleContainer), 'object is not a granule.'
        assert granule.identifiables[self.data_stream_id].values, 'hdf_string is not provided.'

        hdf_string = granule.identifiables[self.data_stream_id].values
        file_path = self._get_hdf_from_string(hdf_string)

        #-------------------------------------------------------------------------------------
        # Determine the field_id for the temporal coordinate vector (aka time)
        #-------------------------------------------------------------------------------------

        time_field = self.definition.identifiables[self.time_id].coordinate_ids[0]
        value_path = granule.identifiables[time_field].values_path or self.definition.identifiables[time_field].values_path
        record_count = granule.identifiables[self.element_count_id].value

        #-------------------------------------------------------------------------------------
        # Go through the time vector and get the indexes that correspond to the timeval
        # It will find the index i such that
        # time_vector[i] <= timeval < time_vector[i+1]
        #-------------------------------------------------------------------------------------


        var_name = value_path.split('/').pop()
        res = acquire_data([file_path], [var_name], record_count).next()
        time_vector = res[var_name]['values']
        retval = 0
        for i in xrange(len(time_vector)):
            if time_vector[i] == timeval:
                retval = i
                break
            elif i==0 and time_vector[i] > timeval:
                retval = i
                break
            elif (i+1) < len(time_vector): # not last val
                if time_vector[i] < timeval and time_vector[i+1] > timeval:
                    retval = i
                    break
            else: # last val
                retval = i
                break
        FileSystem.unlink(file_path)
        return retval
Example #9
    def _slice(self,granule,slice_):
        '''
        @brief Creates a granule which is a slice of the granule parameter
        @param granule the superset
        @param slice_ The slice values for which to create the granule
        @return Crafted subset granule of the parameter granule.
        '''
        retval = copy.deepcopy(granule)
        fields = self._list_data(self.definition,granule)
        record_count = slice_.stop - slice_.start
        assert record_count > 0, 'slice is malformed'
        pairs = self._pair_up(granule)
        var_names = list([i[0] for i in pairs]) # Get the var_names from the pairs
        log.debug('var_names: %s',var_names)
        file_path = self._get_hdf_from_string(granule.identifiables[self.data_stream_id].values)
        codec = HDFEncoder()
        vectors = acquire_data([file_path], var_names, record_count, slice_).next()

        for row, value in vectors.iteritems():
            vp = self._find_vp(pairs, row)
            # Determine the range_id reverse dictionary lookup
            #@todo: improve this pattern
            for field,path in fields.iteritems():
                if vp==path:
                    range_id = field
                    break
            bounds_id = retval.identifiables[range_id].bounds_id
            # Recalculate the bounds for this field and update the granule
            range = value['range']
            retval.identifiables[bounds_id].value_pair[0] = float(range[0])
            retval.identifiables[bounds_id].value_pair[1] = float(range[1])
            codec.add_hdf_dataset(vp, value['values'])
            record_count = len(value['values'])
            #----- DEBUGGING ---------
            log.debug('slice- row: %s', row)
            log.debug('slice- value_path: %s', vp)
            log.debug('slice- range_id: %s', range_id)
            log.debug('slice- bounds_id: %s', bounds_id)
            log.debug('slice- limits: %s', value['range'])
            #-------------------------


        retval.identifiables[self.element_count_id].value = record_count
        hdf_string = codec.encoder_close()
        self._patch_granule(retval, hdf_string)
        FileSystem.unlink(file_path)
        return retval
Example #10
        def sub_listen(msg, headers):

            assertions(isinstance(msg,StreamGranuleContainer),'replayed message is not a granule.')
            hdf_string = msg.identifiables[data_stream_id].values
            sha1 = hashlib.sha1(hdf_string).hexdigest().upper()
            assertions(sha1 == msg.identifiables[encoding_id].sha1,'Checksum failed.')
            assertions(msg.identifiables[element_count_id].value==1, 'record replay count is incorrect %d.' % msg.identifiables[element_count_id].value)
            output_file = FileSystem.mktemp()
            output_file.write(msg.identifiables[data_stream_id].values)
            output_file_path = output_file.name
            output_file.close()
            output_vectors = acquire_data([output_file_path],fields,2).next()
            for field in fields:
                comparison = (input_vectors[field]['values']==output_vectors[field]['values'])
                assertions(comparison.all(), 'vector mismatch: %s vs %s' %
                                             (input_vectors[field]['values'],output_vectors[field]['values']))
            FileSystem.unlink(output_file_path)
            ar.set(True)
Example #11
    def hdf_to_string(self):
        """
        Convert the temporary hdf file holding the data into a binary string. Cleanup by deleting the hdf file and
        return the binary string.

        @retval hdf_string
        """
        # Return Value
        # ------------
        # hdf_string: ''
        #
        try:
            # open the hdf5 file using python 'open()'
            f = open(self.filename, mode='rb')
            # read the binary string representation of the file
            hdf_string = f.read()
            f.close()
        except IOError:
            log.exception("Error opening binary file for reading out hdfstring in HDFEncoder. ")
            raise HDFEncoderException("Error while trying to open file. ")
        finally:
            FileSystem.unlink(self.filename)
        return hdf_string
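
hdf_to_string reads the temporary HDF file as raw bytes and, success or failure, unlinks it in the finally clause so no temp file is left behind. The same read-then-delete pattern with only the standard library (assuming FileSystem.unlink is essentially a file removal):

import os

def read_and_unlink(path):
    try:
        with open(path, 'rb') as f:
            return f.read()
    finally:
        # remove the temp file whether or not the read succeeded
        if os.path.exists(path):
            os.remove(path)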
Example #12
    def __del__(self):
        # This is dangerous - I don't like implementing del!!!

        # Clean up files!
        FileSystem.unlink(self.filename)
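
As the comment warns, relying on __del__ for cleanup is fragile: it may run late, during interpreter shutdown, or not at all if the object ends up in a reference cycle. A common alternative, sketched here as an illustration rather than the project's actual API, is an explicit close() plus context-manager support:

import os

class TempFileHolder(object):
    # hypothetical wrapper around a temporary file path
    def __init__(self, filename):
        self.filename = filename

    def close(self):
        if self.filename and os.path.exists(self.filename):
            os.remove(self.filename)
        self.filename = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()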
Example #13
    def test_dm_integration(self):
        '''
        test_dm_integration
        Test full DM Services Integration
        '''
        cc = self.container
        assertions = self.assertTrue

        #-----------------------------
        # Copy below here
        #-----------------------------
        pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
        ingestion_management_service = IngestionManagementServiceClient(node=cc.node)
        dataset_management_service = DatasetManagementServiceClient(node=cc.node)
        data_retriever_service = DataRetrieverServiceClient(node=cc.node)
        transform_management_service = TransformManagementServiceClient(node=cc.node)
        process_dispatcher = ProcessDispatcherServiceClient(node=cc.node)

        process_list = []
        datasets = []

        datastore_name = 'test_dm_integration'


        #---------------------------
        # Set up ingestion
        #---------------------------
        # Configure ingestion using eight workers, ingesting to test_dm_integration datastore with the SCIDATA profile
        log.debug('Calling create_ingestion_configuration')
        ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
            exchange_point_id='science_data',
            couch_storage=CouchStorage(datastore_name=datastore_name,datastore_profile='SCIDATA'),
            number_of_workers=8
        )
        #
        ingestion_management_service.activate_ingestion_configuration(
            ingestion_configuration_id=ingestion_configuration_id)

        ctd_stream_def = ctd_stream_definition()

        stream_def_id = pubsub_management_service.create_stream_definition(container=ctd_stream_def, name='Junk definition')


        #---------------------------
        # Set up the producers (CTD Simulators)
        #---------------------------
        # Launch five simulated CTD producers
        for iteration in xrange(5):
            # Make a stream to output on

            stream_id = pubsub_management_service.create_stream(stream_definition_id=stream_def_id)

            #---------------------------
            # Set up the datasets
            #---------------------------
            dataset_id = dataset_management_service.create_dataset(
                stream_id=stream_id,
                datastore_name=datastore_name,
                view_name='datasets/stream_join_granule'
            )
            # Keep track of the datasets
            datasets.append(dataset_id)

            stream_policy_id = ingestion_management_service.create_dataset_configuration(
                dataset_id = dataset_id,
                archive_data = True,
                archive_metadata = True,
                ingestion_configuration_id = ingestion_configuration_id
            )


            producer_definition = ProcessDefinition()
            producer_definition.executable = {
                'module':'ion.processes.data.ctd_stream_publisher',
                'class':'SimpleCtdPublisher'
            }
            configuration = {
                'process':{
                    'stream_id':stream_id,
                    'datastore_name':datastore_name
                }
            }
            procdef_id = process_dispatcher.create_process_definition(process_definition=producer_definition)
            log.debug('LUKE_DEBUG: procdef_id: %s', procdef_id)
            pid = process_dispatcher.schedule_process(process_definition_id=procdef_id, configuration=configuration)


            # Keep track, we'll kill 'em later.
            process_list.append(pid)
        # Get about 4 seconds of data
        time.sleep(4)

        #---------------------------
        # Stop producing data
        #---------------------------

        for process in process_list:
            process_dispatcher.cancel_process(process)

        #----------------------------------------------
        # The replay and the transform, a love story.
        #----------------------------------------------
        # Happy Valentines to the clever coder who catches the above!

        transform_definition = ProcessDefinition()
        transform_definition.executable = {
            'module':'ion.processes.data.transforms.transform_example',
            'class':'TransformCapture'
        }
        transform_definition_id = process_dispatcher.create_process_definition(process_definition=transform_definition)

        dataset_id = datasets.pop() # Just need one for now
        replay_id, stream_id = data_retriever_service.define_replay(dataset_id=dataset_id)

        #--------------------------------------------
        # I'm Selling magazine subscriptions here!
        #--------------------------------------------

        subscription = pubsub_management_service.create_subscription(query=StreamQuery(stream_ids=[stream_id]),
            exchange_name='transform_capture_point')

        #--------------------------------------------
        # Start the transform (capture)
        #--------------------------------------------
        transform_id = transform_management_service.create_transform(
            name='capture_transform',
            in_subscription_id=subscription,
            process_definition_id=transform_definition_id
        )

        transform_management_service.activate_transform(transform_id=transform_id)

        #--------------------------------------------
        # BEGIN REPLAY!
        #--------------------------------------------

        data_retriever_service.start_replay(replay_id=replay_id)

        #--------------------------------------------
        # Lets get some boundaries
        #--------------------------------------------

        bounds = dataset_management_service.get_dataset_bounds(dataset_id=dataset_id)
        assertions('latitude_bounds' in bounds, 'dataset_id: %s' % dataset_id)
        assertions('longitude_bounds' in bounds)
        assertions('pressure_bounds' in bounds)

        #--------------------------------------------
        # Make sure the transform capture worked
        #--------------------------------------------

        time.sleep(3) # Give the other processes up to 3 seconds to catch up


        stats = os.stat(FileSystem.get_url(FS.TEMP,'transform_output'))
        assertions(stats.st_blksize > 0)

        # BEAUTIFUL!

        FileSystem.unlink(FileSystem.get_url(FS.TEMP,'transform_output'))
Example #14
    def test_replay_integration(self):
        '''
        test_replay_integration
        '''
        import numpy as np
        # Keep the import; it's used in the vector comparison below even though PyCharm says it's unused.

        cc = self.container
        XP = self.XP
        assertions = self.assertTrue

        ### Everything below here can be run as a script:
        log.debug('Got it')

        pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
        ingestion_management_service = IngestionManagementServiceClient(node=cc.node)
        dataset_management_service = DatasetManagementServiceClient(node=cc.node)
        data_retriever_service = DataRetrieverServiceClient(node=cc.node)

        datastore_name = 'dm_test_replay_integration'

        producer = Publisher(name=(XP,'stream producer'))

        ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
            exchange_point_id=XP,
            couch_storage=CouchStorage(datastore_name=datastore_name,datastore_profile='SCIDATA'),
            hdf_storage=HdfStorage(),
            number_of_workers=1
        )

        ingestion_management_service.activate_ingestion_configuration(
            ingestion_configuration_id=ingestion_configuration_id
        )

        definition = SBE37_CDM_stream_definition()
        data_stream_id = definition.data_stream_id
        encoding_id = definition.identifiables[data_stream_id].encoding_id
        element_count_id = definition.identifiables[data_stream_id].element_count_id

        stream_def_id = pubsub_management_service.create_stream_definition(
            container=definition
        )
        stream_id = pubsub_management_service.create_stream(
            stream_definition_id=stream_def_id
        )

        dataset_id = dataset_management_service.create_dataset(
            stream_id=stream_id,
            datastore_name=datastore_name,
            view_name='datasets/dataset_by_id'
        )
        ingestion_management_service.create_dataset_configuration(
            dataset_id=dataset_id,
            archive_data=True,
            archive_metadata=True,
            ingestion_configuration_id = ingestion_configuration_id
        )
        definition.stream_resource_id = stream_id

        packet = _create_packet(definition)
        input_file = FileSystem.mktemp()
        input_file.write(packet.identifiables[data_stream_id].values)
        input_file_path = input_file.name
        input_file.close()

        fields=[
            'conductivity',
            'height',
            'latitude',
            'longitude',
            'pressure',
            'temperature',
            'time'
        ]

        input_vectors = acquire_data([input_file_path],fields , 2).next()

        producer.publish(msg=packet, to_name=(XP,'%s.data' % stream_id))

        replay_id, replay_stream_id = data_retriever_service.define_replay(dataset_id)
        ar = gevent.event.AsyncResult()
        def sub_listen(msg, headers):

            assertions(isinstance(msg,StreamGranuleContainer),'replayed message is not a granule.')
            hdf_string = msg.identifiables[data_stream_id].values
            sha1 = hashlib.sha1(hdf_string).hexdigest().upper()
            assertions(sha1 == msg.identifiables[encoding_id].sha1,'Checksum failed.')
            assertions(msg.identifiables[element_count_id].value==1, 'record replay count is incorrect %d.' % msg.identifiables[element_count_id].value)
            output_file = FileSystem.mktemp()
            output_file.write(msg.identifiables[data_stream_id].values)
            output_file_path = output_file.name
            output_file.close()
            output_vectors = acquire_data([output_file_path],fields,2).next()
            for field in fields:
                comparison = (input_vectors[field]['values']==output_vectors[field]['values'])
                assertions(comparison.all(), 'vector mismatch: %s vs %s' %
                                             (input_vectors[field]['values'],output_vectors[field]['values']))
            FileSystem.unlink(output_file_path)
            ar.set(True)

        subscriber = Subscriber(name=(XP,'replay listener'),callback=sub_listen)

        g = gevent.Greenlet(subscriber.listen, binding='%s.data' % replay_stream_id)
        g.start()

        data_retriever_service.start_replay(replay_id)

        ar.get(timeout=10)

        FileSystem.unlink(input_file_path)
Example #15
    def subset(self, granule, coverages):
        '''
        @param granule
        @return dataset subset based on the fields
        '''
        assert isinstance(granule,
                          StreamGranuleContainer), 'object is not a granule.'
        field_ids = self.field_ids
        element_count_id = self.element_count_id

        values_path = list()
        domain_ids = list()
        coverage_ids = list()
        coverages = list(coverages)
        log.debug('Coverages include %s of type %s', coverages,
                  type(coverages))
        #-----------------------------------------------------------------------------------------------------------
        # Iterate through the fields IAW stream definition and check for rangesets and coordinate axes
        #  - If it's a coordinate axis, it belongs regardless of what the client desires. (It's part of the domain)
        #  - If it's a rangeset, make sure that it's part of what the client asked for; if not, discard it
        #-----------------------------------------------------------------------------------------------------------

        for field_id in field_ids:

            range_id = self.definition.identifiables[field_id].range_id

            #-------------------------------------------------------------------------------------
            # Coordinate Axis
            # - Keep track of this in our domains
            # - Add it to the paths we need to grab from the file(s)
            #-------------------------------------------------------------------------------------

            if isinstance(self.definition.identifiables[range_id],
                          CoordinateAxis):
                log.debug('got a domain: %s' % range_id)
                domain_ids.append(field_id)
                if granule.identifiables.has_key(range_id):
                    value_path = granule.identifiables[
                        range_id].values_path or self.definition.identifiables[
                            range_id].values_path
                    values_path.append(value_path)
                else:
                    value_path = self.definition.identifiables[
                        range_id].values_path
                    values_path.append(value_path)
                continue

            #-------------------------------------------------------------------------------------
            # Range Set
            # - If it's part of the coverages we want to keep
            #   - Add it to the list of ranges we're tracking
            #   - Add the value path to the paths we're tracking.
            #-------------------------------------------------------------------------------------

            if isinstance(self.definition.identifiables[range_id], RangeSet):
                # If it's a rangeset, a specified coverage, and the granule has it, add it to the list
                if field_id in coverages:
                    if granule.identifiables.has_key(range_id):
                        log.debug('got a range: %s' % range_id)
                        coverage_ids.append(field_id)
                        if granule.identifiables.has_key(range_id):
                            value_path = granule.identifiables[
                                range_id].values_path or self.definition.identifiables[
                                    range_id].values_path
                            values_path.append(value_path)
                        else:
                            value_path = self.definition.identifiables[
                                range_id].values_path
                            values_path.append(value_path)
                        continue

                # ----
                # We need to track the range and bounds because,
                # you guessed it, we need to update the bounds
                # ----

                range_id = self.definition.identifiables[field_id].range_id
                bounds_id = self.definition.identifiables[range_id].bounds_id

                #---
                # Lastly, if the field is there and we don't want it, we need to strip it
                #---

                if not (field_id in coverages):
                    log.debug('%s doesn\'t belong in %s.', field_id, coverages)
                    log.debug('rebool: %s', bool(field_id in coverages))
                    if granule.identifiables.has_key(range_id):
                        log.debug('Removing %s from granule', range_id)
                        del granule.identifiables[range_id]
                    if granule.identifiables.has_key(bounds_id):
                        log.debug('Removing %s from granule', bounds_id)
                        del granule.identifiables[bounds_id]

        log.debug('Domains: %s', domain_ids)
        log.debug('Ranges: %s', coverage_ids)
        log.debug('Values_paths: %s', values_path)

        file_path = self._get_hdf_from_string(
            granule.identifiables[self.data_stream_id].values)
        full_coverage = list(domain_ids + coverage_ids)

        log.debug('Full coverage: %s' % full_coverage)
        log.debug('Calling acquire_data with: %s, %s, %s', [file_path],
                  values_path, granule.identifiables[element_count_id].value)

        codec = HDFEncoder()

        pairs = self._pair_up(granule)
        var_names = list([i[0] for i in pairs])

        record_count = granule.identifiables[self.element_count_id].value
        data = acquire_data([file_path], var_names, record_count).next()
        for row, value in data.iteritems():
            vp = self._find_vp(pairs, row)
            codec.add_hdf_dataset(vp, value['values'])

        hdf_string = codec.encoder_close()
        self._patch_granule(granule, hdf_string)

        FileSystem.unlink(file_path)

        return granule
Example #16
    def test_replay_integration(self):
        '''
        test_replay_integration
        '''
        import numpy as np
        # Keep the import; it's used in the vector comparison below even though PyCharm says it's unused.

        cc = self.container
        XP = self.XP
        assertions = self.assertTrue

        ### Everything below here can be run as a script:
        log.debug('Got it')

        pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
        ingestion_management_service = IngestionManagementServiceClient(
            node=cc.node)
        dataset_management_service = DatasetManagementServiceClient(
            node=cc.node)
        data_retriever_service = DataRetrieverServiceClient(node=cc.node)

        datastore_name = 'dm_test_replay_integration'

        producer = Publisher(name=(XP, 'stream producer'))

        ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
            exchange_point_id=XP,
            couch_storage=CouchStorage(datastore_name=datastore_name,
                                       datastore_profile='SCIDATA'),
            hdf_storage=HdfStorage(),
            number_of_workers=1)

        ingestion_management_service.activate_ingestion_configuration(
            ingestion_configuration_id=ingestion_configuration_id)

        definition = SBE37_CDM_stream_definition()
        data_stream_id = definition.data_stream_id
        encoding_id = definition.identifiables[data_stream_id].encoding_id
        element_count_id = definition.identifiables[
            data_stream_id].element_count_id

        stream_def_id = pubsub_management_service.create_stream_definition(
            container=definition)
        stream_id = pubsub_management_service.create_stream(
            stream_definition_id=stream_def_id)

        dataset_id = dataset_management_service.create_dataset(
            stream_id=stream_id,
            datastore_name=datastore_name,
            view_name='datasets/dataset_by_id')
        ingestion_management_service.create_dataset_configuration(
            dataset_id=dataset_id,
            archive_data=True,
            archive_metadata=True,
            ingestion_configuration_id=ingestion_configuration_id)
        definition.stream_resource_id = stream_id

        packet = _create_packet(definition)
        input_file = FileSystem.mktemp()
        input_file.write(packet.identifiables[data_stream_id].values)
        input_file_path = input_file.name
        input_file.close()

        fields = [
            'conductivity', 'height', 'latitude', 'longitude', 'pressure',
            'temperature', 'time'
        ]

        input_vectors = acquire_data([input_file_path], fields, 2).next()

        producer.publish(msg=packet, to_name=(XP, '%s.data' % stream_id))

        replay_id, replay_stream_id = data_retriever_service.define_replay(
            dataset_id)
        ar = gevent.event.AsyncResult()

        def sub_listen(msg, headers):

            assertions(isinstance(msg, StreamGranuleContainer),
                       'replayed message is not a granule.')
            hdf_string = msg.identifiables[data_stream_id].values
            sha1 = hashlib.sha1(hdf_string).hexdigest().upper()
            assertions(sha1 == msg.identifiables[encoding_id].sha1,
                       'Checksum failed.')
            assertions(
                msg.identifiables[element_count_id].value == 1,
                'record replay count is incorrect %d.' %
                msg.identifiables[element_count_id].value)
            output_file = FileSystem.mktemp()
            output_file.write(msg.identifiables[data_stream_id].values)
            output_file_path = output_file.name
            output_file.close()
            output_vectors = acquire_data([output_file_path], fields, 2).next()
            for field in fields:
                comparison = (input_vectors[field]['values'] ==
                              output_vectors[field]['values'])
                assertions(
                    comparison.all(), 'vector mismatch: %s vs %s' %
                    (input_vectors[field]['values'],
                     output_vectors[field]['values']))
            FileSystem.unlink(output_file_path)
            ar.set(True)

        subscriber = Subscriber(name=(XP, 'replay listener'),
                                callback=sub_listen)

        g = gevent.Greenlet(subscriber.listen,
                            binding='%s.data' % replay_stream_id)
        g.start()

        data_retriever_service.start_replay(replay_id)

        ar.get(timeout=10)

        FileSystem.unlink(input_file_path)
Example #17
    def subset(self,granule,coverages):
        '''
        @param granule
        @return dataset subset based on the fields
        '''
        assert isinstance(granule, StreamGranuleContainer), 'object is not a granule.'
        field_ids = self.field_ids
        element_count_id = self.element_count_id


        values_path = list()
        domain_ids = list()
        coverage_ids = list()
        coverages = list(coverages)
        log.debug('Coverages include %s of type %s', coverages, type(coverages))
        #-----------------------------------------------------------------------------------------------------------
        # Iterate through the fields IAW stream definition and check for rangesets and coordinate axes
        #  - If it's a coordinate axis, it belongs regardless of what the client desires. (It's part of the domain)
        #  - If it's a rangeset, make sure that it's part of what the client asked for; if not, discard it
        #-----------------------------------------------------------------------------------------------------------


        for field_id in field_ids:

            range_id = self.definition.identifiables[field_id].range_id

            #-------------------------------------------------------------------------------------
            # Coordinate Axis
            # - Keep track of this in our domains
            # - Add it to the paths we need to grab from the file(s)
            #-------------------------------------------------------------------------------------

            if isinstance(self.definition.identifiables[range_id], CoordinateAxis):
                log.debug('got a domain: %s' % range_id)
                domain_ids.append(field_id)
                if granule.identifiables.has_key(range_id):
                    value_path = granule.identifiables[range_id].values_path or self.definition.identifiables[range_id].values_path
                    values_path.append(value_path)
                else:
                    value_path = self.definition.identifiables[range_id].values_path
                    values_path.append(value_path)
                continue

            #-------------------------------------------------------------------------------------
            # Range Set
            # - If it's part of the coverages we want to keep
            #   - Add it to the list of ranges we're tracking
            #   - Add the value path to the paths we're tracking.
            #-------------------------------------------------------------------------------------


            if isinstance(self.definition.identifiables[range_id], RangeSet):
                # If it's a rangeset, a specified coverage, and the granule has it, add it to the list
                if field_id in coverages:
                    if granule.identifiables.has_key(range_id):
                        log.debug('got a range: %s' % range_id)
                        coverage_ids.append(field_id)
                        if granule.identifiables.has_key(range_id):
                            value_path = granule.identifiables[range_id].values_path or self.definition.identifiables[range_id].values_path
                            values_path.append(value_path)
                        else:
                            value_path = self.definition.identifiables[range_id].values_path
                            values_path.append(value_path)
                        continue

                # ----
                # We need to track the range and bounds because,
                # you guessed it, we need to update the bounds
                # ----

                range_id = self.definition.identifiables[field_id].range_id
                bounds_id = self.definition.identifiables[range_id].bounds_id


                #---
                # Lastly, if the field is there and we don't want it, we need to strip it
                #---

                if not (field_id in coverages):
                    log.debug('%s doesn\'t belong in %s.', field_id, coverages)
                    log.debug('rebool: %s', bool(field_id in coverages))
                    if granule.identifiables.has_key(range_id):
                        log.debug('Removing %s from granule', range_id)
                        del granule.identifiables[range_id]
                    if granule.identifiables.has_key(bounds_id):
                        log.debug('Removing %s from granule', bounds_id)
                        del granule.identifiables[bounds_id]

        log.debug('Domains: %s', domain_ids)
        log.debug('Ranges: %s', coverage_ids)
        log.debug('Values_paths: %s', values_path)

        file_path = self._get_hdf_from_string(granule.identifiables[self.data_stream_id].values)
        full_coverage = list(domain_ids + coverage_ids)

        log.debug('Full coverage: %s' % full_coverage)
        log.debug('Calling acquire_data with: %s, %s, %s', [file_path],values_path,granule.identifiables[element_count_id].value)

        codec = HDFEncoder()

        pairs = self._pair_up(granule)
        var_names = list([i[0] for i in pairs])

        record_count = granule.identifiables[self.element_count_id].value
        data = acquire_data([file_path], var_names, record_count).next()
        for row,value in data.iteritems():
            vp = self._find_vp(pairs, row)
            codec.add_hdf_dataset(vp, value['values'])

        hdf_string = codec.encoder_close()
        self._patch_granule(granule,hdf_string)

        FileSystem.unlink(file_path)

        return granule