Example #1
class IngestionWorker(TransformDataProcess):
    """
    Instances of this class acts as Ingestion Workers. They receive packets and send them to couchdb datastore or
    hdf storage according to the policy in the data stream or the default policy of the ingestion configuration
    """


    def dataset_configs_event_test_hook(self, msg, headers):
        pass

    def ingest_process_test_hook(self,msg, headers):
        pass

    def on_init(self):
        self.event_pub = EventPublisher()

    def on_start(self):
        super(IngestionWorker,self).on_start()
        #----------------------------------------------
        # Start up couch
        #----------------------------------------------


        self.couch_config = self.CFG.get('couch_storage')
        self.hdf_storage = self.CFG.get('hdf_storage')

        self.number_of_workers = self.CFG.get('number_of_workers')
        self.description = self.CFG.get('description')

        self.ingest_config_id = self.CFG.get('configuration_id')

        self.datastore_name = self.couch_config.get('datastore_name',None) or 'dm_datastore'
        try:
            self.datastore_profile = getattr(DataStore.DS_PROFILE, self.couch_config.get('datastore_profile','SCIDATA'))
        except AttributeError:
            log.exception('Invalid datastore profile passed to ingestion worker. Defaulting to SCIDATA')

            self.datastore_profile = DataStore.DS_PROFILE.SCIDATA
        log.debug('datastore_profile %s' % self.datastore_profile)
        self.db = self.container.datastore_manager.get_datastore(ds_name=self.datastore_name,
                                                                  profile=self.datastore_profile,
                                                                  config=self.CFG)

        self.resource_reg_client = ResourceRegistryServiceClient(node=self.container.node)

        self.dataset_configs = {}
        # update the policy
        def receive_dataset_config_event(event_msg, headers):
            log.info('Updating dataset config in ingestion worker: %s', event_msg)

            if event_msg.type != DatasetIngestionTypeEnum.DATASETINGESTIONBYSTREAM:
                raise IngestionWorkerException('Received invalid type in dataset config event.')

            stream_id = event_msg.configuration.stream_id

            if event_msg.deleted:
                try:
                    del self.dataset_configs[stream_id]
                except KeyError:
                    log.info('Tried to remove dataset config that does not exist!')
            else:
                self.dataset_configs[stream_id] = event_msg

            # Hook to override just before processing is complete
            self.dataset_configs_event_test_hook(event_msg, headers)


        #Start the event subscriber - really - what a mess!
        self.event_subscriber = EventSubscriber(
            event_type="DatasetIngestionConfigurationEvent",
            origin=self.ingest_config_id,
            callback=receive_dataset_config_event
            )

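        # Run the subscriber in its own greenlet and give it up to five seconds
        # to become ready before start-up continues.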
        self.gl = spawn(self.event_subscriber.listen)
        self.event_subscriber._ready_event.wait(timeout=5)

        log.info(str(self.db))

    def process(self, packet):
        """Process incoming data!!!!
        """

        # Reject any packet that is not a stream granule
        if not isinstance(packet, StreamGranuleContainer):
            raise IngestionWorkerException('Received invalid message type: "%s"' % type(packet))


        # Get the dataset config for this stream
        dset_config = self.get_dataset_config(packet)

        # Process the packet

        ingest_attributes = self.process_stream(packet, dset_config)


        #@todo - get this data from the dataset config...
        if dset_config:
            dataset_id = dset_config.dataset_id
            stream_id = dset_config.stream_id

            self.event_pub.publish_event(event_type="GranuleIngestedEvent", sub_type="DatasetIngest",
                origin=dataset_id, status=200,
                ingest_attributes=ingest_attributes, stream_id=stream_id)


            headers = ''
            # Hook to override just before processing is complete
            self.ingest_process_test_hook(packet, headers)


    def persist_immutable(self, obj):
        """
        This method is not functional yet - the doc object is python specific. The sha1 must be of a language independent form.
        """
        doc = self.db._ion_object_to_persistence_dict(obj)
        sha1 = sha1hex(doc)

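        # Using the content sha1 as the document id makes the write idempotent: re-sending
        # identical metadata triggers the BadRequest handled below instead of creating a
        # second document.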
        try:
            self.db.create_doc(doc, object_id=sha1)
            log.debug('Persisted document %s', type(obj))
        except BadRequest:
            # Deduplication in action!
            #@TODO why are we getting so many duplicate comments?
            log.exception('Failed to write packet!\n%s' % obj)

        # Do the id or revision have a purpose? do we need a return value?


    def process_stream(self, packet, dset_config):
        """
        Accepts a stream. Also accepts instruction (a dset_config). According to the received dset_config it processes the
        stream such as store in hfd_storage, couch_storage.
        @param: packet The incoming data stream of type stream.
        @param: dset_config The dset_config telling this method what to do with the incoming data stream.
        """


        ingestion_attributes = {'variables': [], 'number_of_records': -1,
                                'updated_metadata': False, 'updated_data': False}

        if dset_config is None:
            log.info('No dataset config for this stream!')
            return

        values_string = ''
        sha1 = ''
        encoding_type = ''

        for key,value in packet.identifiables.iteritems():
            if isinstance(value, DataStream):
                values_string = value.values
                value.values=''

            elif isinstance(value, Encoding):
                sha1 = value.sha1
                encoding_type = value.encoding_type

            elif isinstance(value, Coverage):
                ingestion_attributes['variables'].append(key)

            elif isinstance(value, CountElement):
                ingestion_attributes['number_of_records'] = value.value

        if dset_config.archive_metadata is True:
            log.debug("Persisting data....")
            ingestion_attributes['updated_metadata'] = True
            self.persist_immutable(packet )

        if dset_config.archive_data is True:
            #@todo - grab the filepath to save the hdf string somewhere..

            ingestion_attributes['updated_data'] = True
            if values_string:

                calculated_sha1 = hashlib.sha1(values_string).hexdigest().upper()

                filename = FileSystem.get_hierarchical_url(FS.CACHE, calculated_sha1, ".%s" % encoding_type)

                if sha1 != calculated_sha1:
                    raise IngestionWorkerException('The stored sha1 differs from the sha1 calculated from the received hdf_string')

                #log.warn('writing to filename: %s' % filename)

                with open(filename, mode='wb') as f:
                    f.write(values_string)
            else:
                log.warn("Nothing to write!")


        return ingestion_attributes

    def on_stop(self):
        TransformDataProcess.on_stop(self)

        # close event subscriber safely
        self.event_subscriber.close()
        self.gl.join(timeout=5)
        self.gl.kill()

        self.db.close()

    def on_quit(self):
        TransformDataProcess.on_quit(self)

        # close event subscriber safely
        self.event_subscriber.close()
        self.gl.join(timeout=5)
        self.gl.kill()

        self.db.close()



    def get_dataset_config(self, incoming_packet):
        """
        Gets the dset_config for the data stream
        """

        try:
            stream_id = incoming_packet.stream_resource_id
        except AttributeError:
            log.info('Packet does not have a data_stream_id: using default policy')
            return None


        dset_config = self.dataset_configs.get(stream_id, None)

        configuration = None
        if dset_config is None:
            log.info('No config found for stream id: %s ' % stream_id)
        else:
            log.info('Got config: %s for stream id: %s' % (dset_config, stream_id))
            configuration = dset_config.configuration

        # return the extracted instruction
        return configuration
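
The heart of process_stream above is its verify-then-cache step for the raw HDF byte string: recompute the
sha1 of the bytes, compare it with the digest carried in the granule's Encoding element, and only then write
the bytes to a cache file named after that digest. A minimal standalone sketch of that step follows; the
helper name verify_and_cache_hdf and the cache_dir parameter are illustrative additions, and plain os.path
is used in place of FileSystem.get_hierarchical_url.

import hashlib
import os


def verify_and_cache_hdf(values_string, expected_sha1, encoding_type, cache_dir):
    """Hypothetical helper mirroring the archive_data branch of process_stream."""
    # Recompute the digest of the raw HDF bytes and compare it with the digest
    # carried in the granule's Encoding element.
    calculated_sha1 = hashlib.sha1(values_string).hexdigest().upper()
    if expected_sha1 != calculated_sha1:
        raise ValueError('Stored sha1 %s does not match calculated sha1 %s'
                         % (expected_sha1, calculated_sha1))

    # Name the cache file after the digest so identical payloads map to the same file.
    filename = os.path.join(cache_dir, '%s.%s' % (calculated_sha1, encoding_type))
    with open(filename, mode='wb') as f:
        f.write(values_string)
    return filename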
Example #2
class IngestionWorker(TransformDataProcess):
    """
    Instances of this class acts as Ingestion Workers. They receive packets and send them to couchdb datastore or
    hdf storage according to the policy in the data stream or the default policy of the ingestion configuration
    """
    def dataset_configs_event_test_hook(self, msg, headers):
        pass

    def ingest_process_test_hook(self, msg, headers):
        pass

    def on_init(self):
        self.event_pub = EventPublisher()

    def on_start(self):
        super(IngestionWorker, self).on_start()
        #----------------------------------------------
        # Start up couch
        #----------------------------------------------

        self.couch_config = self.CFG.get('couch_storage')
        self.hdf_storage = self.CFG.get('hdf_storage')

        self.number_of_workers = self.CFG.get('number_of_workers')
        self.description = self.CFG.get('description')

        self.ingest_config_id = self.CFG.get('configuration_id')

        self.datastore_name = self.couch_config.get('datastore_name',
                                                    None) or 'dm_datastore'
        try:
            self.datastore_profile = getattr(
                DataStore.DS_PROFILE,
                self.couch_config.get('datastore_profile', 'SCIDATA'))
        except AttributeError:
            log.exception(
                'Invalid datastore profile passed to ingestion worker. Defaulting to SCIDATA'
            )

            self.datastore_profile = DataStore.DS_PROFILE.SCIDATA
        log.debug('datastore_profile %s' % self.datastore_profile)
        self.db = self.container.datastore_manager.get_datastore(
            ds_name=self.datastore_name,
            profile=self.datastore_profile,
            config=self.CFG)

        self.resource_reg_client = ResourceRegistryServiceClient(
            node=self.container.node)

        self.dataset_configs = {}

        # update the policy
        def receive_dataset_config_event(event_msg, headers):
            log.info('Updating dataset config in ingestion worker: %s',
                     event_msg)

            if event_msg.type != DatasetIngestionTypeEnum.DATASETINGESTIONBYSTREAM:
                raise IngestionWorkerException(
                    'Received invalid type in dataset config event.')

            stream_id = event_msg.configuration.stream_id

            if event_msg.deleted:
                try:
                    del self.dataset_configs[stream_id]
                except KeyError:
                    log.info(
                        'Tried to remove dataset config that does not exist!')
            else:
                self.dataset_configs[stream_id] = event_msg

            # Hook to override just before processing is complete
            self.dataset_configs_event_test_hook(event_msg, headers)

        #Start the event subscriber - really - what a mess!
        self.event_subscriber = EventSubscriber(
            event_type="DatasetIngestionConfigurationEvent",
            origin=self.ingest_config_id,
            callback=receive_dataset_config_event)

        self.gl = spawn(self.event_subscriber.listen)
        self.event_subscriber._ready_event.wait(timeout=5)

        log.info(str(self.db))

    def process(self, packet):
        """Process incoming data!!!!
        """

        if isinstance(packet, (Granule, CompoundGranule)):
            log.info("Received new granule, but ingestion doesn't work yet!")
            return

        # Reject any packet that is not a stream granule
        if not isinstance(packet, StreamGranuleContainer):
            raise IngestionWorkerException(
                'Received invalid message type: "%s"' % type(packet))

        # Get the dataset config for this stream
        dset_config = self.get_dataset_config(packet)

        # Process the packet

        ingest_attributes = self.process_stream(packet, dset_config)

        #@todo - get this data from the dataset config...
        if dset_config:
            dataset_id = dset_config.dataset_id
            stream_id = dset_config.stream_id

            self.event_pub.publish_event(event_type="GranuleIngestedEvent",
                                         sub_type="DatasetIngest",
                                         origin=dataset_id,
                                         status=200,
                                         ingest_attributes=ingest_attributes,
                                         stream_id=stream_id)

            headers = ''
            # Hook to override just before processing is complete
            self.ingest_process_test_hook(packet, headers)

    def persist_immutable(self, obj):
        """
        This method is not functional yet - the doc object is python specific. The sha1 must be of a language independent form.
        """
        doc = self.db._ion_object_to_persistence_dict(obj)
        sha1 = sha1hex(doc)

        try:
            self.db.create_doc(doc, object_id=sha1)
            log.debug('Persisted document %s', type(obj))
        except BadRequest:
            # Deduplication in action!
            #@TODO why are we getting so many duplicate comments?
            log.exception('Failed to write packet!\n%s' % obj)

        # Do the id or revision have a purpose? do we need a return value?

    def process_stream(self, packet, dset_config):
        """
        Accepts a stream. Also accepts instruction (a dset_config). According to the received dset_config it processes the
        stream such as store in hfd_storage, couch_storage.
        @param: packet The incoming data stream of type stream.
        @param: dset_config The dset_config telling this method what to do with the incoming data stream.
        """

        ingestion_attributes = {
            'variables': [],
            'number_of_records': -1,
            'updated_metadata': False,
            'updated_data': False
        }

        if dset_config is None:
            log.info('No dataset config for this stream!')
            return

        values_string = ''
        sha1 = ''
        encoding_type = ''

        for key, value in packet.identifiables.iteritems():
            if isinstance(value, DataStream):
                values_string = value.values
                value.values = ''

            elif isinstance(value, Encoding):
                sha1 = value.sha1
                encoding_type = value.encoding_type

            elif isinstance(value, Coverage):
                ingestion_attributes['variables'].append(key)

            elif isinstance(value, CountElement):
                ingestion_attributes['number_of_records'] = value.value

        if dset_config.archive_metadata is True:
            log.debug("Persisting data....")
            ingestion_attributes['updated_metadata'] = True
            self.persist_immutable(packet)

        if dset_config.archive_data is True:
            #@todo - grab the filepath to save the hdf string somewhere..

            ingestion_attributes['updated_data'] = True
            if values_string:

                calculated_sha1 = hashlib.sha1(
                    values_string).hexdigest().upper()

                filename = FileSystem.get_hierarchical_url(
                    FS.CACHE, calculated_sha1, ".%s" % encoding_type)

                if sha1 != calculated_sha1:
                    raise IngestionWorkerException(
                        'The stored sha1 differs from the sha1 calculated from the received hdf_string'
                    )

                #log.warn('writing to filename: %s' % filename)

                with open(filename, mode='wb') as f:
                    f.write(values_string)
            else:
                log.warn("Nothing to write!")

        return ingestion_attributes

    def on_stop(self):
        TransformDataProcess.on_stop(self)

        # close event subscriber safely
        self.event_subscriber.close()
        self.gl.join(timeout=5)
        self.gl.kill()

        self.db.close()

    def on_quit(self):
        TransformDataProcess.on_quit(self)

        # close event subscriber safely
        self.event_subscriber.close()
        self.gl.join(timeout=5)
        self.gl.kill()

        self.db.close()

    def get_dataset_config(self, incoming_packet):
        """
        Gets the dset_config for the data stream
        """

        try:
            stream_id = incoming_packet.stream_resource_id
        except AttributeError:
            log.info(
                'Packet does not have a data_stream_id: using default policy')
            return None

        dset_config = self.dataset_configs.get(stream_id, None)

        configuration = None
        if dset_config is None:
            log.info('No config found for stream id: %s ' % stream_id)
        else:
            log.info('Got config: %s for stream id: %s' %
                     (dset_config, stream_id))
            configuration = dset_config.configuration

        # return the extracted instruction
        return configuration
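
Both variants expose dataset_configs_event_test_hook and ingest_process_test_hook purely as override points:
they are called just before config handling and packet processing complete, respectively. As a rough sketch
of how a test might use that (the CapturingIngestionWorker subclass below is hypothetical, not part of the
original code):

class CapturingIngestionWorker(IngestionWorker):
    """Hypothetical subclass that records what the worker handled, for assertions in tests."""

    def on_init(self):
        super(CapturingIngestionWorker, self).on_init()
        self.captured_packets = []
        self.captured_configs = []

    def ingest_process_test_hook(self, msg, headers):
        # Called near the end of process() when a dataset config was found for the packet.
        self.captured_packets.append(msg)

    def dataset_configs_event_test_hook(self, msg, headers):
        # Called after a DatasetIngestionConfigurationEvent has been applied to
        # self.dataset_configs.
        self.captured_configs.append(msg)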
Example #3
class IngestionWorker(TransformDataProcess):
    """
    Simple implementation of ingestion worker to park Construction period Granules on Disk
    """


    def dataset_configs_event_test_hook(self, msg, headers):
        pass

    def ingest_process_test_hook(self,msg, headers):
        pass

    def on_init(self):
        self.event_pub = EventPublisher()

    def on_start(self):
        super(IngestionWorker,self).on_start()
        #----------------------------------------------
        # Start up couch
        #----------------------------------------------


        self.couch_config = self.CFG.get('couch_storage')
        self.hdf_storage = self.CFG.get('hdf_storage')

        self.number_of_workers = self.CFG.get('number_of_workers')
        self.description = self.CFG.get('description')

        self.ingest_config_id = self.CFG.get('configuration_id')

        self.datastore_name = self.couch_config.get('datastore_name',None) or 'dm_datastore'
        try:
            self.datastore_profile = getattr(DataStore.DS_PROFILE, self.couch_config.get('datastore_profile','SCIDATA'))
        except AttributeError:
            log.exception('Invalid datastore profile passed to ingestion worker. Defaulting to SCIDATA')

            self.datastore_profile = DataStore.DS_PROFILE.SCIDATA
        log.debug('datastore_profile %s' % self.datastore_profile)
        self.db = self.container.datastore_manager.get_datastore(ds_name=self.datastore_name,
                                                                  profile=self.datastore_profile,
                                                                  config=self.CFG)

        self.resource_reg_client = ResourceRegistryServiceClient(node=self.container.node)

        self.dataset_configs = {}
        # The dataset config policy is updated via receive_dataset_config_event below

        #Start the event subscriber - really - what a mess!
        self.event_subscriber = EventSubscriber(
            event_type="DatasetIngestionConfigurationEvent",
            origin=self.ingest_config_id,
            callback=self.receive_dataset_config_event
        )

        self.gl = spawn(self.event_subscriber.listen)
        self.event_subscriber._ready_event.wait(timeout=5)

        log.info(str(self.db))

    def receive_dataset_config_event(self, event_msg, headers):
        log.info('Updating dataset config in ingestion worker: %s', event_msg)

        if event_msg.type != DatasetIngestionTypeEnum.DATASETINGESTIONBYSTREAM:
            raise IngestionWorkerException('Received invalid type in dataset config event.')

        stream_id = event_msg.configuration.stream_id

        if event_msg.deleted:
            try:
                del self.dataset_configs[stream_id]
            except KeyError:
                log.info('Tried to remove dataset config that does not exist!')
        else:
            self.dataset_configs[stream_id] = event_msg

        # Hook to override just before processing is complete
        self.dataset_configs_event_test_hook(event_msg, headers)

    def process(self, packet):
        """Process incoming data!!!!
        """

        if not isinstance(packet, Granule):
            log.info('Received a packet that is not a new granule!')
            return

        # Get the dataset config for this stream
        dset_config = self.get_dataset_config(packet)

        # Process the packet

        ingest_attributes = self.process_stream(packet, dset_config)


        #@todo - get this data from the dataset config...
        if dset_config:
            dataset_id = dset_config.dataset_id
            stream_id = dset_config.stream_id

            self.event_pub.publish_event(event_type="GranuleIngestedEvent", sub_type="DatasetIngest",
                origin=dataset_id, status=200,
                ingest_attributes=ingest_attributes, stream_id=stream_id)


            headers = ''
            # Hook to override just before processing is complete
            self.ingest_process_test_hook(packet, headers)


    def persist_immutable(self, obj):
        """
        This method is not functional yet - the doc object is python specific. The sha1 must be of a language independent form.
        """

        return self.db.create_doc(obj)


    def process_stream(self, packet, dset_config):
        """
        Accepts a stream. Also accepts instruction (a dset_config). According to the received dset_config it processes the
        stream such as store in hfd_storage, couch_storage.
        @param: packet The incoming data stream of type stream.
        @param: dset_config The dset_config telling this method what to do with the incoming data stream.
        """


        ingestion_attributes = {'variables': [], 'number_of_records': -1,
                                'updated_metadata': False, 'updated_data': False}

        if dset_config is None:
            log.info('No dataset config for this stream!')
            return


        # Get back to the serialized form - the process receives only the IonObject after the interceptor stack has decoded it...
        simple_dict = ion_serializer.serialize(packet) #packet is an ion_object
        byte_string = msgpack.packb(simple_dict, default=encode_ion)

        encoding_type = 'ion_msgpack'

        # Persisted sha1 is crafted from the byte string msgpack creates
        calculated_sha1 = hashlib.sha1(byte_string).hexdigest().upper()

        dataset_granule = {
            'stream_id'      : dset_config.stream_id,
            'dataset_id'     : dset_config.dataset_id,
            'persisted_sha1' : calculated_sha1,
            'encoding_type'  : encoding_type,
            'ts_create'      : get_ion_ts()
        }

        self.persist_immutable(dataset_granule)

        filename = FileSystem.get_hierarchical_url(FS.CACHE, calculated_sha1, ".%s" % encoding_type)

        with open(filename, mode='wb') as f:
            f.write(byte_string)


        return ingestion_attributes

    def on_stop(self):
        TransformDataProcess.on_stop(self)

        # close event subscriber safely
        self.event_subscriber.close()
        self.gl.join(timeout=5)
        self.gl.kill()

        self.db.close()

    def on_quit(self):
        TransformDataProcess.on_quit(self)

        # close event subscriber safely
        self.event_subscriber.close()
        self.gl.join(timeout=5)
        self.gl.kill()

        self.db.close()



    def get_dataset_config(self, incoming_packet):
        """
        Gets the dset_config for the data stream
        """

        try:
            # For now - use the data_producer_id field as a stream_id to get us moving
            #@todo fix the mismatch between producer_id and stream_id!
            stream_id = incoming_packet.data_producer_id
        except AttributeError:
            log.info('Packet does not have a data_stream_id: using default policy')
            return None


        dset_config = self.dataset_configs.get(stream_id, None)

        configuration = None
        if dset_config is None:
            log.info('No config found for stream id: %s ' % stream_id)
        else:
            log.info('Got config: %s for stream id: %s' % (dset_config, stream_id))
            configuration = dset_config.configuration

        # return the extracted instruction
        return configuration
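
This third variant only parks the msgpack-serialized granule on disk and writes a small index document
(dataset_granule) pointing at it: the file is named after the sha1 of its own contents, and the same digest
is recorded in the index. A rough sketch of how a reader could later check a cached file against its index
record follows; verify_parked_granule is an illustrative helper, not part of the original code, and filename
is whatever path FileSystem.get_hierarchical_url resolved to at write time.

import hashlib


def verify_parked_granule(filename, index_doc):
    """Hypothetical check that a parked granule file matches its index document."""
    with open(filename, mode='rb') as f:
        byte_string = f.read()

    # process_stream names the cache file after the sha1 of its contents and records
    # the same digest in the index document, so the round trip can be verified.
    if hashlib.sha1(byte_string).hexdigest().upper() != index_doc['persisted_sha1']:
        raise IOError('Cached granule %s failed its sha1 check' % filename)
    return byte_string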