Example #1
class QCPostProcessing(SimpleProcess):
    '''
    QC Post Processing Process

    This process gives ION clients and operators the ability to evaluate the automated quality control flags on
    various data products. It should be run periodically with overlapping spans of data to ensure complete
    dataset QC verification.

    The parameters this process accepts as configuration are:
        - dataset_id: The dataset identifier, required.
        - start_time: Unix timestamp, defaults to 24 hours in the past
        - end_time: Unix timestamp, defaults to current time
        - qc_params: a list of qc functions to evaluate, currently supported functions are: ['glblrng_qc',
          'spketst_qc', 'stuckvl_qc'], defaults to all

    '''
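
    # The docstring above maps onto the CFG keys read in on_start(); a hypothetical
    # configuration (assumed layout, shown only for illustration) could look like:
    #
    #   {'process': {'interval_key': 'qc_post_processing_interval',
    #                'qc_params': ['glblrng_qc', 'spketst_qc']},
    #    'service': {'qc_processing': {'run_interval': 24}}}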

    qc_suffixes = ['glblrng_qc', 'spketst_qc', 'stuckvl_qc']
    def on_start(self):
        SimpleProcess.on_start(self)
        self.data_retriever = DataRetrieverServiceProcessClient(process=self)
        self.interval_key = self.CFG.get_safe('process.interval_key',None)
        self.qc_params    = self.CFG.get_safe('process.qc_params',[])
        validate_is_not_none(self.interval_key, 'An interval key is necessary to launch this process')
        self.event_subscriber = EventSubscriber(event_type=OT.TimerEvent, origin=self.interval_key, callback=self._event_callback, auto_delete=True)
        self.add_endpoint(self.event_subscriber)
        self.resource_registry = self.container.resource_registry
        self.run_interval = self.CFG.get_safe('service.qc_processing.run_interval', 24)
    
    def _event_callback(self, *args, **kwargs):
        log.info('QC Post Processing Triggered')
        dataset_ids, _ = self.resource_registry.find_resources(restype=RT.Dataset, id_only=True)
        for dataset_id in dataset_ids:
            log.info('QC Post Processing for dataset %s', dataset_id)
            try:
                self.process(dataset_id)
            except BadRequest as e:
                if 'Problems reading from the coverage' in e.message:
                    log.error('Failed to read from dataset %s', dataset_id, exc_info=True)

    def process(self, dataset_id, start_time=0, end_time=0):
        if not dataset_id:
            raise BadRequest('No dataset id specified.')
        now = time.time()
        start_time = start_time or (now - (3600*(self.run_interval+1))) # Every N hours with 1 hour of overlap
        end_time   = end_time or now
        
        qc_params  = [i for i in self.qc_params if i in self.qc_suffixes] or self.qc_suffixes
        
        self.qc_publisher = EventPublisher(event_type=OT.ParameterQCEvent)
        log.debug('Iterating over the data blocks')

        for st,et in self.chop(int(start_time),int(end_time)):
            log.debug('Chopping %s:%s', st, et)
            log.debug("Retrieving data: data_retriever.retrieve('%s', query={'start_time':%s, 'end_time':%s')", dataset_id, st, et)
            try:
                granule = self.data_retriever.retrieve(dataset_id, query={'start_time':st, 'end_time':et})
            except BadRequest:
                data_products, _ = self.container.resource_registry.find_subjects(object=dataset_id, predicate=PRED.hasDataset, subject_type=RT.DataProduct)
                for data_product in data_products:
                    log.exception('Failed to perform QC Post Processing on %s', data_product.name)
                    log.error('Calculated Start Time: %s', st)
                    log.error('Calculated End Time:   %s', et)
                raise
            log.debug('Retrieved Data')
            rdt = RecordDictionaryTool.load_from_granule(granule)
            qc_fields = [i for i in rdt.fields if any([i.endswith(j) for j in qc_params])]
            log.debug('QC Fields: %s', qc_fields)
            for field in qc_fields:
                val = rdt[field]
                if val is None:
                    continue
                if not np.all(val):
                    log.debug('Found QC Alerts')
                    indexes = np.where(val==0)
                    timestamps = rdt[rdt.temporal_parameter][indexes[0]]
                    self.flag_qc_parameter(dataset_id, field, timestamps.tolist(),{})

    def flag_qc_parameter(self, dataset_id, parameter, temporal_values, configuration):
        log.info('Flagging QC for %s', parameter)
        data_product_ids, _ = self.resource_registry.find_subjects(object=dataset_id, subject_type=RT.DataProduct, predicate=PRED.hasDataset, id_only=True)
        for data_product_id in data_product_ids:
            self.qc_publisher.publish_event(origin=data_product_id, qc_parameter=parameter, temporal_values=temporal_values, configuration=configuration)

    @classmethod
    def chop(cls, start_time, end_time):
        while start_time < end_time:
            yield (start_time, min(start_time+3600, end_time))
            start_time = min(start_time+3600, end_time)
        return
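
For reference, chop() above simply walks [start_time, end_time) in one-hour blocks. A minimal standalone sketch of the same generator (the epoch values below are hypothetical, chosen only to show two full hours plus a short remainder):

# Standalone copy of the chop() generator above, runnable on its own;
# the timestamps are made-up Unix epochs for illustration.
def chop(start_time, end_time):
    while start_time < end_time:
        yield (start_time, min(start_time + 3600, end_time))
        start_time = min(start_time + 3600, end_time)

print list(chop(0, 9000))
# [(0, 3600), (3600, 7200), (7200, 9000)]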
Example #2
class QCPostProcessing(SimpleProcess):
    '''
    QC Post Processing Process

    This process gives ION clients and operators the ability to evaluate the automated quality control flags on
    various data products. It should be run periodically with overlapping spans of data to ensure complete
    dataset QC verification.

    The parameters this process accepts as configuration are:
        - dataset_id: The dataset identifier, required.
        - start_time: Unix timestamp, defaults to 24 hours in the past
        - end_time: Unix timestamp, defaults to current time
        - qc_params: a list of qc functions to evaluate, currently supported functions are: ['glblrng_qc',
          'spketst_qc', 'stuckvl_qc'], defaults to all

    '''

    qc_suffixes = ['glblrng_qc', 'spketst_qc', 'stuckvl_qc']

    def on_start(self):
        SimpleProcess.on_start(self)
        self.data_retriever = DataRetrieverServiceProcessClient(process=self)
        self.interval_key = self.CFG.get_safe('process.interval_key', None)
        self.qc_params = self.CFG.get_safe('process.qc_params', [])
        validate_is_not_none(
            self.interval_key,
            'An interval key is necessary to launch this process')
        self.event_subscriber = EventSubscriber(event_type=OT.TimerEvent,
                                                origin=self.interval_key,
                                                callback=self._event_callback,
                                                auto_delete=True)
        self.add_endpoint(self.event_subscriber)
        self.resource_registry = self.container.resource_registry
        self.run_interval = self.CFG.get_safe(
            'service.qc_processing.run_interval', 24)

    def _event_callback(self, *args, **kwargs):
        log.info('QC Post Processing Triggered')
        dataset_ids, _ = self.resource_registry.find_resources(
            restype=RT.Dataset, id_only=True)
        for dataset_id in dataset_ids:
            log.info('QC Post Processing for dataset %s', dataset_id)
            try:
                self.process(dataset_id)
            except BadRequest as e:
                if 'Problems reading from the coverage' in e.message:
                    log.error('Failed to read from dataset %s', dataset_id)

    def process(self, dataset_id, start_time=0, end_time=0):
        if not dataset_id:
            raise BadRequest('No dataset id specified.')
        now = time.time()
        start_time = start_time or (now - 3600 * (self.run_interval + 1))  # Every N hours with 1 hour of overlap
        end_time = end_time or now

        qc_params = [i for i in self.qc_params
                     if i in self.qc_suffixes] or self.qc_suffixes

        self.qc_publisher = EventPublisher(event_type=OT.ParameterQCEvent)
        log.debug('Iterating over the data blocks')

        for st, et in self.chop(int(start_time), int(end_time)):
            log.debug('Chopping %s:%s', st, et)
            log.debug(
                "Retrieving data: data_retriever.retrieve('%s', query={'start_time':%s, 'end_time':%s})",
                dataset_id, st, et)
            granule = self.data_retriever.retrieve(
                dataset_id, query={'start_time': st, 'end_time': et})
            log.debug('Retrieved Data')
            rdt = RecordDictionaryTool.load_from_granule(granule)
            qc_fields = [i for i in rdt.fields
                         if any(i.endswith(j) for j in qc_params)]
            log.debug('QC Fields: %s', qc_fields)
            for field in qc_fields:
                val = rdt[field]
                if val is None:
                    continue
                if not np.all(val):
                    log.debug('Found QC Alerts')
                    indexes = np.where(val == 0)
                    timestamps = rdt[rdt.temporal_parameter][indexes[0]]
                    self.flag_qc_parameter(dataset_id, field,
                                           timestamps.tolist(), {})

    def flag_qc_parameter(self, dataset_id, parameter, temporal_values,
                          configuration):
        log.info('Flagging QC for %s', parameter)
        data_product_ids, _ = self.resource_registry.find_subjects(
            object=dataset_id,
            subject_type=RT.DataProduct,
            predicate=PRED.hasDataset,
            id_only=True)
        for data_product_id in data_product_ids:
            self.qc_publisher.publish_event(origin=data_product_id,
                                            qc_parameter=parameter,
                                            temporal_values=temporal_values,
                                            configuration=configuration)

    @classmethod
    def chop(cls, start_time, end_time):
        while start_time < end_time:
            yield (start_time, min(start_time + 3600, end_time))
            start_time = min(start_time + 3600, end_time)
        return
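
The flag scan inside process() relies on QC arrays using 0 for failed samples; a small self-contained sketch of that np.where step (the sample values and timestamps below are made up for illustration):

# Standalone sketch of the QC flag scan in process() above.
import numpy as np

val = np.array([1, 1, 0, 1, 0])                    # hypothetical *_qc values; 0 means failed
times = np.array([10.0, 20.0, 30.0, 40.0, 50.0])   # matching temporal parameter
if not np.all(val):
    indexes = np.where(val == 0)
    print times[indexes[0]].tolist()
# [30.0, 50.0] -- the timestamps that would be passed to flag_qc_parameter()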
Example #3
class VizTransformMatplotlibGraphs(TransformStreamPublisher, TransformEventListener, TransformStreamListener):

    """
    This class instantiates worker processes that subscribe to data streams and convert
    incoming data from CDM format to Matplotlib graphs

    """
    output_bindings = ['graph_image_param_dict']
    event_timer_interval = None


    def on_start(self):
        #print ">>>>>>>>>>>>>>>>>>>>>> MPL CFG = ", self.CFG

        self.pubsub_management = PubsubManagementServiceProcessClient(process=self)
        self.ssclient = SchedulerServiceProcessClient(process=self)
        self.rrclient = ResourceRegistryServiceProcessClient(process=self)
        self.data_retriever_client = DataRetrieverServiceProcessClient(process=self)
        self.dsm_client = DatasetManagementServiceProcessClient(process=self)
        self.pubsub_client = PubsubManagementServiceProcessClient(process=self)

        self.stream_info  = self.CFG.get_safe('process.publish_streams',{})
        self.stream_names = self.stream_info.keys()
        self.stream_ids   = self.stream_info.values()

        if not self.stream_names:
            raise BadRequest('MPL Transform has no output streams.')

        graph_time_periods = self.CFG.get_safe('graph_time_periods')

        # If this is meant to be an event driven process, schedule an event to be generated every few minutes/hours
        self.event_timer_interval = self.CFG.get_safe('graph_gen_interval')
        if self.event_timer_interval:
            event_origin = "Interval_Timer_Matplotlib"
            sub = EventSubscriber(event_type="ResourceEvent", callback=self.interval_timer_callback, origin=event_origin)
            sub.start()

            self.interval_timer_id = self.ssclient.create_interval_timer(
                start_time="now", interval=self._str_to_secs(self.event_timer_interval),
                event_origin=event_origin, event_subtype="")
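            # e.g. with graph_gen_interval = '6hrs' (a hypothetical config value),
            # the timer above fires every _str_to_secs('6hrs') == 21600.0 seconds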

        super(VizTransformMatplotlibGraphs, self).on_start()

    # when the transform is used as a data process
    def recv_packet(self, packet, in_stream_route, in_stream_id):
        # Check whether this instance was set up as an event-triggered transform. If so, skip the packet
        if self.event_timer_interval:
            return

        log.info('Received packet')
        mpl_data_granule = VizTransformMatplotlibGraphsAlgorithm.execute(packet, params=self.get_stream_definition())
        for stream_name in self.stream_names:
            publisher = getattr(self, stream_name)
            publisher.publish(mpl_data_granule)

    def get_stream_definition(self):
        stream_id = self.stream_ids[0]
        stream_def = self.pubsub_management.read_stream_definition(stream_id=stream_id)
        return stream_def._id

    def process_event(self, msg, headers):
        # no-op; this transform does its event-driven work in interval_timer_callback
        return

    def interval_timer_callback(self, *args, **kwargs):
        # Find the input data product for this process
        in_dp_id = self.CFG.get_safe('in_dp_id')

        log.debug('IN DP ID from cfg: %s', in_dp_id)

        # get the dataset_id associated with the data product; needed for the data retrieval
        ds_ids, _ = self.rrclient.find_objects(in_dp_id, PRED.hasDataset, RT.Dataset, True)
        if not ds_ids:
            return None

        # Retrieve data for the specified time interval. Set up the query from the passed config first
        query = {}

        param_list_str = self.CFG.get_safe('parameters')
        if param_list_str:
            query['parameters'] = param_list_str.split(', ')
            # append time if not present in list of parameters
            if 'time' not in query['parameters']:
                query['parameters'].append('time')

        query['start_time'] = query['end_time'] = ntplib.system_to_ntp_time(time.time()) # Now
        query['stride_time'] = 1
        if self.CFG.get_safe('graph_time_period'):
            query['start_time'] = query['end_time'] - self._str_to_secs(self.CFG.get_safe('graph_time_period'))

        #print " >>>>>>>>>>>>>> QUERY = ", query

        #retrieved_granule = self.data_retriever_client.retrieve(ds_ids[0],{'start_time':start_time,'end_time':end_time})
        retrieved_granule = self.data_retriever_client.retrieve(ds_ids[0], query=query)

        # add extra parameters from the config to the query; these are for the algorithm, not data retrieval
        if self.CFG.get_safe('resolution'):
            query['resolution'] = self.CFG.get_safe('resolution')

        # send the granule through the Algorithm code to get the matplotlib graphs
        mpl_pdict_id = self.dsm_client.read_parameter_dictionary_by_name('graph_image_param_dict',id_only=True)

        mpl_stream_def = self.pubsub_client.create_stream_definition('mpl', parameter_dictionary_id=mpl_pdict_id)
        fileName = self.CFG.get_safe('graph_time_period')
        mpl_data_granule = VizTransformMatplotlibGraphsAlgorithm.execute(retrieved_granule, config=query, params=mpl_stream_def, fileName=fileName)

        if mpl_data_granule is None:
            return None

        # publish on all specified output streams
        for stream_name in self.stream_names:
            publisher = getattr(self, stream_name)
            publisher.publish(mpl_data_granule)

        return

    def _str_to_secs(self, time_period):
        # Convert a commonly used time period (e.g. '6hrs') to seconds.
        # Separate the numeric and alphabetic parts of the time period.
        time_n = time_period.lower().rstrip('abcdefghijklmnopqrstuvwxyz ')
        time_a = time_period.lower().lstrip('0123456789. ')

        # determine whether the user specified secs, mins, hours, days, weeks, months or years
        factor = None
        if time_a in ('sec', 'secs', 'second', 'seconds'):
            factor = 1
        elif time_a in ('min', 'mins', 'minute', 'minutes'):
            factor = 60
        elif time_a in ('hr', 'hrs', 'hour', 'hours'):
            factor = 60 * 60
        elif time_a in ('day', 'days'):
            factor = 60 * 60 * 24
        elif time_a in ('wk', 'wks', 'week', 'weeks'):
            factor = 60 * 60 * 24 * 7
        elif time_a in ('mon', 'mons', 'month', 'months'):
            factor = 60 * 60 * 24 * 30
        elif time_a in ('yr', 'yrs', 'year', 'years'):
            factor = 60 * 60 * 24 * 365
        if factor is None:
            raise BadRequest('Unrecognized time period: %s' % time_period)

        return float(time_n) * factor


    def on_quit(self):

        #Cancel the timer
        if hasattr(self, 'interval_timer_id'):
            self.ssclient.cancel_timer(self.interval_timer_id)

        super(VizTransformMatplotlibGraphs, self).on_quit()
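
The numeric/alphabetic split in _str_to_secs works because rstrip/lstrip strip sets of characters rather than suffixes: stripping letters and spaces from the right leaves the number, and stripping digits, dots and spaces from the left leaves the unit. A quick standalone check (the '6 hrs' input is an arbitrary example):

# Standalone check of the strip-based split used by _str_to_secs above.
period = '6 hrs'
time_n = period.lower().rstrip('abcdefghijklmnopqrstuvwxyz ')  # -> '6'
time_a = period.lower().lstrip('0123456789. ')                 # -> 'hrs'
print time_n, time_a
# with factor 60 * 60 for 'hrs', the result is float('6') * 3600 = 21600.0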
Example #4
class VizTransformMatplotlibGraphs(TransformStreamPublisher,
                                   TransformEventListener,
                                   TransformStreamListener):
    """
    This class instantiates worker processes that subscribe to data streams and convert
    incoming data from CDM format to Matplotlib graphs

    """
    output_bindings = ['graph_image_param_dict']
    event_timer_interval = None

    def on_start(self):
        #print ">>>>>>>>>>>>>>>>>>>>>> MPL CFG = ", self.CFG

        self.pubsub_management = PubsubManagementServiceProcessClient(
            process=self)
        self.ssclient = SchedulerServiceProcessClient(process=self)
        self.rrclient = ResourceRegistryServiceProcessClient(process=self)
        self.data_retriever_client = DataRetrieverServiceProcessClient(
            process=self)
        self.dsm_client = DatasetManagementServiceProcessClient(process=self)
        self.pubsub_client = PubsubManagementServiceProcessClient(process=self)

        self.stream_info = self.CFG.get_safe('process.publish_streams', {})
        self.stream_names = self.stream_info.keys()
        self.stream_ids = self.stream_info.values()

        if not self.stream_names:
            raise BadRequest('MPL Transform has no output streams.')

        graph_time_periods = self.CFG.get_safe('graph_time_periods')

        # If this is meant to be an event driven process, schedule an event to be generated every few minutes/hours
        self.event_timer_interval = self.CFG.get_safe('graph_gen_interval')
        if self.event_timer_interval:
            event_origin = "Interval_Timer_Matplotlib"
            sub = EventSubscriber(event_type="ResourceEvent",
                                  callback=self.interval_timer_callback,
                                  origin=event_origin)
            sub.start()

            self.interval_timer_id = self.ssclient.create_interval_timer(
                start_time="now",
                interval=self._str_to_secs(self.event_timer_interval),
                event_origin=event_origin,
                event_subtype="")

        super(VizTransformMatplotlibGraphs, self).on_start()

    # when the transform is used as a data process
    def recv_packet(self, packet, in_stream_route, in_stream_id):
        # Check whether this instance was set up as an event-triggered transform. If so, skip the packet
        if self.event_timer_interval:
            return

        log.info('Received packet')
        mpl_data_granule = VizTransformMatplotlibGraphsAlgorithm.execute(
            packet, params=self.get_stream_definition())
        for stream_name in self.stream_names:
            publisher = getattr(self, stream_name)
            publisher.publish(mpl_data_granule)

    def get_stream_definition(self):
        stream_id = self.stream_ids[0]
        stream_def = self.pubsub_management.read_stream_definition(
            stream_id=stream_id)
        return stream_def._id

    def process_event(self, msg, headers):
        # no-op; this transform does its event-driven work in interval_timer_callback
        return

    def interval_timer_callback(self, *args, **kwargs):
        # Find the input data product for this process
        in_dp_id = self.CFG.get_safe('in_dp_id')

        log.debug('IN DP ID from cfg: %s', in_dp_id)

        # get the dataset_id associated with the data product; needed for the data retrieval
        ds_ids, _ = self.rrclient.find_objects(in_dp_id, PRED.hasDataset,
                                               RT.Dataset, True)
        if not ds_ids:
            return None

        # Retrieve data for the specified time interval. Set up the query from the passed config first
        query = {}

        param_list_str = self.CFG.get_safe('parameters')
        if param_list_str:
            query['parameters'] = param_list_str.split(', ')
            # append time if not present in list of parameters
            if 'time' not in query['parameters']:
                query['parameters'].append('time')

        query['start_time'] = query['end_time'] = ntplib.system_to_ntp_time(
            time.time())  # Now
        query['stride_time'] = 1
        if self.CFG.get_safe('graph_time_period'):
            query['start_time'] = query['end_time'] - self._str_to_secs(
                self.CFG.get_safe('graph_time_period'))

        #print " >>>>>>>>>>>>>> QUERY = ", query

        #retrieved_granule = self.data_retriever_client.retrieve(ds_ids[0],{'start_time':start_time,'end_time':end_time})
        retrieved_granule = self.data_retriever_client.retrieve(ds_ids[0],
                                                                query=query)

        # add extra parameters from the config to the query; these are for the algorithm, not data retrieval
        if self.CFG.get_safe('resolution'):
            query['resolution'] = self.CFG.get_safe('resolution')

        # send the granule through the Algorithm code to get the matplotlib graphs
        mpl_pdict_id = self.dsm_client.read_parameter_dictionary_by_name(
            'graph_image_param_dict', id_only=True)

        mpl_stream_def = self.pubsub_client.create_stream_definition(
            'mpl', parameter_dictionary_id=mpl_pdict_id)
        fileName = self.CFG.get_safe('graph_time_period')
        mpl_data_granule = VizTransformMatplotlibGraphsAlgorithm.execute(
            retrieved_granule,
            config=query,
            params=mpl_stream_def,
            fileName=fileName)

        if mpl_data_granule is None:
            return None

        # publish on all specified output streams
        for stream_name in self.stream_names:
            publisher = getattr(self, stream_name)
            publisher.publish(mpl_data_granule)

        return

    def _str_to_secs(self, time_period):
        # Convert a commonly used time period (e.g. '6hrs') to seconds.
        # Separate the numeric and alphabetic parts of the time period.
        time_n = time_period.lower().rstrip('abcdefghijklmnopqrstuvwxyz ')
        time_a = time_period.lower().lstrip('0123456789. ')

        # determine whether the user specified secs, mins, hours, days, weeks, months or years
        factor = None
        if time_a in ('sec', 'secs', 'second', 'seconds'):
            factor = 1
        elif time_a in ('min', 'mins', 'minute', 'minutes'):
            factor = 60
        elif time_a in ('hr', 'hrs', 'hour', 'hours'):
            factor = 60 * 60
        elif time_a in ('day', 'days'):
            factor = 60 * 60 * 24
        elif time_a in ('wk', 'wks', 'week', 'weeks'):
            factor = 60 * 60 * 24 * 7
        elif time_a in ('mon', 'mons', 'month', 'months'):
            factor = 60 * 60 * 24 * 30
        elif time_a in ('yr', 'yrs', 'year', 'years'):
            factor = 60 * 60 * 24 * 365
        if factor is None:
            raise BadRequest('Unrecognized time period: %s' % time_period)

        return float(time_n) * factor

    def on_quit(self):

        #Cancel the timer
        if hasattr(self, 'interval_timer_id'):
            self.ssclient.cancel_timer(self.interval_timer_id)

        super(VizTransformMatplotlibGraphs, self).on_quit()
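
Both Matplotlib transform examples resolve their output publishers by attribute name, so every stream name in process.publish_streams must correspond to a publisher attribute on the process. A minimal stand-in sketch of that getattr dispatch (FakePublisher and FakeTransform are hypothetical stand-ins, not the real publisher API):

# Hypothetical stand-ins illustrating the getattr-based publisher dispatch used above.
class FakePublisher(object):
    def __init__(self, name):
        self.name = name

    def publish(self, granule):
        print 'publishing to %s: %r' % (self.name, granule)

class FakeTransform(object):
    def __init__(self, stream_names):
        self.stream_names = stream_names
        for name in stream_names:
            setattr(self, name, FakePublisher(name))  # one publisher attribute per stream

    def publish_all(self, granule):
        for stream_name in self.stream_names:
            publisher = getattr(self, stream_name)  # same lookup as in recv_packet above
            publisher.publish(granule)

FakeTransform(['graph_image_param_dict']).publish_all({'plot': 'mpl_graphs'})
# publishing to graph_image_param_dict: {'plot': 'mpl_graphs'}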