def offline_consumer():
    consumer = None

    def set_config(self, new_configs):
        self.config = new_configs

    def add_config(self, pairs):
        for k, v in pairs.items():
            self.config[k] = v
    # Mock up a usable KafkaConsumer that doesn't use Kafka...
    with mock.patch('aet.kafka.KafkaConsumer.__init__') as MKafka:
        MKafka.return_value = None  # we need to ignore the call to super in __init__
        consumer = KafkaConsumer()
    consumer._set_config = set_config.__get__(consumer)
    consumer._add_config = add_config.__get__(consumer)
    # somehow the ADDITIONAL_CONFIG changes if you pass it directly.
    # Leave this deepcopy
    _configs = deepcopy(KafkaConsumer.ADDITIONAL_CONFIG)
    _configs['aether_emit_flag_required'] = True  # we don't need to test the all_pass state
    consumer._set_config(_configs)
    return consumer
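
A minimal sketch of how the offline fixture above might be exercised, assuming pytest; the test name and the asserted key are illustrative:

import pytest


@pytest.fixture
def consumer():
    # no Kafka broker needed; KafkaConsumer.__init__ is mocked out in offline_consumer()
    return offline_consumer()


def test_add_config_overrides_default(consumer):
    assert consumer.config['aether_emit_flag_required'] is True
    consumer._add_config({'aether_emit_flag_required': False})
    assert consumer.config['aether_emit_flag_required'] is False
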
class FirebaseJob(BaseJob):
    name = 'job'
    # Any type here needs to be registered in the API as APIServer._allowed_types
    _resources = [FirebaseInstance, Subscription]
    schema = schemas.FB_JOB

    public_actions = BaseJob.public_actions + [
        'get_logs',
        'list_topics',
        'list_subscribed_topics'
    ]
    # publicly available list of topics
    subscribed_topics: dict
    log_stack: list
    log: Callable  # each job instance has its own log object to keep log_stacks user-reportable

    consumer: KafkaConsumer = None
    # processing artifacts
    _indices: dict
    _schemas: dict
    _previous_topics: list
    _firebase: FirebaseInstance
    _subscriptions: List[Subscription]

    def _setup(self):
        self.subscribed_topics = {}
        self._schemas = {}
        self._subscriptions = []
        self._previous_topics = []
        self.log_stack = []
        self.log = callback_logger('JOB', self.log_stack, 100)
        self.group_name = f'{self.tenant}.firebaseconsumer.{self._id}'
        self.sleep_delay: float = 0.5
        self.report_interval: int = 100
        args = {k.lower(): v for k, v in KAFKA_CONFIG.copy().items()}
        args['group.id'] = self.group_name
        LOG.debug(args)
        self.consumer = KafkaConsumer(**args)

    def _job_firebase(self, config=None) -> FirebaseInstance:
        if config:
            fb: List[FirebaseInstance] = self.get_resources('firebase', config)
            if not fb:
                raise ConsumerHttpException('No Firebase associated with Job', 400)
            self._firebase = fb[0]
        return self._firebase

    def _job_subscriptions(self, config=None) -> List[Subscription]:
        if config:
            subs = self.get_resources('subscription', config)
            if not subs:
                raise ConsumerHttpException('No Subscriptions associated with Job', 400)
            self._subscriptions = subs
        return self._subscriptions

    def _job_subscription_for_topic(self, topic) -> Subscription:
        return next(iter(
            sorted([
                i for i in self._job_subscriptions()
                if i._handles_topic(topic, self.tenant)
            ])),
            None)

    def _test_connections(self, config):
        self._job_subscriptions(config)
        self._job_firebase(config).test_connection()  # raises CHE
        return True

    def _get_messages(self, config):
        try:
            self.log.debug(f'{self._id} checking configurations...')
            self._test_connections(config)
            subs = self._job_subscriptions()
            self._handle_new_subscriptions(subs)
            self.log.debug(f'Job {self._id} getting messages')
            return self.consumer.poll_and_deserialize(
                timeout=5,
                num_messages=1)  # max
        except ConsumerHttpException as cer:
            # don't fetch messages if we can't post them
            self.log.debug(f'Job not ready: {cer}')
            self.status = JobStatus.RECONFIGURE
            sleep(self.sleep_delay * 10)
            return []
        except Exception as err:
            import traceback
            traceback_str = ''.join(traceback.format_tb(err.__traceback__))
            self.log.critical(f'unhandled error: {str(err)} | {traceback_str}')
            raise err

    def _handle_new_subscriptions(self, subs):
        old_subs = list(sorted(set(self.subscribed_topics.values())))
        for sub in subs:
            pattern = sub.definition.topic_pattern
            # only allow regex on the end of patterns
            if pattern.endswith('*'):
                self.subscribed_topics[sub.id] = f'^{self.tenant}.{pattern}'
            else:
                self.subscribed_topics[sub.id] = f'{self.tenant}.{pattern}'
        new_subs = list(sorted(set(self.subscribed_topics.values())))
        _diff = list(set(old_subs).symmetric_difference(set(new_subs)))
        if _diff:
            self.log.info(f'{self.tenant} added subs to topics: {_diff}')
            self.consumer.subscribe(new_subs, on_assign=self._on_assign)

    def _handle_messages(self, config, messages):
        self.log.debug(f'{self.group_name} | reading {len(messages)} messages')
        MAX_SUBMIT = 50
        count = 0
        subs = {}
        firebase: FirebaseInstance = self._job_firebase()
        cfs: firestore.Client = firebase.get_cloud_firestore()
        batch = cfs.batch()
        for msg in messages:
            topic = msg.topic
            if topic not in subs:
                subs[topic] = self._job_subscription_for_topic(topic)
            schema = msg.schema
            if schema != self._schemas.get(topic):
                self.log.info(f'{self._id} Schema change on {topic}')
                self._update_topic(topic, schema)
                self._schemas[topic] = schema
            else:
                self.log.debug('Schema unchanged.')
            self.add_message(msg.value, topic, subs[topic], cfs, batch)
            count += 1
            if (count % MAX_SUBMIT) == 0:
                batch.commit()
                batch = cfs.batch()
        batch.commit()
        self.log.info(f'processed {count} {topic} docs in tenant {self.tenant}')

    # called when a subscription causes a new assignment to be given to the consumer
    def _on_assign(self, *args, **kwargs):
        # TODO check rules for topic in Firebase
        assignment = args[1]
        for _part in assignment:
            if _part.topic not in self._previous_topics:
                self.log.info(f'New topic to configure: {_part.topic}')
                self._apply_consumer_filters(_part.topic)
                self._previous_topics.append(_part.topic)

    def _apply_consumer_filters(self, topic):
        self.log.debug(f'{self._id} applying filter for new topic {topic}')
        subscription = self._job_subscription_for_topic(topic)
        if not subscription:
            self.log.error(f'Could not find subscription for topic {topic}')
            return
        try:
            opts = subscription.definition.topic_options
            _flt = opts.get('filter_required', False)
            if _flt:
                _filter_options = {
                    'check_condition_path': opts.get('filter_field_path', ''),
                    'pass_conditions': opts.get('filter_pass_values', []),
                    'requires_approval': _flt
                }
                self.log.info(_filter_options)
                self.consumer.set_topic_filter_config(
                    topic,
                    FilterConfig(**_filter_options)
                )
            mask_annotation = opts.get('masking_annotation', None)
            if mask_annotation:
                _mask_options = {
                    'mask_query': mask_annotation,
                    'mask_levels': opts.get('masking_levels', []),
                    'emit_level': opts.get('masking_emit_level')
                }
                self.log.info(_mask_options)
                self.consumer.set_topic_mask_config(
                    topic,
                    MaskConfig(**_mask_options)
                )
            self.log.info(f'Filters applied for topic {topic}')
        except AttributeError as aer:
            self.log.error(f'No topic options for {subscription.id}| {aer}')

    def _name_from_topic(self, topic):
        # str.lstrip removes characters, not a prefix; strip the tenant prefix exactly once
        return topic.replace(f'{self.tenant}.', '', 1)

    def _update_topic(self, topic, schema: Mapping[Any, Any]):
        self.log.debug(f'{self.tenant} is updating topic schema: {topic},'
                       f' firebase does not care...')

    def add_message(
        self,
        doc,
        topic: str,
        sub: Subscription,
        cfs: firestore.Client,
        batch: firestore_v1.batch.WriteBatch
    ):
        mode: helpers.SyncMode = sub.sync_mode()
        if mode in [helpers.SyncMode.NONE, helpers.SyncMode.CONSUME]:
            # these don't send data to FB
            return None
        topic_name = self._name_from_topic(topic)
        path = sub.path_for_topic(topic_name)
        if mode is helpers.SyncMode.SYNC:
            # todo:
            # check the hash..
            # needs_update = something()
            # if needs_update:
            #     return helpers.cfs_ref(cfs, path, doc.get('id'))
            pass
        elif mode is helpers.SyncMode.FORWARD:
            # we don't care about the hashes
            ref = helpers.cfs_ref(cfs, path, doc.get('id'))
            batch.set(ref, doc)

    # public
    def list_topics(self, *args, **kwargs):
        '''
        Get a list of topics to which the job can subscribe.
        A trailing wildcard is also allowed, e.g. Name* captures
        both Name1 and Name2.
        '''
        timeout = 5
        try:
            md = self.consumer.list_topics(timeout=timeout)
        except (KafkaException) as ker:
            raise ConsumerHttpException(str(ker) + f'@timeout: {timeout}', 500)
        topics = [
            str(t).split(f'{self.tenant}.')[1] for t in iter(md.topics.values())
            if str(t).startswith(self.tenant)
        ]
        return topics

    # public
    def list_subscribed_topics(self, *arg, **kwargs):
        '''
        A List of topics currently subscribed to by this job
        '''
        return list(self.subscribed_topics.values())

    # public
    def get_logs(self, *arg, **kwargs):
        '''
        A list of the last 100 log entries from this job in format
        [
            (timestamp, log_level, message),
            (timestamp, log_level, message),
            ...
        ]
        '''
        return self.log_stack[:]
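
A standalone sketch (not part of FirebaseJob) of the topic_pattern handling in _handle_new_subscriptions above: a trailing '*' produces an anchored regex subscription, anything else is used as a literal topic name; the tenant and patterns shown are illustrative.

def _topic_for_pattern(tenant: str, pattern: str) -> str:
    # only allow regex (a trailing wildcard) at the end of patterns
    if pattern.endswith('*'):
        return f'^{tenant}.{pattern}'
    return f'{tenant}.{pattern}'


assert _topic_for_pattern('acme', 'survey*') == '^acme.survey*'
assert _topic_for_pattern('acme', 'survey1') == 'acme.survey1'
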
class PipelinePubSub(object):
    def __init__(
            self,
            tenant: str,
            kafka_group=None,
            definition: Dict = None,
            zeebe: 'ZeebeInstance' = None  # noqa  type hint
    ):
        self.tenant = tenant
        self.kafka_group_id = kafka_group
        self.definition = definition
        self.kafka_consumer = None
        self.kafka_producer = self.__get_kafka_producer()
        self.source_type = None
        self.zeebe: 'ZeebeInstance' = zeebe  # noqa
        self.zeebe_connection: ZeebeConnection = None

    def _make_context(self, evt: Event):
        data = {}
        if self.definition.get('const'):
            data = {'const': self.definition['const']}
        if not self.zeebe_connection:
            self.zeebe_connection = (None if not self.zeebe else
                                     self.zeebe.get_connection())
        return PipelineContext(evt,
                               zeebe=self.zeebe_connection,
                               kafka_producer=self.kafka_producer,
                               data=data)

    def commit(self):
        if self.kafka_consumer:
            self.kafka_consumer.commit()

    def __has_kafka_setter(self) -> bool:
        if 'error_handling' in self.definition:
            return True
        return 'kafkamessage' in set(
            [stage.get('type') for stage in self.definition.get('stages', [])])

    def __get_kafka_producer(self) -> KafkaProducer:
        if self.__has_kafka_setter():
            return KafkaProducer(**get_kafka_admin_config())

    def __message_getter(self):
        if not self.source_type:
            if 'kafka_subscription' in self.definition:
                self.source_type = PipelineConnection.KAFKA
                self.__source = self.__make_kafka_getter()

            elif 'zeebe_subscription' in self.definition:
                self.source_type = PipelineConnection.ZEEBE
                self.__source = self.__make_zeebe_getter()

        return self.__source

    def __make_kafka_getter(self):
        args = {k.lower(): v for k, v in KAFKA_CONFIG.copy().items()}
        # the usual Kafka Client Configuration
        args['group.id'] = self.kafka_group_id
        args['auto.offset.reset'] = \
            self.definition['kafka_subscription'].get('auto_offset_reset', 'earliest')
        self.kafka_consumer = KafkaConsumer(**args)
        pattern = self.definition['kafka_subscription'].get(
            'topic_pattern', '*')
        # only allow regex on the end of patterns
        if pattern.endswith('*'):
            topic = f'^{self.tenant}.{pattern}'
        else:
            topic = f'{self.tenant}.{pattern}'
        self.kafka_consumer.subscribe([topic], on_assign=self._on_kafka_assign)

        def _getter() -> Iterable[KafkaMessage]:
            messages = self.kafka_consumer.poll_and_deserialize(timeout=5,
                                                                num_messages=1)
            for msg in messages:
                yield KafkaMessage(msg)

        return _getter

    def __make_zeebe_getter(self):
        workflow_id = self.definition.get('zeebe_subscription')
        if not self.zeebe_connection:
            self.zeebe_connection = (None if not self.zeebe else
                                     self.zeebe.get_connection())

        def _getter() -> Iterable[ZeebeJob]:
            jobs = self.zeebe_connection.job_iterator(workflow_id,
                                                      self.kafka_group_id,
                                                      max=50)
            c = 0
            for job in jobs:
                c += 1
                yield job
            LOG.debug(f'No more jobs available on {workflow_id}, got {c}')

        return _getter

    def _get_events(self) -> Iterable[Event]:
        _source = self.__message_getter()
        yield from _source()

    def get(self) -> Iterable[PipelineContext]:
        evts: Iterable[Event] = self._get_events()
        for evt in evts:
            yield self._make_context(evt)

    def test(self, evt: Event) -> PipelineContext:
        return self._make_context(evt)

    # called when a subscription causes a new assignment to be given to the consumer
    def _on_kafka_assign(self, *args, **kwargs):
        assignment = args[1]
        topics = set([_part.topic for _part in assignment])
        for topic in list(topics):
            self._apply_consumer_filters(topic)

    def _apply_consumer_filters(self, topic):
        try:
            opts = self.definition['kafka_subscription'].get(
                'topic_options', {})
            _flt = opts.get('filter_required', False)
            if _flt:
                _filter_options = {
                    'check_condition_path': opts.get('filter_field_path', ''),
                    'pass_conditions': opts.get('filter_pass_values', []),
                    'requires_approval': _flt
                }
                self.kafka_consumer.set_topic_filter_config(
                    topic, FilterConfig(**_filter_options))
            mask_annotation = opts.get('masking_annotation', None)
            if mask_annotation:
                _mask_options = {
                    'mask_query': mask_annotation,
                    'mask_levels': opts.get('masking_levels', []),
                    'emit_level': opts.get('masking_emit_level')
                }
                self.kafka_consumer.set_topic_mask_config(
                    topic, MaskConfig(**_mask_options))
        except AttributeError as aer:
            LOG.error(f'No topic options for {self.kafka_group_id} | {aer}')
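
A hedged usage sketch for PipelinePubSub, assuming a Kafka-sourced pipeline; the tenant, group id, definition values and the handle() callable are illustrative rather than part of the class:

definition = {
    'kafka_subscription': {
        'topic_pattern': 'surveys*',
        'auto_offset_reset': 'earliest',
        'topic_options': {'filter_required': False},
    },
}
pubsub = PipelinePubSub('acme', kafka_group='acme.pipeline.1', definition=definition)
for context in pubsub.get():   # PipelineContext wrapping each KafkaMessage
    handle(context)            # hypothetical downstream stage runner
    pubsub.commit()            # commit offsets once the message has been handled
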
class ESJob(BaseJob):
    name = 'job'
    # Any type here needs to be registered in the API as APIServer._allowed_types
    _resources = [ESInstance, LocalESInstance, KibanaInstance, LocalKibanaInstance, Subscription]
    schema = schemas.ES_JOB

    public_actions = BaseJob.public_actions + [
        'list_topics',
        'list_subscribed_topics',
        'list_assigned_topics'
    ]
    # publicly available list of topics
    subscribed_topics: dict
    log_stack: list
    log: Callable  # each job instance has its own log object to keep log_stacks user-reportable

    consumer: KafkaConsumer = None
    # processing artifacts
    _indices: dict
    _schemas: dict
    _processors: dict
    _doc_types: dict
    _routes: dict
    _previous_topics: list
    _kibana: KibanaInstance
    _elasticsearch: ESInstance
    _subscriptions: List[Subscription]

    def _setup(self):
        self.subscribed_topics = {}
        self._indices = {}
        self._schemas = {}
        self._processors = {}
        self._doc_types = {}
        self._routes = {}
        self._subscriptions = []
        self._previous_topics = []
        self.group_name = f'{self.tenant}.esconsumer.{self._id}'
        self.sleep_delay: float = 0.5
        self.report_interval: int = 100
        args = {k.lower(): v for k, v in KAFKA_CONFIG.copy().items()}
        args['group.id'] = self.group_name
        LOG.debug(args)
        self.consumer = KafkaConsumer(**args)

    def _job_elasticsearch(self, config=None) -> ESInstance:
        if config:
            es = self.get_resources('local_elasticsearch', config) + \
                self.get_resources('elasticsearch', config)
            if not es:
                raise ConsumerHttpException('No ES associated with Job', 400)
            self._elasticsearch = es[0]
        return self._elasticsearch

    def _job_kibana(self, config=None) -> KibanaInstance:
        if config:
            kibana = self.get_resources('local_kibana', config) + \
                self.get_resources('kibana', config)
            if not kibana:
                raise ConsumerHttpException('No Kibana associated with Job', 400)
            self._kibana = kibana[0]
        return self._kibana

    def _job_subscriptions(self, config=None) -> List[Subscription]:
        if config:
            subs = self.get_resources('subscription', config)
            if not subs:
                raise ConsumerHttpException('No Subscriptions associated with Job', 400)
            self._subscriptions = subs
        return self._subscriptions

    def _job_subscription_for_topic(self, topic):
        return next(iter(
            sorted([
                i for i in self._job_subscriptions()
                if i._handles_topic(topic, self.tenant)
            ])),
            None)

    def _test_connections(self, config):
        self._job_subscriptions(config)
        self._job_elasticsearch(config).test_connection()  # raises CHE
        self._job_kibana(config).test_connection()  # raises CHE
        return True

    def _get_messages(self, config):
        try:
            self.log.debug(f'{self._id} checking configurations...')
            self._test_connections(config)
            subs = self._job_subscriptions()
            self._handle_new_subscriptions(subs)
            self.log.debug(f'Job {self._id} getting messages')
            return self.consumer.poll_and_deserialize(
                timeout=5,
                num_messages=1)  # max
        except ConsumerHttpException as cer:
            # don't fetch messages if we can't post them
            self.log.debug(f'Job not ready: {cer}')
            self.status = JobStatus.RECONFIGURE
            sleep(self.sleep_delay * 10)
            return []
        except Exception as err:
            traceback_str = ''.join(traceback.format_tb(err.__traceback__))
            self.log.critical(f'unhandled error: {str(err)} | {traceback_str}')
            raise err

    def _handle_new_subscriptions(self, subs):
        old_subs = list(sorted(set(self.subscribed_topics.values())))
        for sub in subs:
            pattern = sub.definition.topic_pattern
            # only allow regex on the end of patterns
            if pattern.endswith('*'):
                self.subscribed_topics[sub.id] = f'^{self.tenant}.{pattern}'
            else:
                self.subscribed_topics[sub.id] = f'{self.tenant}.{pattern}'
        new_subs = list(sorted(set(self.subscribed_topics.values())))
        _diff = list(set(old_subs).symmetric_difference(set(new_subs)))
        if _diff:
            self.log.info(f'{self.tenant} added subs to topics: {_diff}')
            self.consumer.subscribe(new_subs, on_assign=self._on_assign)

    def _handle_messages(self, config, messages):
        self.log.debug(f'{self.group_name} | reading {len(messages)} messages')
        count = 0
        for msg in messages:
            topic = msg.topic
            schema = msg.schema
            if schema != self._schemas.get(topic):
                self.log.info(f'{self._id} Schema change on {topic}')
                self._update_topic(topic, schema)
                self._schemas[topic] = schema
            else:
                self.log.debug('Schema unchanged.')
            processor = self._processors[topic]
            index_name = self._indices[topic]['name']
            doc_type = self._doc_types[topic]
            route_getter = self._routes[topic]
            doc = processor.process(msg.value)
            self.submit(
                index_name,
                doc_type,
                doc,
                topic,
                route_getter,
            )
            count += 1
        self.log.info(f'processed {count} {topic} docs in tenant {self.tenant}')

    # called when a subscription causes a new assignment to be given to the consumer
    def _on_assign(self, *args, **kwargs):
        assignment = args[1]
        for _part in assignment:
            if _part.topic not in self._previous_topics:
                self.log.info(f'New topic to configure: {_part.topic}')
                self._apply_consumer_filters(_part.topic)
                self._previous_topics.append(_part.topic)

    def _apply_consumer_filters(self, topic):
        self.log.debug(f'{self._id} applying filter for new topic {topic}')
        subscription = self._job_subscription_for_topic(topic)
        if not subscription:
            self.log.error(f'Could not find subscription for topic {topic}')
            return
        try:
            opts = subscription.definition.topic_options
            _flt = opts.get('filter_required', False)
            if _flt:
                _filter_options = {
                    'check_condition_path': opts.get('filter_field_path', ''),
                    'pass_conditions': opts.get('filter_pass_values', []),
                    'requires_approval': _flt
                }
                self.log.info(_filter_options)
                self.consumer.set_topic_filter_config(
                    topic,
                    FilterConfig(**_filter_options)
                )
            mask_annotation = opts.get('masking_annotation', None)
            if mask_annotation:
                _mask_options = {
                    'mask_query': mask_annotation,
                    'mask_levels': opts.get('masking_levels', []),
                    'emit_level': opts.get('masking_emit_level')
                }
                self.log.info(_mask_options)
                self.consumer.set_topic_mask_config(
                    topic,
                    MaskConfig(**_mask_options)
                )
            self.log.info(f'Filters applied for topic {topic}')
        except AttributeError as aer:
            self.log.error(f'No topic options for {subscription.id}| {aer}')

    def _name_from_topic(self, topic):
        return topic.replace(f'{self.tenant}.', '', 1)

    def _update_topic(self, topic, schema: Mapping[Any, Any]):
        self.log.debug(f'{self.tenant} is updating topic: {topic}')
        subscription = self._job_subscription_for_topic(topic)
        if not subscription:
            self.log.error(f'Could not find subscription for topic {topic}')
            return
        node: Node = Node(schema)
        self.log.debug('getting index')
        es_index = index_handler.get_es_index_from_subscription(
            subscription.definition.get('es_options'),
            name=self._name_from_topic(topic),
            tenant=self.tenant.lower(),
            schema=node
        )
        self.log.debug(f'index {es_index}')
        alias_request = subscription.definition.get('es_options', {}).get('alias_name')
        if alias_request:
            alias = f'{alias_request}'.lower()
        else:
            alias = index_handler.get_alias_from_namespace(node.namespace)
        # Try to add the indices / ES alias
        es_instance = self._job_elasticsearch().get_session()
        if index_handler.es_index_changed(es_instance, es_index, self.tenant):
            self.log.debug(f'{self.tenant} updated schema for {topic}')
            self.log.debug(f'registering ES index:\n{json.dumps(es_index, indent=2)}')
            index_handler.update_es_index(
                es_instance,
                es_index,
                self.tenant,
                alias
            )
        conn: KibanaInstance = self._job_kibana()

        old_schema = self._schemas.get(topic)
        updated_kibana = index_handler.kibana_handle_schema_change(
            self.tenant.lower(),
            alias,
            old_schema,
            schema,
            subscription.definition,
            es_index,
            es_instance,
            conn
        )

        if updated_kibana:
            self.log.info(
                f'Registered kibana index {alias} for {self.tenant}'
            )
        else:
            self.log.info(
                f'Kibana index {alias} did not need update.'
            )

        self._indices[topic] = es_index
        self.log.debug(f'{self.tenant}:{topic} | idx: {es_index}')
        # update processor for type
        doc_type, instr = list(es_index['body']['mappings'].items())[0]
        self._doc_types[topic] = doc_type
        self._processors[topic] = ESItemProcessor(topic, instr, node)
        self._routes[topic] = self._processors[topic].create_route()

    def submit(self, index_name, doc_type, doc, topic, route_getter):
        es = self._job_elasticsearch().get_session()
        parent = doc.get('_parent', None)
        if parent:  # _parent field can only be in metadata apparently
            del doc['_parent']
        route = route_getter(doc)
        _id = doc.get('id')
        try:
            es.create(
                index=index_name,
                id=_id,
                routing=route,
                doc_type=doc_type,
                body=doc
            )
            self.log.debug(
                f'ES CREATE-OK [{index_name}:{self.group_name}]'
                f' -> {_id}')

        except (Exception, ESTransportError) as ese:
            self.log.debug('Could not create doc because of error: %s\nAttempting update.' % ese)
            try:
                route = self._routes[topic](doc)
                es.update(
                    index=index_name,
                    id=_id,
                    routing=route,
                    doc_type=doc_type,
                    body={'doc': doc}
                )
                self.log.debug(
                    f'ES UPDATE-OK [{index_name}:{self.group_name}]'
                    f' -> {_id}')
            except ESTransportError as ese2:
                self.log.info(
                    f'''conflict!, ignoring doc with id {_id}'''
                    f'{ese2}'
                )

    # public
    def list_topics(self, *args, **kwargs):
        '''
        Get a list of topics to which the job can subscribe.
        A trailing wildcard is also allowed, e.g. Name* captures
        both Name1 and Name2.
        '''
        timeout = 5
        try:
            md = self.consumer.list_topics(timeout=timeout)
        except (KafkaException) as ker:
            raise ConsumerHttpException(str(ker) + f'@timeout: {timeout}', 500)
        topics = [
            str(t).split(f'{self.tenant}.')[1] for t in iter(md.topics.values())
            if str(t).startswith(self.tenant)
        ]
        return topics

    # public
    def list_subscribed_topics(self, *arg, **kwargs):
        '''
        A List of topics currently subscribed to by this job
        '''
        return list(self.subscribed_topics.values())

    # public
    def list_assigned_topics(self, *arg, **kwargs):
        '''
        A List of topics currently assigned to this consumer
        '''
        return self._previous_topics[:]
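
A sketch of the subscription definition fields that the jobs above read; the key names come from _handle_new_subscriptions, _apply_consumer_filters and _update_topic, while every value shown is illustrative:

subscription_definition = {
    'topic_pattern': 'surveys*',          # trailing '*' becomes a regex subscription
    'topic_options': {
        'filter_required': True,          # builds a FilterConfig for the consumer
        'filter_field_path': 'approved',
        'filter_pass_values': [True],
        'masking_annotation': 'aetherMaskingLevel',   # builds a MaskConfig when present
        'masking_levels': ['public', 'private'],
        'masking_emit_level': 'public',
    },
    'es_options': {
        'alias_name': 'my_alias',         # optional override for the ES/Kibana alias
    },
}
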
class CKANJob(BaseJob):
    name = 'job'
    _resources = [CKANInstance, Subscription]
    schema = schemas.CKAN_JOB

    public_actions = BaseJob.public_actions + [
        'get_logs', 'list_topics', 'list_subscribed_topics'
    ]
    # publicly available list of topics
    subscribed_topics: dict
    log_stack: list
    log: Callable  # each job instance has its own log object to keep log_stacks user-reportable

    consumer: KafkaConsumer = None
    # processing artifacts
    _schemas: dict
    _doc_types: dict
    _routes: dict
    _previous_topics: list
    _ckan: CKANInstance
    _subscriptions: List[Subscription]

    def _setup(self):
        self.subscribed_topics = {}
        self._schemas = {}
        self._topic_fields = {}
        self._subscriptions = []
        self._previous_topics = []
        self.log_stack = []
        self.log = callback_logger('JOB', self.log_stack, 100)
        self.group_name = f'{self.tenant}.{self._id}'
        self.sleep_delay: float = 0.5
        self.report_interval: int = 100
        args = {k.lower(): v for k, v in KAFKA_CONFIG.copy().items()}
        args['group.id'] = self.group_name
        LOG.debug(args)
        self.consumer = KafkaConsumer(**args)
        self.rename_fields = {}
        self.bad_terms = []

    def _job_ckan(self, config=None) -> CKANInstance:
        if config:
            ckan = self.get_resources('ckan', config)
            if not ckan:
                raise ConsumerHttpException(
                    'No CKAN instance associated with Job', 400)
            self._ckan = ckan[0]
        return self._ckan

    def _job_subscriptions(self, config=None) -> List[Subscription]:
        if config:
            subs = self.get_resources('subscription', config)
            if not subs:
                raise ConsumerHttpException(
                    'No Subscriptions associated with Job', 400)
            self._subscriptions = subs
        return self._subscriptions

    def _job_subscription_for_topic(self, topic):
        return next(
            iter(
                sorted([
                    i for i in self._job_subscriptions()
                    if i._handles_topic(topic, self.tenant)
                ])), None)

    def _test_connections(self, config):
        self._job_subscriptions(config)
        self._job_ckan(config).test_connection()  # raises CHE
        return True

    def _get_messages(self, config):
        try:
            self.log.debug(f'{self._id} checking configurations...')
            self._test_connections(config)
            subs = self._job_subscriptions()
            self._handle_new_subscriptions(subs)
            self.log.debug(f'Job {self._id} getting messages')
            return self.consumer.poll_and_deserialize(timeout=5,
                                                      num_messages=1)  # max
        except ConsumerHttpException as cer:
            # don't fetch messages if we can't post them
            self.log.debug(f'Job not ready: {cer}')
            self.status = JobStatus.RECONFIGURE
            sleep(self.sleep_delay * 10)
            return []
        except Exception as err:
            import traceback
            traceback_str = ''.join(traceback.format_tb(err.__traceback__))
            self.log.critical(f'unhandled error: {str(err)} | {traceback_str}')
            raise err

    def _handle_new_subscriptions(self, subs):
        old_subs = list(sorted(set(self.subscribed_topics.values())))
        for sub in subs:
            pattern = sub.definition.topic_pattern
            # only allow regex on the end of patterns
            if pattern.endswith('*'):
                self.subscribed_topics[sub.id] = f'^{self.tenant}.{pattern}'
            else:
                self.subscribed_topics[sub.id] = f'{self.tenant}.{pattern}'
        new_subs = list(sorted(set(self.subscribed_topics.values())))
        _diff = list(set(old_subs).symmetric_difference(set(new_subs)))
        if _diff:
            self.log.info(f'{self.tenant} added subs to topics: {_diff}')
            self.consumer.subscribe(new_subs, on_assign=self._on_assign)

    def _handle_messages(self, config, messages):
        self.log.debug(f'{self.group_name} | reading {len(messages)} messages')
        ckan_instance = self._job_ckan(config=config)
        server_url = ckan_instance.definition.get('url')
        api_key = ckan_instance.definition.get('key')
        ckan_remote = RemoteCKAN(server_url, apikey=api_key)
        count = 0
        records = []
        topic = None
        resource = None  # messages may be empty, leaving nothing to submit
        for msg in messages:
            topic = msg.topic
            schema = msg.schema
            if schema != self._schemas.get(topic):
                self.log.info(f'{self._id} Schema change on {topic}')
                self._schemas[topic] = schema
                fields, definition_names = extract_fields_from_schema(schema)
                fields = prepare_fields_for_resource(fields, definition_names)
                self._topic_fields[topic] = fields
            else:
                self.log.debug('Schema unchanged.')
            records.append(msg.value)
            resource = self.submit_artefacts(topic, schema, ckan_remote)
            count += 1

        if resource:
            self._create_resource_in_datastore(resource, ckan_remote)
            self.send_data_to_datastore(self._topic_fields[topic], records,
                                        resource, ckan_remote)
        self.log.info(f'processed {count} {topic} docs')

    def submit_artefacts(self, topic, schema, ckan_remote):
        subscription = self._job_subscription_for_topic(topic)
        target_options = subscription.definition.get('target_options', {})
        target_dataset_metadata = CONSUMER_CONFIG.get('metadata', {})
        target_dataset_metadata.update(target_options.get('dataset_metadata', {}))
        dataset = self._create_dataset_in_ckan(target_dataset_metadata,
                                               ckan_remote)
        if dataset:
            resource_name = schema.get('name')
            return self._create_resource_in_ckan(resource_name, dataset,
                                                 ckan_remote)

    # called when a subscription causes a new
    # assignment to be given to the consumer
    def _on_assign(self, *args, **kwargs):
        assignment = args[1]
        for _part in assignment:
            if _part.topic not in self._previous_topics:
                self.log.info(f'New topic to configure: {_part.topic}')
                self._apply_consumer_filters(_part.topic)
                self._previous_topics.append(_part.topic)

    def _apply_consumer_filters(self, topic):
        self.log.debug(f'{self._id} applying filter for new topic {topic}')
        subscription = self._job_subscription_for_topic(topic)
        if not subscription:
            self.log.error(f'Could not find subscription for topic {topic}')
            return
        try:
            opts = subscription.definition.topic_options
            _flt = opts.get('filter_required', False)
            if _flt:
                _filter_options = {
                    'check_condition_path': opts.get('filter_field_path', ''),
                    'pass_conditions': opts.get('filter_pass_values', []),
                    'requires_approval': _flt
                }
                self.log.info(_filter_options)
                self.consumer.set_topic_filter_config(
                    topic, FilterConfig(**_filter_options))
            mask_annotation = opts.get('masking_annotation', None)
            if mask_annotation:
                _mask_options = {
                    'mask_query': mask_annotation,
                    'mask_levels': opts.get('masking_levels', []),
                    'emit_level': opts.get('masking_emit_level')
                }
                self.log.info(_mask_options)
                self.consumer.set_topic_mask_config(
                    topic, MaskConfig(**_mask_options))
            self.log.info(f'Filters applied for topic {topic}')
        except AttributeError as aer:
            self.log.error(f'No topic options for {subscription.id}| {aer}')

    def _create_dataset_in_ckan(self, dataset, ckan):
        dataset_name = dataset.get('name').lower()
        org_name = dataset.get('owner_org').lower()
        # ckan allows only lower case dataset names
        dataset.update({'name': dataset_name, 'owner_org': org_name})

        try:
            ckan.action.organization_show(id=org_name)
        except ckanapi_errors.NotFound:
            self.log.debug(f'Creating {org_name} organization')
            try:
                org = {
                    'name': org_name,
                    'state': 'active',
                }
                ckan.action.organization_create(**org)
                self.log.debug(f'Successfully created {org_name} organization')
            except ckanapi_errors.ValidationError as e:
                self.log.error(f'Cannot create organization {org_name} \
                    because of the following errors: {json.dumps(e.error_dict)}'
                               )
                return
        except ckanapi_errors.ValidationError as e:
            self.log.error(
                f'Could not find {org_name} organization. {json.dumps(e.error_dict)}'
            )
            return

        try:
            return ckan.action.package_show(id=dataset_name)
        except ckanapi_errors.NotFound:
            # Dataset does not exist, so continue with execution to create it.
            pass

        try:
            new_dataset = ckan.action.package_create(**dataset)
            self.log.debug(f'Dataset {dataset_name} created in CKAN portal.')
            return new_dataset
        except ckanapi_errors.NotAuthorized as e:
            self.log.error(f'Cannot create dataset {dataset_name}. {str(e)}')
        except ckanapi_errors.ValidationError as e:
            self.log.error(
                f'Cannot create dataset {dataset_name}. Payload is not valid. \
                    Check the following errors: {json.dumps(e.error_dict)}')

    def _create_resource_in_ckan(self, resource_name, dataset, ckan):

        try:
            resources = ckan.action.resource_search(
                query=f'name:{resource_name}')
            # todo: filter resource on dataset too
            if resources['count']:
                return resources['results'][0]
        except Exception:
            pass

        try:
            self.log.debug(f'Creating {resource_name} resource')
            resource = {
                'package_id': dataset.get('name'),
                'name': resource_name,
                'url_type': 'datastore',
            }
            new_resource = ckan.action.resource_create(**resource)
            self.log.debug(f'Successfully created {resource_name} resource')
            return new_resource
        except ckanapi_errors.NotAuthorized as e:
            self.log.error(f'Cannot create resource {resource_name}. {str(e)}')
        except ckanapi_errors.ValidationError as e:
            self.log.error(
                f'Cannot create resource {resource_name}. Payload is not valid. \
                    Check the following errors: {json.dumps(e.error_dict)}')

    def _create_resource_in_datastore(self, resource, ckan):
        payload = {
            'resource_id': resource.get('id'),
        }

        try:
            ckan.action.datastore_create(**payload)
        except ckanapi_errors.CKANAPIError as e:
            self.log.error(f'An error occurred while creating resource \
                {resource.get("name")} in Datastore. {str(e)}')

    def send_data_to_datastore(self, fields, records, resource, ckan):
        resource_id = resource.get('id')
        resource_name = resource.get('name')
        payload = {
            'id': resource_id,
            'limit': 1,
        }

        try:
            response = ckan.action.datastore_search(**payload)
        except ckanapi_errors.CKANAPIError as e:
            self.log.error(
                f'An error occurred while getting Datastore fields for resource \
                {resource_id}. {str(e)}')
            return

        new_fields = response.get('fields')
        new_fields[:] = [
            field for field in new_fields if field.get('id') != '_id'
        ]

        schema_changes = self.get_schema_changes(new_fields, fields)

        if len(new_fields) == 0 or len(schema_changes) > 0:
            self.log.info('Datastore detected schema changes')
            for new_field in schema_changes:
                new_fields.append(new_field)

            payload = {
                'resource_id': resource_id,
                'fields': new_fields,
            }

            try:
                ckan.action.datastore_create(**payload)
            except ckanapi_errors.CKANAPIError as cke:
                self.log.error(
                    f'An error occurred while adding new fields for resource \
                    {resource_name} in Datastore.')
                label = str(cke)
                self.log.error('ResourceType: {0} Error: {1}'.format(
                    resource_name, label))
                bad_fields = literal_eval(label).get('fields', None)
                if not isinstance(bad_fields, list):
                    raise ValueError('Bad field could not be identified.')
                issue = bad_fields[0]
                bad_term = str(issue.split(' ')[0]).strip("'").strip('"')
                self.bad_terms.append(bad_term)
                self.log.info('Recovery from error: bad field name %s' %
                              bad_term)
                self.log.info('Reverting %s' % (schema_changes, ))
                for new_field in schema_changes:
                    new_fields.remove(new_field)
                return self.send_data_to_datastore(fields, records, resource,
                                                   ckan)

        records = self.convert_item_to_array(records, new_fields)

        payload = {
            'resource_id': resource_id,
            'method': 'insert',
            'records': records,
        }

        try:
            ckan.action.datastore_upsert(**payload)
            self.log.info(f'Updated resource {resource_id} in {ckan.address}.')
        except ckanapi_errors.CKANAPIError as cke:
            self.log.error(
                f'An error occurred while inserting data into resource {resource_name}'
            )
            self.log.error(f'ResourceType: {resource} Error: {str(cke)}')

    def get_schema_changes(self, schema, fields):
        ''' Only check if new field has been added. '''

        new_fields = []

        for field in fields:
            field_found = False

            for schema_field in schema:
                if field.get('id') == schema_field.get('id'):
                    field_found = True
                    break

            if not field_found:
                if field.get('id') in self.bad_terms:
                    new_fields.append(self.rename_field(field))
                else:
                    new_fields.append(field)

        return new_fields

    def rename_field(self, field):
        bad_name = field.get('id')
        new_name = 'ae' + bad_name
        self.rename_fields[bad_name] = new_name
        field['id'] = new_name
        return field

    def convert_item_to_array(self, records, new_fields):
        ''' If a field is of type array, and the value for it contains a
        primitive type, then convert it to an array of that primitive type.

        This mutation is required for all records, otherwise CKAN will raise
        an exception.

        Example:
            For given field which is of type array of integers
            {'type': '_int', 'id': 'scores'}
            Original record {'scores': 10}
            Changed record {'scores': [10]}
        '''

        array_fields = []
        records = records[:]

        for field in new_fields:
            if field.get('type').startswith('_'):
                array_fields.append(field.get('id'))

        for record in records:
            for key, value in list(record.items()):  # snapshot; keys may be deleted below
                if self.bad_terms:
                    name = self.rename_fields.get(key, key)
                    if name != key:
                        del record[key]
                else:
                    name = key
                if key in array_fields:
                    record[name] = [value]
                else:
                    record[name] = value

        return records

    # public
    def list_topics(self, *args, **kwargs):
        '''
        Get a list of topics to which the job can subscribe.
        A trailing wildcard is also allowed, e.g. Name* captures
        both Name1 and Name2.
        '''
        timeout = 5
        try:
            md = self.consumer.list_topics(timeout=timeout)
        except (KafkaException) as ker:
            raise ConsumerHttpException(str(ker) + f'@timeout: {timeout}', 500)
        topics = [
            str(t).split(f'{self.tenant}.')[1]
            for t in iter(md.topics.values()) if str(t).startswith(self.tenant)
        ]
        return topics

    # public
    def list_subscribed_topics(self, *arg, **kwargs):
        '''
        A List of topics currently subscribed to by this job
        '''
        return list(self.subscribed_topics.values())

    # public
    def get_logs(self, *arg, **kwargs):
        '''
        A list of the last 100 log entries from this job in format
        [
            (timestamp, log_level, message),
            (timestamp, log_level, message),
            ...
        ]
        '''
        return self.log_stack[:]