def __init__(
    self,
    producer_name,
    team_name,
    expected_frequency_seconds,
    use_work_pool=False,
    dry_run=False,
    position_data_callback=None,
    monitoring_enabled=True,
    schema_id_list=None
):
    super(Producer, self).__init__(
        producer_name,
        team_name,
        expected_frequency_seconds,
        monitoring_enabled,
        dry_run=dry_run
    )
    self.use_work_pool = use_work_pool
    self.dry_run = dry_run
    self.position_data_callback = position_data_callback
    if schema_id_list is None:
        schema_id_list = []
    # Send initial producer registration messages
    self.registrar.register_tracked_schema_ids(schema_id_list)

    self.enable_meteorite = get_config().enable_meteorite
    self.enable_sensu = get_config().enable_sensu
    self.monitors = {}
    self._next_sensu_update = 0
    self._sensu_window = 0
    self._setup_monitors()
def __init__(
    self,
    schema_ref,
    file_paths,
    override_metadata,
    file_extension=None
):
    """
    Args:
        schema_ref (SchemaRef): SchemaRef to use for looking up metadata.
        file_paths (set([str])): File paths to use for bootstrapping.
        override_metadata (bool): If True, existing metadata (such as notes,
            categories, etc.) will be overwritten with the provided
            schema_ref; otherwise existing metadata will be preserved.
        file_extension (str): Must be specified by subclasses. A string
            specifying the file extension the subclass operates on, for
            example 'sql' or 'avsc'.
    """
    self.api = get_config().schematizer_client
    self.log = get_config().logger
    self.schema_ref = schema_ref
    self.override_metadata = override_metadata
    self.file_extension = file_extension
    self.file_paths = set([
        file_path for file_path in file_paths
        if self.is_correct_file_extension(file_path)
    ])
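# The is_correct_file_extension() helper referenced above is not included in
# this snippet; a minimal sketch of what it plausibly does, assuming it only
# compares the path suffix against self.file_extension (e.g. 'sql', 'avsc'):
def is_correct_file_extension(self, file_path):
    return file_path.endswith('.{}'.format(self.file_extension))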
def __init__(self):
    # Store these on the class since they should only ever be called once
    if _LibUUID._ffi is None or _LibUUID._libuuid is None:
        _LibUUID._ffi = FFI()

        # These definitions are from uuid.h
        _LibUUID._ffi.cdef("""
            typedef unsigned char uuid_t[16];

            void uuid_generate(uuid_t out);
            void uuid_generate_random(uuid_t out);
            void uuid_generate_time(uuid_t out);
        """)

        # By opening the library with dlopen, the compile step is skipped,
        # dodging a class of errors, since headers aren't needed, just the
        # installed library.
        _LibUUID._libuuid = _LibUUID._ffi.dlopen(
            ctypes.util.find_library("uuid")
        )

        get_config().logger.debug(
            "FastUUID Created - FFI: ({}), LIBUUID: ({})".format(
                _LibUUID._ffi,
                _LibUUID._libuuid
            )
        )

    # Keeping only one copy of this around does result in
    # pretty substantial performance improvements - in the 10,000s of
    # messages per second range
    self.output = _LibUUID._ffi.new("uuid_t")
def __init__(
    self,
    consumer_name,
    team_name,
    expected_frequency_seconds,
    topic_to_consumer_topic_state_map=None,
    consumer_source=None,
    force_payload_decode=True,
    auto_offset_reset='smallest',
    partitioner_cooldown=get_config().consumer_partitioner_cooldown_default,
    use_group_sha=get_config().consumer_use_group_sha_default,
    topic_refresh_frequency_seconds=get_config().topic_refresh_frequency_seconds,
    pre_rebalance_callback=None,
    post_rebalance_callback=None,
    fetch_offsets_for_topics=None,
    pre_topic_refresh_callback=None,
    cluster_name=None
):
    super(BaseConsumer, self).__init__(
        consumer_name,
        team_name,
        expected_frequency_seconds,
        monitoring_enabled=False
    )

    if ((topic_to_consumer_topic_state_map and consumer_source) or
            (not topic_to_consumer_topic_state_map and not consumer_source)):
        raise ValueError(
            "Exactly one of topic_to_consumer_topic_state_map "
            "or consumer_source must be specified"
        )

    self.consumer_source = consumer_source
    self.topic_to_consumer_topic_state_map = topic_to_consumer_topic_state_map
    self.force_payload_decode = force_payload_decode
    self.auto_offset_reset = auto_offset_reset
    self.partitioner_cooldown = partitioner_cooldown
    self.use_group_sha = use_group_sha
    self.running = False
    self.consumer_group = None
    self.pre_rebalance_callback = pre_rebalance_callback
    self.post_rebalance_callback = post_rebalance_callback
    self.fetch_offsets_for_topics = fetch_offsets_for_topics
    self.pre_topic_refresh_callback = pre_topic_refresh_callback
    self.cluster_name = self._set_cluster_name(cluster_name)
    self._refresh_timer = _ConsumerTick(
        refresh_time_seconds=topic_refresh_frequency_seconds
    )
    self._topic_to_reader_schema_map = self._get_topic_to_reader_schema_map(
        consumer_source
    )
    self._consumer_retry_policy = RetryPolicy(
        ExpBackoffPolicy(with_jitter=True),
        max_retry_count=get_config().consumer_max_offset_retry_count
    )
    self._envelope = Envelope()
    if self.topic_to_consumer_topic_state_map:
        self.cluster_type = self._determine_cluster_type_from_topics(
            self.topic_to_consumer_topic_state_map.keys()
        )
def __init__(self):
    for avail_uuid in self._avail_uuids:
        try:
            self._uuid_in_use = avail_uuid()
            break
        except Exception:
            get_config().logger.error(
                "libuuid is unavailable, falling back to the slower built-in "
                "uuid implementation. On ubuntu, apt-get install uuid-dev."
            )
def _setup_monitors(self):
    """This method sets up the meteorite monitor as well as the two sensu
    monitors, first for ttl, and second for delay. The ttl monitor tracks
    the health of the producer and upstream heartbeat. The delay monitor
    tracks whether the producer has fallen too far behind the upstream data.
    """
    try:
        from data_pipeline.tools.meteorite_wrappers import StatsCounter
        from data_pipeline.tools.sensu_alert_manager import SensuAlertManager
        from data_pipeline.tools.sensu_ttl_alerter import SensuTTLAlerter
    except ImportError:
        self.enable_meteorite = False
        self.enable_sensu = False
        return

    self.monitors["meteorite"] = StatsCounter(
        stat_counter_name=self.client_name,
        container_name=get_config().container_name,
        container_env=get_config().container_env
    )

    underscored_client_name = "_".join(self.client_name.split())
    # Sensu event dictionary parameters are described here:
    # http://pysensu-yelp.readthedocs.io/en/latest/index.html?highlight=send_event
    ttl_sensu_dict = {
        'name': "{0}_outage_check".format(underscored_client_name),
        'output': "{0} is back on track".format(self.client_name),
        'runbook': "y/datapipeline",
        'team': self.registrar.team_name,
        'page': get_config().sensu_page_on_critical,
        'status': 0,
        'ttl': "{0}s".format(get_config().sensu_ttl),
        'sensu_host': get_config().sensu_host,
        'source': "{0}_{1}".format(
            self.client_name,
            get_config().sensu_source
        ),
        'tip': "either the producer has died or there are no heartbeats upstream"
    }
    self._sensu_window = get_config().sensu_ping_window_seconds
    self.monitors["sensu_ttl"] = SensuTTLAlerter(
        sensu_event_info=ttl_sensu_dict,
        enable=self.enable_sensu
    )

    delay_sensu_dict = copy.deepcopy(ttl_sensu_dict)
    delay_sensu_dict.update({
        'name': "{0}_delay_check".format(underscored_client_name),
        'alert_after': get_config().sensu_alert_after_seconds,
    })
    disable_sensu = not self.enable_sensu
    SENSU_DELAY_ALERT_INTERVAL_SECONDS = 30
    self.monitors["sensu_delay"] = SensuAlertManager(
        SENSU_DELAY_ALERT_INTERVAL_SECONDS,
        self.client_name,
        delay_sensu_dict,
        get_config().max_producer_delay_seconds,
        disable=disable_sensu
    )
def __init__(self, producer_position_callback, dry_run=False):
    self.producer_position_callback = producer_position_callback
    self.dry_run = dry_run
    self.kafka_client = KafkaClient(get_config().cluster_config.broker_list)
    self.position_data_tracker = PositionDataTracker()
    self._reset_message_buffer()
    self.skip_messages_with_pii = get_config().skip_messages_with_pii
    self._publish_retry_policy = RetryPolicy(
        ExpBackoffPolicy(with_jitter=True),
        max_retry_count=get_config().producer_max_publish_retry_count
    )
    self._automatic_flush_enabled = True
def get_message(
    self,
    blocking=False,
    timeout=get_config().consumer_get_messages_timeout_default
):
    """Retrieve a single message. Returns None if no message could
    be retrieved within the timeout.

    Warning:
        If `blocking` is True and `timeout` is None this will block until
        a message is retrieved, potentially blocking forever. Please be
        absolutely sure this is what you are intending if you use these
        options!

    Args:
        blocking (boolean): Set to True to block while waiting for messages
            if the buffer has been depleted. Otherwise returns immediately
            if the buffer reaches depletion.
        timeout (double): Maximum time (in seconds) to wait if blocking is
            set to True. Set to None to wait indefinitely.

    Returns:
        (Optional(data_pipeline.message.Message)): Message object or None
            if no message could be retrieved.
    """
    return next(
        iter(self.get_messages(
            count=1,
            blocking=blocking,
            timeout=timeout
        )),
        None
    )
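# A minimal usage sketch (not from the source): `consumer` stands in for an
# already-running instance of a concrete Consumer subclass, and handle() is a
# hypothetical callback; a None result means nothing arrived within the timeout.
message = consumer.get_message(blocking=True, timeout=0.5)
if message is not None:
    handle(message)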
def setup(self, containers):
    self.kafka_client = containers.get_kafka_connection()
    self.cluster_config = get_config().cluster_config
    self.producer = YelpKafkaSimpleProducer(
        client=self.kafka_client,
        cluster_config=self.cluster_config
    )
def test_base_consumer_without_cluster_name(
    self,
    topic,
    consumer_init_kwargs
):
    with mock.patch(
        'yelp_kafka.discovery.get_kafka_cluster'
    ) as mock_get_kafka_cluster, mock.patch(
        'kafka_utils.util.config.ClusterConfig.__init__',
        return_value=None
    ) as mock_cluster_config_init:
        consumer = BaseConsumer(
            topic_to_consumer_topic_state_map={topic: None},
            auto_offset_reset='largest',
            **consumer_init_kwargs
        )
        consumer._region_cluster_config
        assert mock_get_kafka_cluster.call_count == 0
        config = get_config()
        mock_cluster_config_init.assert_called_once_with(
            type='standard',
            name='data_pipeline',
            broker_list=config.kafka_broker_list,
            zookeeper=config.kafka_zookeeper
        )
def apply_log_compaction(self, topics):
    self.log.info("Applying compaction settings on {} topics".format(
        len(topics)
    ))

    compacted_topics = []
    skipped_topics = []
    missed_topics = []

    cluster = get_config().cluster_config

    with ZK(cluster) as zk:
        for topic in topics:
            try:
                current_config = zk.get_topic_config(topic)
                if 'cleanup.policy' not in current_config['config']:
                    # if we already have the config set or there was a
                    # manual override we don't want to set again
                    current_config['config']['cleanup.policy'] = 'compact'
                    if not self.dry_run:
                        zk.set_topic_config(topic=topic, value=current_config)
                    compacted_topics.append(topic)
                else:
                    skipped_topics.append(topic)
            except NoNodeError:
                missed_topics.append(topic)

    self.log_results(
        compacted_topics=compacted_topics,
        skipped_topics=skipped_topics,
        missed_topics=missed_topics
    )
def _configure_tools(self):
    load_default_config(
        self.options.config_file,
        self.options.env_config_file
    )

    # We set up logging 'early' since we want it available for setup_topics
    self._setup_logging()

    self.kafka_client = KafkaClient(get_config().cluster_config.broker_list)

    self._setup_topics()
    if len(self.topic_to_offsets_map) == 0:
        self.option_parser.error("At least one topic must be specified.")

    if self.options.start_timestamp is not None and self.options.start_timestamp >= int(time.time()):
        self.option_parser.error("--start-timestamp should not be later than current time")

    if self.options.start_timestamp is not None and self.options.end_timestamp and (
        self.options.start_timestamp > self.options.end_timestamp
    ):
        self.option_parser.error("--end-timestamp must not be smaller than --start-timestamp")

    if self.options.all_fields:
        self.options.fields = self._public_message_field_names

    self._verify_offset_ranges()
def get_messages(
    self,
    count,
    blocking=False,
    timeout=get_config().consumer_get_messages_timeout_default
):
    """Retrieve a list of messages from the message buffer, optionally
    blocking until the requested number of messages has been retrieved.

    Note:
        The derived class must implement this method.

    Warning:
        If `blocking` is True and `timeout` is None this will block until
        the requested number of messages is retrieved, potentially blocking
        forever. Please be absolutely sure this is what you are intending
        if you use these options!

    Args:
        count (int): Number of messages to retrieve
        blocking (boolean): Set to True to block while waiting for messages
            if the buffer has been depleted. Otherwise returns immediately
            if the buffer reaches depletion.
        timeout (double): Maximum time (in seconds) to wait if blocking is
            set to True. Set to None to wait indefinitely.

    Returns:
        ([data_pipeline.message.Message]): List of Message objects with a
            maximum size of `count`, but may be smaller or empty depending
            on how many messages were retrieved within the timeout.
    """
    raise NotImplementedError
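# A companion usage sketch (not from the source): the same hypothetical
# `consumer` asking for a batch; the returned list may hold fewer than
# `count` messages if the timeout expires first.
messages = consumer.get_messages(count=10, blocking=True, timeout=5.0)
for message in messages:
    handle(message)  # hypothetical per-message callback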
def __init__(self, encryption_type, encryption_meta=None):
    key_location = get_config().key_location + 'key-{}.key'
    self.key = self._retrieve_key(encryption_type, key_location)
    self.encryption_meta = (
        encryption_meta or
        self.get_encryption_meta_by_encryption_type(encryption_type)
    )
def __init__(self, log_name):
    self.log_name = log_name
    load_package_config('/nail/srv/configs/data_pipeline_tools.yaml')
    self.config = get_config()
    self.log = logging.getLogger(self.log_name)
    self._setup_logging()
    self.schematizer = get_schematizer()
def create_kafka_topic(self, topic):
    """This method execs in the docker container because it's the only way to
    control how the topic is created.

    Args:
        topic (str): Topic name to create
    """
    conn = Containers.get_kafka_connection()
    if conn.has_metadata_for_topic(topic):
        return

    logger.info("Creating Fake Topic")
    if not isinstance(topic, str):
        raise ValueError("topic must be a str, it cannot be unicode")

    kafka_create_topic_command = (
        "$KAFKA_HOME/bin/kafka-topics.sh --create --zookeeper zk:2181 "
        "--replication-factor 1 --partition 1 --topic {topic}"
    ).format(topic=topic)

    Containers.exec_command(kafka_create_topic_command, self.project, 'kafka')

    logger.info("Waiting for topic")
    conn.ensure_topic_exists(
        topic,
        timeout=get_config().topic_creation_wait_timeout
    )
    conn.close()
    logger.info("Topic Exists")
    assert conn.has_metadata_for_topic(topic)
def schematizer():
    schematizer = get_schematizer()
    # schematizer is a Singleton. Rerun the ctor of Schematizer per module.
    schematizer._client = get_config().schematizer_client  # swaggerpy client
    schematizer._cache = _Cache()
    schematizer._avro_schema_cache = {}
    return schematizer
def __init__(self):
    super(BaseParseReplicationStream, self).__init__()
    self.db_connections = get_connection(
        config.env_config.topology_path,
        config.env_config.rbr_source_cluster,
        config.env_config.schema_tracker_cluster,
        config.env_config.rbr_state_cluster,
        config.env_config.rbr_source_cluster_topology_name,
    )
    self.schema_wrapper = SchemaWrapper(
        db_connections=self.db_connections,
        schematizer_client=get_schematizer()
    )
    self.register_dry_run = config.env_config.register_dry_run
    self.publish_dry_run = config.env_config.publish_dry_run
    self._running = True
    self._profiler_running = False
    self._changelog_mode = config.env_config.changelog_mode
    if get_config().kafka_producer_buffer_size > config.env_config.recovery_queue_size:
        # Printing here, since this executes *before* logging is
        # configured.
        sys.stderr.write(
            "Shutting down because kafka_producer_buffer_size was greater "
            "than recovery_queue_size"
        )
        sys.exit(1)
def PositionDataTracker():
    """Factory method for generating PositionDataTracker or subclasses
    """
    if get_config().merge_position_info_update:
        return _MergingPositionDataTracker()
    else:
        return _PositionDataTracker()
def __init__(self):
    super(BaseParseReplicationStream, self).__init__()
    self.db_connections = get_connection(
        config.env_config.topology_path,
        config.env_config.rbr_source_cluster,
        config.env_config.schema_tracker_cluster,
        config.env_config.rbr_state_cluster,
        is_avoid_internal_packages_set(),
        config.env_config.rbr_source_cluster_topology_name,
    )
    self.schema_wrapper = SchemaWrapper(
        db_connections=self.db_connections,
        schematizer_client=get_schematizer()
    )
    self.register_dry_run = config.env_config.register_dry_run
    self.publish_dry_run = config.env_config.publish_dry_run
    self._running = True
    self._profiler_running = False
    self._changelog_mode = config.env_config.changelog_mode
    if get_config().kafka_producer_buffer_size > config.env_config.recovery_queue_size:
        # Printing here, since this executes *before* logging is
        # configured.
        sys.stderr.write(
            "Shutting down because kafka_producer_buffer_size was greater "
            "than recovery_queue_size"
        )
        sys.exit(1)
def _region_cluster_config(self):
    """The ClusterConfig for the Kafka cluster to connect to. If
    cluster_name is not specified, it will default to the value set in
    Config.
    """
    if self.cluster_name:
        return discovery.get_kafka_cluster(
            cluster_type=self.cluster_type,
            client_id=self.client_name,
            cluster_name=self.cluster_name
        )
    else:
        return get_config().cluster_config
def log_result_urls(self, schema_results):
    self.log.info("Completed updating the following tables:")
    for schema_result in schema_results:
        self.log.info(
            '{host}/web/#/table?schema={namespace}&table={source}'.format(
                host='{}:{}'.format(
                    self.options.http_host,
                    get_config().schematizer_port
                ),
                namespace=schema_result.topic.source.namespace.name,
                source=schema_result.topic.source.name
            )
        )
def test_get_log_message(
    self,
    log_consumer_instance,
    publish_log_messages,
    log_message,
    log_topic
):
    with mock.patch(
        'yelp_kafka.discovery.get_region_cluster',
        return_value=get_config().cluster_config
    ):
        with log_consumer_instance as consumer:
            publish_log_messages(log_topic, log_message, count=1)
            asserter = ConsumerAsserter(
                consumer=consumer,
                expected_message=log_message
            )
            _message = consumer.get_message(blocking=True, timeout=TIMEOUT)
            asserter.assert_messages([_message], expected_count=1)
def _set_encryption_type_if_necessary(self):
    if self._encryption_type or not self._should_be_encrypted:
        return
    config_encryption_type = get_config().encryption_type
    if config_encryption_type is None:
        raise ValueError(
            "Encryption type must be set when the message is required to be encrypted."
        )
    self._encryption_type = config_encryption_type
    self._encryption_helper = EncryptionHelper(config_encryption_type)
    self._set_encryption_meta()
def config(cls):
    """Loads and decodes the
    :attr:`data_pipeline.config.Config.data_pipeline_teams_config_file_path`.

    TODO(justinc|DATAPIPE-348): Cache team config, dealing with invalidation
    when configuration changes.

    Returns:
        dict: team configuration
    """
    config_path = get_config().data_pipeline_teams_config_file_path
    return yaml.load(open(config_path).read())
def setup_capture_new_messages_consumer(topic):
    """Seeks to the tail of the topic then returns a function that can
    consume messages from that point.
    """
    kafka = KafkaClient(get_config().cluster_config.broker_list)
    group = str('data_pipeline_clientlib_test')
    consumer = SimpleConsumer(kafka, group, topic, max_buffer_size=_ONE_MEGABYTE)
    consumer.seek(0, 2)  # seek to tail, 0 is the offset, and 2 is the tail

    yield consumer

    kafka.close()
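# A usage sketch, assuming the generator above is wrapped with
# contextlib.contextmanager (the decorator is not shown in this snippet) and
# that `topic` already exists; publish_messages() is a hypothetical helper.
with setup_capture_new_messages_consumer(topic) as consumer:
    publish_messages(topic, count=1)
    new_messages = consumer.get_messages(count=1, block=True, timeout=1.0)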
def _region_cluster_config(self):
    """The ClusterConfig for the Kafka cluster to connect to. If
    cluster_name is not specified, it will default to the value set in
    Config.
    """
    # TODO [askatti#DATAPIPE-2137|2016-11-28] Use discovery methods after
    # adding kafkadiscovery container to make tests work
    # if self.cluster_name:
    #     return discovery.get_kafka_cluster(
    #         cluster_type=self.cluster_type,
    #         client_id=self.client_name,
    #         cluster_name=self.cluster_name
    #     )
    # else:
    #     return get_config().cluster_config
    return get_config().cluster_config
def _try_send_produce_requests(self, requests):
    # Either it throws exceptions and none of them succeeds, or it returns
    # responses of all the requests (success or fail response).
    try:
        return self.kafka_client.send_produce_request(
            payloads=requests,
            acks=get_config().kafka_client_ack_count,
            fail_on_error=False
        )
    except Exception:
        # Exceptions like KafkaUnavailableError, LeaderNotAvailableError,
        # UnknownTopicOrPartitionError, etc., are not controlled by the
        # `fail_on_error` flag and could be thrown from the kafka client,
        # failing all the requests. We will retry all the requests until
        # either all of them are successfully published or the maximum
        # retry criteria is exceeded.
        return []
def get_kafka_connection(cls, timeout_seconds=15):
    """Returns a kafka connection, waiting timeout_seconds for the container
    to come up.

    Args:
        timeout_seconds: Retry time (seconds) to get a kafka connection
    """
    end_time = time.time() + timeout_seconds
    logger.info("Getting connection to Kafka container on yocalhost")
    while end_time > time.time():
        try:
            return KafkaClient(get_config().cluster_config.broker_list)
        except KafkaUnavailableError:
            logger.info("Kafka not yet available, waiting...")
            time.sleep(0.1)
    raise KafkaUnavailableError()
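# A usage sketch (hypothetical call site): grab a client with a longer
# timeout, then close it once the test or tool is done with it.
kafka_client = Containers.get_kafka_connection(timeout_seconds=30)
try:
    do_something_with(kafka_client)  # hypothetical work
finally:
    kafka_client.close()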
def __init__(
    self,
    client_name,
    client_type,
    start_time=0,
    monitoring_enabled=True,
    dry_run=False
):
    self.client_name = client_name
    self.client_type = client_type
    self.monitoring_enabled = monitoring_enabled
    if not self.monitoring_enabled:
        return
    self.topic_to_tracking_info_map = {}
    self._monitoring_window_in_sec = get_config().monitoring_window_in_sec
    self.start_time = start_time
    self.producer = LoggingKafkaProducer(
        self._notify_messages_published,
        dry_run=dry_run
    )
    self.dry_run = dry_run
    self._last_msg_timestamp = None
def _wait_for_schematizer(self, timeout_seconds):
    # wait for schematizer to pass health check
    end_time = time.time() + timeout_seconds
    logger.info("Waiting for schematizer to pass health check")
    count = 0
    while end_time > time.time():
        time.sleep(0.1)
        try:
            r = requests.get(
                "http://{0}/v1/namespaces".format(
                    get_config().schematizer_host_and_port
                )
            )
            if 200 <= r.status_code < 300:
                count += 1
                if count >= 2:
                    return
        except Exception:
            count = 0
        finally:
            logger.info("Schematizer not yet available, waiting...")
    raise ContainerUnavailableError(project='schematizer', service='schematizer')
def config(self):
    return get_config()
def debug_log(line_lambda, exc_info=None):
    """This avoids unnecessary formatting of debug log string.

    More info in DATAPIPE-979
    """
    if get_config().logger.isEnabledFor(logging.DEBUG):
        get_config().logger.debug(line_lambda(), exc_info=exc_info)
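# A usage sketch: passing a lambda defers the (potentially expensive) string
# formatting until the logger is actually enabled for DEBUG; `message` here
# is a hypothetical local variable.
debug_log(lambda: "Buffered message: {!r}".format(message))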
def _is_ready_to_flush(self):
    time_limit = get_config().kafka_producer_flush_time_limit_seconds
    return (self._automatic_flush_enabled and (
        (time.time() - self.start_time) >= time_limit or
        self.message_buffer_size >= get_config().kafka_producer_buffer_size
    ))
def log_command():
    get_config().logger.debug("Message buffered: {}".format(repr(message)))
from collections import namedtuple

from kafka import create_message  # assumed import for create_message used below
from kafka import KafkaClient
from kafka.common import ProduceRequest

from data_pipeline._position_data_tracker import PositionDataTracker
from data_pipeline._producer_retry import RetryHandler
from data_pipeline._retry_util import ExpBackoffPolicy
from data_pipeline._retry_util import MaxRetryError
from data_pipeline._retry_util import Predicate
from data_pipeline._retry_util import retry_on_condition
from data_pipeline._retry_util import RetryPolicy
from data_pipeline.config import get_config
from data_pipeline.envelope import Envelope


_EnvelopeAndMessage = namedtuple("_EnvelopeAndMessage", ["envelope", "message"])
logger = get_config().logger


# prepare needs to be in the module top level so it can be serialized for
# multiprocessing
def _prepare(envelope_and_message):
    try:
        kwargs = {}
        if envelope_and_message.message.keys:
            kwargs['key'] = envelope_and_message.message.encoded_keys

        return create_message(
            envelope_and_message.envelope.pack(envelope_and_message.message),
            **kwargs
        )
    except:
        logger.exception('Prepare failed')
def schematizer_client(self):
    """TODO[DATAPIPE-396|clin]: change this to be private once this class
    is converted to the true schematizer client.
    """
    return get_config().schematizer_client
from collections import namedtuple
from uuid import UUID

from data_pipeline._avro_payload import _AvroPayload
from data_pipeline._encryption_helper import EncryptionHelper
from data_pipeline._fast_uuid import FastUUID
from data_pipeline.config import get_config
from data_pipeline.envelope import Envelope
from data_pipeline.helpers.lists import unlist
from data_pipeline.helpers.yelp_avro_store import _AvroStringStore
from data_pipeline.message_type import _ProtectedMessageType
from data_pipeline.message_type import MessageType
from data_pipeline.meta_attribute import MetaAttribute
from data_pipeline.schematizer_clientlib.schematizer import get_schematizer


logger = get_config().logger


KafkaPositionInfo = namedtuple('KafkaPositionInfo', [
    'offset',     # Offset of the message in the topic
    'partition',  # Partition of the topic the message was from
    'key'         # Key of the message, may be `None`
])


PayloadFieldDiff = namedtuple('PayloadFieldDiff', [
    'old_value',      # Value of the field before update
    'current_value'   # Value of the field after update
])