def test_schema_event(self, mock_db_connections, patch_stream):
    gtid_event = mock.Mock(spec=GtidEvent)
    schema_event = mock.Mock(spec=QueryEvent)
    patch_stream.return_value.fetchone.side_effect = [
        gtid_event,
        schema_event
    ]
    stream = LowLevelBinlogStreamReaderWrapper(
        mock_db_connections.source_database_config,
        mock_db_connections.tracker_database_config,
        GtidPosition(gtid="sid:5")
    )
    assert stream.peek() == gtid_event
    assert stream.pop() == gtid_event
    assert stream.peek() == schema_event
    assert stream.pop() == schema_event
def test_none_events(self, mock_db_connections, patch_stream):
    query_event = mock.Mock(spec=QueryEvent)
    patch_stream.return_value.fetchone.side_effect = [
        None,
        query_event,
    ]
    stream = LowLevelBinlogStreamReaderWrapper(
        mock_db_connections.source_database_config,
        mock_db_connections.tracker_database_config,
        LogPosition(
            log_pos=100,
            log_file="binlog.001",
        )
    )
    assert stream.peek() == query_event
    assert stream.pop() == query_event
def test_flattern_data_events(self, mock_db_connections, patch_stream):
    data_event = self._prepare_data_event('fake_table')
    gtid_event = mock.Mock(spec=GtidEvent)
    query_event = mock.Mock(spec=QueryEvent)
    patch_stream.return_value.fetchone.side_effect = [
        gtid_event,
        query_event,
        data_event,
    ]
    assert len(data_event.rows) == 3
    stream = LowLevelBinlogStreamReaderWrapper(
        mock_db_connections.source_database_config,
        mock_db_connections.tracker_database_config,
        LogPosition(
            log_pos=100,
            log_file="binlog.001",
        )
    )
    assert stream.peek() == gtid_event
    assert stream.pop() == gtid_event
    assert stream.pop() == query_event
    assert stream.pop().row == data_event.rows[0]
    assert stream.pop().row == data_event.rows[1]
    assert stream.pop().row == data_event.rows[2]
class SimpleBinlogStreamReaderWrapper(BaseBinlogStreamReaderWrapper):
    """This class is a higher level abstraction on top of
    LowLevelBinlogStreamReaderWrapper, focusing on dealing with offsets and
    providing the ability to iterate through events with position information
    attached.

    Args:
      source_database_config(dict): source database connection configuration.
      tracker_database_config(dict): tracker database connection configuration.
      position(Position object): used to specify where the stream should resume.
      gtid_enabled(bool): used to indicate whether gtid is enabled in the system.
    """

    def __init__(
        self,
        source_database_config,
        tracker_database_config,
        position,
        gtid_enabled=False
    ):
        super(SimpleBinlogStreamReaderWrapper, self).__init__()
        self.stream = LowLevelBinlogStreamReaderWrapper(
            source_database_config,
            tracker_database_config,
            position
        )
        self.gtid_enabled = gtid_enabled
        self._upstream_position = position
        self._offset = 0
        self._set_sensu_alert_manager()
        self._set_meteorite_gauge_manager()
        self._seek(self._upstream_position.offset)

    @classmethod
    def is_meteorite_sensu_supported(cls):
        try:
            # TODO(DATAPIPE-1509|abrar): Currently we have
            # force_avoid_internal_packages as a means of simulating an absence
            # of a yelp's internal package. All references to
            # force_avoid_internal_packages have to be removed from
            # RH after we are completely ready for open source.
            if is_avoid_internal_packages_set():
                raise ImportError
            from data_pipeline.tools.meteorite_gauge_manager import MeteoriteGaugeManager  # NOQA
            from data_pipeline.tools.sensu_alert_manager import SensuAlertManager  # NOQA
            return True
        except ImportError:
            return False

    def _set_sensu_alert_manager(self):
        if not self.is_meteorite_sensu_supported():
            self.sensu_alert_manager = None
            return
        from data_pipeline.tools.sensu_alert_manager import SensuAlertManager
        sensu_result_dict = {
            'name': 'replication_handler_real_time_check',
            'output': 'Replication Handler has caught up with real time.',
            'runbook': 'y/replication_handler',
            'status': 0,
            'team': 'bam',
            'page': False,
            'notification_email': '*****@*****.**',
            'check_every': '{time}s'.format(time=sensu_alert_interval_in_seconds),
            'alert_after': '5m',
            'ttl': '300s',
            'sensu_host': config.env_config.sensu_host,
            'source': config.env_config.sensu_source,
        }
        self.sensu_alert_manager = SensuAlertManager(
            sensu_alert_interval_in_seconds,
            service_name='Replication Handler',
            result_dict=sensu_result_dict,
            max_delay_seconds=config.env_config.max_delay_allowed_in_seconds,
            disable=config.env_config.disable_sensu,
        )

    def _set_meteorite_gauge_manager(self):
        if not self.is_meteorite_sensu_supported():
            self.meteorite_gauge_manager = None
            return
        from data_pipeline.tools.meteorite_gauge_manager import MeteoriteGaugeManager
        self.meteorite_gauge_manager = MeteoriteGaugeManager(
            meteorite_interval_in_seconds,
            stats_gauge_name='replication_handler_delay_seconds',
            container_name=config.env_config.container_name,
            container_env=config.env_config.container_env,
            disable=config.env_config.disable_meteorite,
            rbr_source_cluster=config.env_config.rbr_source_cluster
        )

    def __iter__(self):
        return self

    def next(self):
        """This method implements the iteration functionality."""
        return self.pop()

    def _seek(self, offset):
        if offset is not None:
            self._point_stream_to(offset)

    def _point_stream_to(self, offset):
        """This method advances the internal deque to the provided offset."""
        original_offset = offset
        while offset >= 0:
            self.pop()
            offset -= 1
        # Make sure that we skipped the correct number of events.
        log.info("self._offset is {}".format(self._offset))
        log.info("original_offset is {}".format(original_offset))
        assert self._offset == original_offset + 1

    def _is_position_update(self, event):
        if self.gtid_enabled:
            return isinstance(event, GtidEvent)
        else:
            return event.schema == HEARTBEAT_DB and hasattr(event, 'row')

    def _update_upstream_position(self, event):
        """If gtid_enabled and the next event is a GtidEvent, update
        self._upstream_position with a GtidPosition; if the next event is not a
        GtidEvent, keep the current self._upstream_position. If gtid is not
        enabled, update self._upstream_position with a LogPosition.

        TODO(cheng|DATAPIPE-172): We may need to skip duplicate heartbeats.
        """
        if self.gtid_enabled and isinstance(event, GtidEvent):
            self._upstream_position = GtidPosition(gtid=event.gtid)
        elif (
            not self.gtid_enabled and
            event.schema == HEARTBEAT_DB and
            hasattr(event, 'row')
        ):
            # row['after_values']['timestamp'] should be a datetime object without
            # tzinfo, so we need to give it a local timezone.
            timestamp = self._add_tz_info_to_tz_naive_timestamp(
                event.row["after_values"]["timestamp"]
            )
            if self.sensu_alert_manager and self.meteorite_gauge_manager:
                self.sensu_alert_manager.periodic_process(timestamp)
                self.meteorite_gauge_manager.periodic_process(timestamp)
            self._log_process(timestamp, event.log_file, event.log_pos)
            self._upstream_position = LogPosition(
                log_pos=event.log_pos,
                log_file=event.log_file,
                hb_serial=event.row["after_values"]["serial"],
                hb_timestamp=calendar.timegm(timestamp.utctimetuple()),
            )
        self._offset = 0

    def _add_tz_info_to_tz_naive_timestamp(self, timestamp):
        if timestamp.tzinfo is None:
            timestamp = timestamp.replace(tzinfo=tzlocal())
        return timestamp

    def _log_process(self, timestamp, log_file, log_pos):
        # Change the timezone of timestamp to PST (local timezone in SF).
        now = datetime.datetime.now(tzutc())
        delay_seconds = (now - timestamp).total_seconds()
        log.info(
            "Processing timestamp is {timestamp}, delay is {delay_seconds} seconds, "
            "log position is {log_file}: {log_pos}".format(
                timestamp=timestamp.replace(tzinfo=pytz.timezone('US/Pacific')),
                log_file=log_file,
                log_pos=log_pos,
                delay_seconds=delay_seconds,
            )
        )

    def _refill_current_events(self):
        if not self.current_events:
            # If the site goes into read-only mode, there are only heartbeats, so we
            # should just update the position.
            while self._is_position_update(self.stream.peek()):
                self._update_upstream_position(self.stream.pop())
            event = self.stream.pop()
            replication_handler_event = ReplicationHandlerEvent(
                position=self._build_position(),
                event=event
            )
            self._offset += 1
            self.current_events.append(replication_handler_event)

    def _build_position(self):
        """We need to instantiate a new position for each event."""
        if self.gtid_enabled:
            return GtidPosition(
                gtid=self._upstream_position.gtid,
                offset=self._offset
            )
        else:
            return LogPosition(
                log_pos=self._upstream_position.log_pos,
                log_file=self._upstream_position.log_file,
                offset=self._offset,
                hb_serial=self._upstream_position.hb_serial,
                hb_timestamp=self._upstream_position.hb_timestamp,
            )