Example #1
 def test_schema_event(self, mock_db_connections, patch_stream):
     gtid_event = mock.Mock(spec=GtidEvent)
     schema_event = mock.Mock(spec=QueryEvent)
     patch_stream.return_value.fetchone.side_effect = [
         gtid_event, schema_event
     ]
     stream = LowLevelBinlogStreamReaderWrapper(
         mock_db_connections.source_database_config,
         mock_db_connections.tracker_database_config,
         GtidPosition(gtid="sid:5"))
     assert stream.peek() == gtid_event
     assert stream.pop() == gtid_event
     assert stream.peek() == schema_event
     assert stream.pop() == schema_event
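These tests rely on mock_db_connections and patch_stream fixtures that are not shown here. A minimal sketch of what such fixtures might look like, assuming pytest and assuming the low-level wrapper delegates to a pymysqlreplication-style stream whose fetchone() is stubbed; the patch target path and config values are illustrative assumptions, not the project's actual code:

import mock
import pytest


@pytest.fixture
def mock_db_connections():
    # Hypothetical fixture: an object exposing the two connection configs that
    # the wrapper expects; the real project builds these elsewhere, so these
    # values are placeholders.
    connections = mock.Mock()
    connections.source_database_config = {'host': 'source-db', 'port': 3306}
    connections.tracker_database_config = {'host': 'tracker-db', 'port': 3306}
    return connections


@pytest.fixture
def patch_stream():
    # Hypothetical patch target: the dotted path below is illustrative, not the
    # real module path. fetchone() is what the tests stub via side_effect.
    with mock.patch(
        'replication_handler.components.low_level_binlog_stream_reader_wrapper'
        '.BinLogStreamReader'
    ) as patched_stream:
        yield patched_stream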
Example #3
 def test_none_events(self, mock_db_connections, patch_stream):
     query_event = mock.Mock(spec=QueryEvent)
     patch_stream.return_value.fetchone.side_effect = [
         None,
         query_event,
     ]
     stream = LowLevelBinlogStreamReaderWrapper(
         mock_db_connections.source_database_config,
         mock_db_connections.tracker_database_config,
         LogPosition(
             log_pos=100,
             log_file="binlog.001",
         ))
     assert stream.peek() == query_event
     assert stream.pop() == query_event
Example #5
 def test_flattern_data_events(self, mock_db_connections, patch_stream):
     data_event = self._prepare_data_event('fake_table')
     gtid_event = mock.Mock(spec=GtidEvent)
     query_event = mock.Mock(spec=QueryEvent)
     patch_stream.return_value.fetchone.side_effect = [
         gtid_event,
         query_event,
         data_event,
     ]
     assert len(data_event.rows) == 3
     stream = LowLevelBinlogStreamReaderWrapper(
         mock_db_connections.source_database_config,
         mock_db_connections.tracker_database_config,
         LogPosition(
             log_pos=100,
             log_file="binlog.001",
         ))
     assert stream.peek() == gtid_event
     assert stream.pop() == gtid_event
     assert stream.pop() == query_event
     assert stream.pop().row == data_event.rows[0]
     assert stream.pop().row == data_event.rows[1]
     assert stream.pop().row == data_event.rows[2]
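The _prepare_data_event helper used above is not shown in this example. A plausible sketch under the assumption that it builds a pymysqlreplication-style row event carrying three rows; the WriteRowsEvent spec and the row payloads are assumptions for illustration, not the project's actual helper:

from pymysqlreplication.row_event import WriteRowsEvent


def _prepare_data_event(self, table):
    # Hypothetical helper: a mock row event with three rows, matching what the
    # test asserts on (len(data_event.rows) == 3 and per-row flattening).
    data_event = mock.Mock(spec=WriteRowsEvent)
    data_event.table = table
    data_event.rows = [
        {'after_values': {'id': 1}},
        {'after_values': {'id': 2}},
        {'after_values': {'id': 3}},
    ]
    return data_event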
Example #7
class SimpleBinlogStreamReaderWrapper(BaseBinlogStreamReaderWrapper):
    """ This class is a higher level abstraction on top of LowLevelBinlogStreamReaderWrapper,
    focusing on dealing with offsets, and providing the ability to iterate through
    events with position information attached.

    Args:
      source_database_config(dict): source database connection configuration.
      position(Position object): use to specify where the stream should resume.
      gtid_enabled(bool): use to indicate if gtid is enabled in the system.
    """
    def __init__(self,
                 source_database_config,
                 tracker_database_config,
                 position,
                 gtid_enabled=False):
        super(SimpleBinlogStreamReaderWrapper, self).__init__()
        self.stream = LowLevelBinlogStreamReaderWrapper(
            source_database_config, tracker_database_config, position)
        self.gtid_enabled = gtid_enabled
        self._upstream_position = position
        self._offset = 0
        self._set_sensu_alert_manager()
        self._set_meteorite_gauge_manager()
        self._seek(self._upstream_position.offset)

    @classmethod
    def is_meteorite_sensu_supported(cls):
        try:
            # TODO(DATAPIPE-1509|abrar): force_avoid_internal_packages is a
            # means of simulating the absence of a Yelp-internal package. All
            # references to force_avoid_internal_packages should be removed
            # from RH once we are completely ready for open source.
            if is_avoid_internal_packages_set():
                raise ImportError
            from data_pipeline.tools.meteorite_gauge_manager import MeteoriteGaugeManager  # NOQA
            from data_pipeline.tools.sensu_alert_manager import SensuAlertManager  # NOQA
            return True
        except ImportError:
            return False

    def _set_sensu_alert_manager(self):
        if not self.is_meteorite_sensu_supported():
            self.sensu_alert_manager = None
            return

        from data_pipeline.tools.sensu_alert_manager import SensuAlertManager

        sensu_result_dict = {
            'name': 'replication_handler_real_time_check',
            'output': 'Replication Handler has caught up with real time.',
            'runbook': ' y/replication_handler ',
            'status': 0,
            'team': 'bam',
            'page': False,
            'notification_email': '*****@*****.**',
            'check_every': '{time}s'.format(time=sensu_alert_interval_in_seconds),
            'alert_after': '5m',
            'ttl': '300s',
            'sensu_host': config.env_config.sensu_host,
            'source': config.env_config.sensu_source,
        }
        self.sensu_alert_manager = SensuAlertManager(
            sensu_alert_interval_in_seconds,
            service_name='Replication Handler',
            result_dict=sensu_result_dict,
            max_delay_seconds=config.env_config.max_delay_allowed_in_seconds,
            disable=config.env_config.disable_sensu,
        )

    def _set_meteorite_gauge_manager(self):
        if not self.is_meteorite_sensu_supported():
            self.meteorite_gauge_manager = None
            return

        from data_pipeline.tools.meteorite_gauge_manager import MeteoriteGaugeManager

        self.meteorite_gauge_manager = MeteoriteGaugeManager(
            meteorite_interval_in_seconds,
            stats_gauge_name='replication_handler_delay_seconds',
            container_name=config.env_config.container_name,
            container_env=config.env_config.container_env,
            disable=config.env_config.disable_meteorite,
            rbr_source_cluster=config.env_config.rbr_source_cluster)

    def __iter__(self):
        return self

    def next(self):
        """ This method implements the iteration functionality."""
        return self.pop()

    def _seek(self, offset):
        if offset is not None:
            self._point_stream_to(offset)

    def _point_stream_to(self, offset):
        """This method advances the internal dequeue to provided offset.
        """
        original_offset = offset
        while offset >= 0:
            self.pop()
            offset -= 1

        # Make sure that we skipped the correct number of events.
        log.info("self._offset is {}".format(self._offset))
        log.info("original_offset is {}".format(original_offset))
        assert self._offset == original_offset + 1

    def _is_position_update(self, event):
        if self.gtid_enabled:
            return isinstance(event, GtidEvent)
        else:
            return event.schema == HEARTBEAT_DB and hasattr(event, 'row')

    def _update_upstream_position(self, event):
        """If gtid_enabled and the next event is GtidEvent,
        we update the self._upstream_position with GtidPosition, if next event is
        not GtidEvent, we keep the current self._upstream_position, if not gtid_enabled,
        we update the self.upstream_position with LogPosition.
        TODO(cheng|DATAPIPE-172): We may need to skip duplicate heartbeats.
        """
        if self.gtid_enabled and isinstance(event, GtidEvent):
            self._upstream_position = GtidPosition(gtid=event.gtid)
        elif (
            not self.gtid_enabled and
            event.schema == HEARTBEAT_DB and
            hasattr(event, 'row')
        ):
            # row['after_values']['timestamp'] should be a datetime object
            # without tzinfo. We need to give it the local timezone.
            timestamp = self._add_tz_info_to_tz_naive_timestamp(
                event.row["after_values"]["timestamp"])
            if self.sensu_alert_manager and self.meteorite_gauge_manager:
                self.sensu_alert_manager.periodic_process(timestamp)
                self.meteorite_gauge_manager.periodic_process(timestamp)
            self._log_process(timestamp, event.log_file, event.log_pos)
            self._upstream_position = LogPosition(
                log_pos=event.log_pos,
                log_file=event.log_file,
                hb_serial=event.row["after_values"]["serial"],
                hb_timestamp=calendar.timegm(timestamp.utctimetuple()),
            )
        self._offset = 0

    def _add_tz_info_to_tz_naive_timestamp(self, timestamp):
        if timestamp.tzinfo is None:
            timestamp = timestamp.replace(tzinfo=tzlocal())
        return timestamp

    def _log_process(self, timestamp, log_file, log_pos):
        # Change the timezone of the timestamp to PST (local timezone in SF).
        now = datetime.datetime.now(tzutc())
        delay_seconds = (now - timestamp).total_seconds()
        log.info(
            "Processing timestamp is {timestamp}, delay is {delay_seconds} seconds, "
            "log position is {log_file}: {log_pos}".format(
                timestamp=timestamp.replace(tzinfo=pytz.timezone('US/Pacific')),
                log_file=log_file,
                log_pos=log_pos,
                delay_seconds=delay_seconds,
            )
        )

    def _refill_current_events(self):
        if not self.current_events:
            # If the site goes into read-only mode, there are only heartbeats,
            # so we should just update the position.
            while self._is_position_update(self.stream.peek()):
                self._update_upstream_position(self.stream.pop())
            event = self.stream.pop()
            replication_handler_event = ReplicationHandlerEvent(
                position=self._build_position(), event=event)
            self._offset += 1
            self.current_events.append(replication_handler_event)

    def _build_position(self):
        """ We need to instantiate a new position for each event."""
        if self.gtid_enabled:
            return GtidPosition(
                gtid=self._upstream_position.gtid,
                offset=self._offset,
            )
        else:
            return LogPosition(
                log_pos=self._upstream_position.log_pos,
                log_file=self._upstream_position.log_file,
                offset=self._offset,
                hb_serial=self._upstream_position.hb_serial,
                hb_timestamp=self._upstream_position.hb_timestamp,
            )
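
For context, a brief usage sketch of the wrapper above, assuming GTID is enabled and that source_config, tracker_config, and handle are placeholders defined elsewhere rather than part of the code shown:

# Resume the stream from a saved GTID and iterate events with positions attached.
stream = SimpleBinlogStreamReaderWrapper(
    source_config,                      # hypothetical source connection config
    tracker_config,                     # hypothetical tracker connection config
    GtidPosition(gtid="sid:10"),
    gtid_enabled=True,
)
for replication_handler_event in stream:
    event = replication_handler_event.event        # underlying binlog event
    position = replication_handler_event.position  # GtidPosition carrying the offset
    handle(event, position)                        # hypothetical downstream handler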