Example #1
def sync_binlog_streams(mysql_conn, binlog_catalog, config, state):
    if binlog_catalog.streams:
        for stream in binlog_catalog.streams:
            write_schema_message(stream)

        with metrics.job_timer("sync_binlog"):
            binlog.sync_binlog_stream(mysql_conn, config, binlog_catalog.streams, state)
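
The write_schema_message helper used throughout these examples is not shown. A minimal sketch of how it could emit a Singer SCHEMA message, assuming the singer-python helpers and that key properties live in the stream's top-level metadata (the actual tap-mysql implementation may differ):

import singer
from singer import metadata

def write_schema_message(catalog_entry, bookmark_properties=None):
    # Key properties are read from the stream's top-level metadata entry (assumed layout).
    key_properties = metadata.to_map(catalog_entry.metadata).get((), {}).get('table-key-properties', [])

    singer.write_message(singer.SchemaMessage(
        stream=catalog_entry.stream,
        schema=catalog_entry.schema.to_dict(),
        key_properties=key_properties,
        bookmark_properties=bookmark_properties))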
Example #2
def sync_binlog_streams(mysql_conn, binlog_catalog, config, state):

    if binlog_catalog.streams:
        for stream in binlog_catalog.streams:
            write_schema_message(stream)

        with metrics.job_timer('sync_binlog'):
            binlog_streams_map = binlog.generate_streams_map(binlog_catalog.streams)
            binlog.sync_binlog_stream(mysql_conn, config, binlog_streams_map, state)
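
Example #2 builds a binlog streams map before syncing. The _run_binlog_sync examples further down read 'catalog_entry' and 'desired_columns' out of that map per tap_stream_id, so a hypothetical generate_streams_map could look roughly like this sketch (not the library's actual implementation):

def generate_streams_map(binlog_streams):
    # Key each selected stream by its tap_stream_id so binlog events can be matched
    # back to a catalog entry and to the columns that should be emitted.
    return {
        stream.tap_stream_id: {
            'catalog_entry': stream,
            'desired_columns': list(stream.schema.properties.keys()),
        }
        for stream in binlog_streams
    }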
Example #3
def do_sync_incremental(mysql_conn, catalog_entry, state, columns):
    LOGGER.info("Stream %s is using incremental replication", catalog_entry.stream)

    md_map = metadata.to_map(catalog_entry.metadata)
    replication_key = md_map.get((), {}).get('replication-key')

    if not replication_key:
        raise Exception(
            f"Cannot use INCREMENTAL replication for table ({catalog_entry.stream}) without a replication key.")

    write_schema_message(catalog_entry=catalog_entry,
                         bookmark_properties=[replication_key])

    incremental.sync_table(mysql_conn, catalog_entry, state, columns)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
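
How incremental.sync_table advances state between runs is not shown here. A rough sketch of the bookmark bookkeeping it is expected to perform, assuming singer-python's bookmark helpers (the bookmark key names here are assumptions, not necessarily what the module uses):

import singer

def _advance_replication_key(state, tap_stream_id, replication_key, max_value_seen):
    # Remember which column is the replication key and the latest value emitted, so the
    # next run can resume with a WHERE <replication_key> >= <bookmarked value> predicate.
    state = singer.write_bookmark(state, tap_stream_id, 'replication_key', replication_key)
    return singer.write_bookmark(state, tap_stream_id, 'replication_key_value', max_value_seen)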
Example #4
def do_sync_full_table(mysql_conn, catalog_entry, state, columns):
    LOGGER.info("Stream %s is using full table replication", catalog_entry.stream)

    write_schema_message(catalog_entry)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)

    full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)

    # Prefer initial_full_table_complete going forward
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, "version")

    state = singer.write_bookmark(
        state, catalog_entry.tap_stream_id, "initial_full_table_complete", True
    )

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
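
After do_sync_full_table finishes, the stream's entry in the Singer state has had 'version' cleared and 'initial_full_table_complete' set. Under the usual Singer state layout that leaves something roughly like this (the tap_stream_id is illustrative):

example_state = {
    'bookmarks': {
        'my_db-my_table': {
            'initial_full_table_complete': True,
        }
    }
}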
Example #5
def do_sync_historical_binlog(mysql_conn, catalog_entry, state, columns, use_gtid: bool, engine: str):
    binlog.verify_binlog_config(mysql_conn)

    if use_gtid and engine == MYSQL_ENGINE:
        binlog.verify_gtid_config(mysql_conn)

    is_view = common.get_is_view(catalog_entry)

    if is_view:
        raise Exception(f"Unable to replicate stream({catalog_entry.stream}) with binlog because it is a view.")

    log_file = singer.get_bookmark(state,
                                   catalog_entry.tap_stream_id,
                                   'log_file')

    log_pos = singer.get_bookmark(state,
                                  catalog_entry.tap_stream_id,
                                  'log_pos')

    gtid = None
    if use_gtid:
        gtid = singer.get_bookmark(state,
                                   catalog_entry.tap_stream_id,
                                   'gtid')

    max_pk_values = singer.get_bookmark(state,
                                        catalog_entry.tap_stream_id,
                                        'max_pk_values')

    write_schema_message(catalog_entry)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)

    if max_pk_values and ((use_gtid and gtid) or (log_file and log_pos)):
        LOGGER.info("Resuming initial full table sync for LOG_BASED stream %s", catalog_entry.tap_stream_id)
        full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)
    else:
        LOGGER.info("Performing initial full table sync for LOG_BASED stream %s", catalog_entry.tap_stream_id)

        state = singer.write_bookmark(state,
                                      catalog_entry.tap_stream_id,
                                      'initial_binlog_complete',
                                      False)

        current_log_file, current_log_pos = binlog.fetch_current_log_file_and_pos(mysql_conn)

        current_gtid = None
        if use_gtid:
            current_gtid = binlog.fetch_current_gtid_pos(mysql_conn, engine)

        state = singer.write_bookmark(state,
                                      catalog_entry.tap_stream_id,
                                      'version',
                                      stream_version)

        if full_table.pks_are_auto_incrementing(mysql_conn, catalog_entry):
            # We must save log_file, log_pos, gtid across FULL_TABLE syncs when using
            # an incrementing PK
            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_file',
                                          current_log_file)

            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_pos',
                                          current_log_pos)

            if current_gtid:
                state = singer.write_bookmark(state,
                                              catalog_entry.tap_stream_id,
                                              'gtid',
                                              current_gtid)

            full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)

        else:
            full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)
            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_file',
                                          current_log_file)

            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_pos',
                                          current_log_pos)

            if current_gtid:
                state = singer.write_bookmark(state,
                                              catalog_entry.tap_stream_id,
                                              'gtid',
                                              current_gtid)
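
The resume branch at the top of Example #5 keys off the bookmarks an interrupted initial sync leaves behind. Under the usual Singer state layout those bookmarks look roughly like this (all values are illustrative, and max_pk_values is presumably written by full_table.sync_table mid-sync):

example_state = {
    'bookmarks': {
        'my_db-my_table': {
            'version': 1672531200000,
            'initial_binlog_complete': False,
            'log_file': 'mysql-bin.000042',   # binlog coordinates captured before the full sync
            'log_pos': 154,
            'gtid': '3E11FA47-71CA-11E1-9E33-C80AA9429562:1-23',  # only present when use_gtid is on
            'max_pk_values': {'id': 100000},  # allows the full table sync to resume where it stopped
        }
    }
}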
Example #6
def do_sync_historical_binlog(mysql_conn, catalog_entry, state, columns):
    binlog.verify_binlog_config(mysql_conn)

    is_view = common.get_is_view(catalog_entry)

    if is_view:
        raise Exception(
            f"Unable to replicate stream({catalog_entry.stream}) with binlog because it is a view."
        )

    log_file = singer.get_bookmark(state, catalog_entry.tap_stream_id, "log_file")

    log_pos = singer.get_bookmark(state, catalog_entry.tap_stream_id, "log_pos")

    max_pk_values = singer.get_bookmark(
        state, catalog_entry.tap_stream_id, "max_pk_values"
    )

    write_schema_message(catalog_entry)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)

    if log_file and log_pos and max_pk_values:
        LOGGER.info(
            "Resuming initial full table sync for LOG_BASED stream %s",
            catalog_entry.tap_stream_id,
        )
        full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)

    else:
        LOGGER.info(
            "Performing initial full table sync for LOG_BASED stream %s",
            catalog_entry.tap_stream_id,
        )

        state = singer.write_bookmark(
            state, catalog_entry.tap_stream_id, "initial_binlog_complete", False
        )

        current_log_file, current_log_pos = binlog.fetch_current_log_file_and_pos(
            mysql_conn
        )
        state = singer.write_bookmark(
            state, catalog_entry.tap_stream_id, "version", stream_version
        )

        if full_table.pks_are_auto_incrementing(mysql_conn, catalog_entry):
            # We must save log_file and log_pos across FULL_TABLE syncs when using
            # an incrementing PK
            state = singer.write_bookmark(
                state, catalog_entry.tap_stream_id, "log_file", current_log_file
            )

            state = singer.write_bookmark(
                state, catalog_entry.tap_stream_id, "log_pos", current_log_pos
            )

            full_table.sync_table(
                mysql_conn, catalog_entry, state, columns, stream_version
            )

        else:
            full_table.sync_table(
                mysql_conn, catalog_entry, state, columns, stream_version
            )
            state = singer.write_bookmark(
                state, catalog_entry.tap_stream_id, "log_file", current_log_file
            )

            state = singer.write_bookmark(
                state, catalog_entry.tap_stream_id, "log_pos", current_log_pos
            )
Example #7
def _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state,
                     config: Dict):
    time_extracted = utils.now()

    rows_saved = 0
    events_skipped = 0

    current_log_file, current_log_pos = fetch_current_log_file_and_pos(
        mysql_conn)
    log_file = None
    log_pos = None

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state, binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(
                binlog_event.schema, binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped = events_skipped + 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.debug(
                        "Skipped %s events so far as they were not for selected tables; %s rows extracted",
                        events_skipped, rows_saved)

            else:

                # Compare the event's columns to the schema properties.
                # If a column no longer exists, the event refers to it with a placeholder like
                # __dropped_col_XY__; such columns should not count towards the difference.
                diff = set(filter(lambda k: not re.match(r'__dropped_col_\d+__', k),
                                  get_db_column_types(binlog_event).keys())).\
                    difference(catalog_entry.schema.properties.keys())

                # If there are additional cols in the event then run discovery and update the catalog
                if diff:
                    LOGGER.debug('Difference between event and schema: %s',
                                 diff)
                    LOGGER.info('Running discovery ... ')

                    # run discovery for the current table only
                    new_catalog_entry = discover_catalog(
                        mysql_conn, config.get('filter_dbs'),
                        catalog_entry.table).streams[0]

                    selected = {
                        k
                        for k, v in
                        new_catalog_entry.schema.properties.items()
                        if common.property_is_selected(new_catalog_entry, k)
                    }

                    # the new catalog entry's "stream" property is the table name; update it to
                    # match the result of the "resolve_catalog" function
                    new_catalog_entry.stream = tap_stream_id

                    # These are the columns we need to select
                    new_columns = desired_columns(selected,
                                                  new_catalog_entry.schema)

                    cols = set(new_catalog_entry.schema.properties.keys())

                    # drop unsupported properties from schema
                    for col in cols:
                        if col not in new_columns:
                            new_catalog_entry.schema.properties.pop(col, None)

                    # Add the _sdc_deleted_at col
                    new_columns = add_automatic_properties(
                        new_catalog_entry, list(new_columns))

                    # send the new schema to the target if it changed
                    if new_catalog_entry.schema.properties != catalog_entry.schema.properties:
                        write_schema_message(catalog_entry=new_catalog_entry)
                        catalog_entry = new_catalog_entry

                        # update this dictionary while we're at it
                        binlog_streams_map[tap_stream_id][
                            'catalog_entry'] = new_catalog_entry
                        binlog_streams_map[tap_stream_id][
                            'desired_columns'] = new_columns
                        columns = new_columns

                if isinstance(binlog_event, WriteRowsEvent):
                    rows_saved = handle_write_rows_event(
                        binlog_event, catalog_entry, state, columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, UpdateRowsEvent):
                    rows_saved = handle_update_rows_event(
                        binlog_event, catalog_entry, state, columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, DeleteRowsEvent):
                    rows_saved = handle_delete_rows_event(
                        binlog_event, catalog_entry, state, columns,
                        rows_saved, time_extracted)
                else:
                    LOGGER.debug(
                        "Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                        binlog_event.schema, binlog_event.table)

        # Update log_file and log_pos after every processed binlog event
        log_file = reader.log_file
        log_pos = reader.log_pos

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one, causing binlog replication to hang.
        if current_log_file == log_file and log_pos >= current_log_pos:
            break

        # Update singer bookmark and send STATE message periodically
        if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0) or
            (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            state = update_bookmarks(state, binlog_streams_map, log_file,
                                     log_pos)
            singer.write_message(
                singer.StateMessage(value=copy.deepcopy(state)))

    # Update the singer bookmark one last time to point it to the last processed binlog event
    if log_file and log_pos:
        state = update_bookmarks(state, binlog_streams_map, log_file, log_pos)
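
The update_bookmarks helper called in the loop above is not shown. A plausible sketch, under the assumption that it simply fans the current binlog position out to every stream covered by the binlog sync, would be the following; the GTID-aware variant used in Example #9 would additionally write a 'gtid' bookmark:

import singer

def update_bookmarks(state, binlog_streams_map, log_file, log_pos):
    # Record the same binlog coordinates for every selected stream.
    for tap_stream_id in binlog_streams_map:
        state = singer.write_bookmark(state, tap_stream_id, 'log_file', log_file)
        state = singer.write_bookmark(state, tap_stream_id, 'log_pos', log_pos)
    return state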
Example #8
def _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state,
                     config: Dict):
    time_extracted = utils.now()

    rows_saved = 0
    events_skipped = 0

    current_log_file, current_log_pos = fetch_current_log_file_and_pos(
        mysql_conn)
    log_file = None
    log_pos = None

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state, binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(
                binlog_event.schema, binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            desired_columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped = events_skipped + 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.debug(
                        "Skipped %s events so far as they were not for selected tables; %s rows extracted",
                        events_skipped, rows_saved)

            else:

                # Compare event's columns to the schema properties
                diff = set(get_db_column_types(binlog_event).keys()).\
                    difference(catalog_entry.schema.properties.keys())

                # If there are additional cols in the event then run discovery and update the catalog
                if diff:
                    # run discovery for the current table only
                    catalog_entry = discover_catalog(
                        mysql_conn, config.get('filter_dbs'),
                        catalog_entry.table).streams[0]

                    # the new catalog entry's "stream" property is the table name; update it to
                    # match the result of the "resolve_catalog" function
                    catalog_entry.stream = tap_stream_id
                    desired_columns = list(
                        catalog_entry.schema.properties.keys())

                    # Add the _sdc_deleted_at col
                    add_automatic_properties(catalog_entry, desired_columns)

                    # update this dictionary while we're at it
                    binlog_streams_map[tap_stream_id][
                        'catalog_entry'] = catalog_entry
                    binlog_streams_map[tap_stream_id][
                        'desired_columns'] = desired_columns

                    # send the new schema to the target
                    write_schema_message(catalog_entry=catalog_entry)

                if isinstance(binlog_event, WriteRowsEvent):
                    rows_saved = handle_write_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, UpdateRowsEvent):
                    rows_saved = handle_update_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, DeleteRowsEvent):
                    rows_saved = handle_delete_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)
                else:
                    LOGGER.debug(
                        "Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                        binlog_event.schema, binlog_event.table)

        # Update log_file and log_pos after every processed binlog event
        log_file = reader.log_file
        log_pos = reader.log_pos

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one, causing binlog replication to hang.
        if current_log_file == log_file and log_pos >= current_log_pos:
            break

        # Update singer bookmark and send STATE message periodically
        if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0) or
            (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            state = update_bookmarks(state, binlog_streams_map, log_file,
                                     log_pos)
            singer.write_message(
                singer.StateMessage(value=copy.deepcopy(state)))

    # Update the singer bookmark one last time to point it to the last processed binlog event
    if log_file and log_pos:
        state = update_bookmarks(state, binlog_streams_map, log_file, log_pos)
Example #9
def _run_binlog_sync(mysql_conn: MySQLConnection, reader: BinLogStreamReader,
                     binlog_streams_map: Dict, state: Dict, config: Dict,
                     end_log_file: str, end_log_pos: int):

    processed_rows_events = 0
    events_skipped = 0

    log_file = None
    log_pos = None
    gtid_pos = reader.auto_position  # initial GTID, set when the reader instance was created

    # A set of columns detected during the sync that should be ignored because their types are unsupported.
    # Caching them here avoids re-checking whether a column should be ignored over and over again.
    ignored_columns = set()

    # Exit from the loop when the reader either runs out of streams to return or we reach
    # the end position (which is Master's)
    for binlog_event in reader:

        # get reader current binlog file and position
        log_file = reader.log_file
        log_pos = reader.log_pos

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one, causing binlog replication to hang.
        if (log_file > end_log_file) or (end_log_file == log_file
                                         and log_pos >= end_log_pos):
            LOGGER.info(
                'BinLog reader (file: %s, pos:%s) has reached or exceeded end position, exiting!',
                log_file, log_pos)

            # A mass operation (inserts, updates, deletes) may start right after we fetch the master's
            # binlog file and position above, leaving that position behind the stream reader. That causes
            # data loss in the next run by skipping everything between end_log_file and log_pos,
            # so reset log_file/log_pos back to the master's position.
            log_file = end_log_file
            log_pos = end_log_pos

            break

        if isinstance(binlog_event, RotateEvent):
            LOGGER.debug('RotateEvent: log_file=%s, log_pos=%d',
                         binlog_event.next_binlog, binlog_event.position)

            state = update_bookmarks(state, binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position, gtid_pos)

        elif isinstance(binlog_event, (MariadbGtidEvent, GtidEvent)):
            gtid_pos = binlog_event.gtid

            LOGGER.debug('%s: gtid=%s', binlog_event.__class__.__name__,
                         gtid_pos)

            state = update_bookmarks(state, binlog_streams_map, log_file,
                                     log_pos, gtid_pos)

            # There is strange behavior happening when using GTID in the pymysqlreplication lib,
            # explained here: https://github.com/noplay/python-mysql-replication/issues/367
            # Fix: Updating the reader's auto-position to the newly encountered gtid means we won't have to restart
            # consuming binlog from old GTID pos when connection to server is lost.
            reader.auto_position = gtid_pos

        else:
            time_extracted = utils.now()

            tap_stream_id = common.generate_tap_stream_id(
                binlog_event.schema, binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped += 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.debug(
                        "Skipped %s events so far as they were not for selected tables; %s rows extracted",
                        events_skipped, processed_rows_events)
            else:
                # Compare event's columns to the schema properties
                diff = __get_diff_in_columns_list(
                    binlog_event, catalog_entry.schema.properties.keys(),
                    ignored_columns)

                # If there are additional cols in the event then run discovery if needed and update the catalog
                if diff:

                    LOGGER.info(
                        'Stream `%s`: Difference detected between event and schema: %s',
                        tap_stream_id, diff)

                    md_map = metadata.to_map(catalog_entry.metadata)

                    if not should_run_discovery(diff, md_map):
                        LOGGER.info(
                            'Stream `%s`: Not running discovery. Ignoring all detected columns in %s',
                            tap_stream_id, diff)
                        ignored_columns = ignored_columns.union(diff)

                    else:
                        LOGGER.info('Stream `%s`: Running discovery ... ',
                                    tap_stream_id)

                        # run discovery for the current table only
                        new_catalog_entry = discover_catalog(
                            mysql_conn, config.get('filter_dbs'),
                            catalog_entry.table).streams[0]

                        selected = {
                            k
                            for k, v in
                            new_catalog_entry.schema.properties.items()
                            if common.property_is_selected(
                                new_catalog_entry, k)
                        }

                        # the new catalog entry's "stream" property is the table name; update it to
                        # match the result of the "resolve_catalog" function
                        new_catalog_entry.stream = tap_stream_id

                        # These are the columns we need to select
                        new_columns = desired_columns(selected,
                                                      new_catalog_entry.schema)

                        cols = set(new_catalog_entry.schema.properties.keys())

                        # drop unsupported properties from schema
                        for col in cols:
                            if col not in new_columns:
                                new_catalog_entry.schema.properties.pop(
                                    col, None)

                        # Add the _sdc_deleted_at col
                        new_columns = add_automatic_properties(
                            new_catalog_entry, list(new_columns))

                        # send the new schema to the target if it changed
                        if new_catalog_entry.schema.properties != catalog_entry.schema.properties:
                            write_schema_message(
                                catalog_entry=new_catalog_entry)
                            catalog_entry = new_catalog_entry

                            # update this dictionary while we're at it
                            binlog_streams_map[tap_stream_id][
                                'catalog_entry'] = new_catalog_entry
                            binlog_streams_map[tap_stream_id][
                                'desired_columns'] = new_columns
                            columns = new_columns

                if isinstance(binlog_event, WriteRowsEvent):
                    processed_rows_events = handle_write_rows_event(
                        binlog_event, catalog_entry, state, columns,
                        processed_rows_events, time_extracted)

                elif isinstance(binlog_event, UpdateRowsEvent):
                    processed_rows_events = handle_update_rows_event(
                        binlog_event, catalog_entry, state, columns,
                        processed_rows_events, time_extracted)

                elif isinstance(binlog_event, DeleteRowsEvent):
                    processed_rows_events = handle_delete_rows_event(
                        binlog_event, catalog_entry, state, columns,
                        processed_rows_events, time_extracted)
                else:
                    LOGGER.debug(
                        "Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                        binlog_event.schema, binlog_event.table)

        # Update singer bookmark and send STATE message periodically
        if ((processed_rows_events
             and processed_rows_events % UPDATE_BOOKMARK_PERIOD == 0) or
            (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            state = update_bookmarks(state, binlog_streams_map, log_file,
                                     log_pos, gtid_pos)
            singer.write_message(
                singer.StateMessage(value=copy.deepcopy(state)))

    LOGGER.info('Processed %s rows', processed_rows_events)

    # Update the singer bookmark one last time to point to the last processed binlog event
    if log_file and log_pos:
        state = update_bookmarks(state, binlog_streams_map, log_file, log_pos,
                                 gtid_pos)
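
For completeness, the reader iterated in Examples #7-#9 is a python-mysql-replication BinLogStreamReader. A minimal construction, with illustrative connection settings and coordinates rather than tap-mysql's actual wiring, might look like:

from pymysqlreplication import BinLogStreamReader
from pymysqlreplication.event import RotateEvent
from pymysqlreplication.row_event import (DeleteRowsEvent, UpdateRowsEvent,
                                           WriteRowsEvent)

reader = BinLogStreamReader(
    connection_settings={'host': 'localhost', 'port': 3306,
                         'user': 'replica_user', 'password': 'secret'},
    server_id=12345,                      # must be unique among the server's replication clients
    log_file='mysql-bin.000042',          # resume from bookmarked coordinates
    log_pos=154,
    resume_stream=True,
    only_events=[RotateEvent, WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent],
)

for event in reader:
    pass                                  # events are processed as in the examples above

reader.close()

A GTID-based run (Example #9) would instead pass the bookmarked GTID via the reader's auto_position option, which is what reader.auto_position refers to in that example.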