Example #1
def do_sync_full_table(mysql_conn, catalog_entry, state, columns):
    LOGGER.info("Stream %s is using full table replication",
                catalog_entry.stream)

    write_schema_message(catalog_entry)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id,
                                               state)

    full_table.sync_table(mysql_conn, catalog_entry, state, columns,
                          stream_version)

    # Prefer initial_full_table_complete going forward
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, 'version')

    state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                  'initial_full_table_complete', True)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
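Both this example and Example #10 call a write_schema_message helper that is not shown here; a minimal sketch of one plausible shape, assuming a hypothetical get_key_properties helper that reads the key properties from catalog metadata:

def write_schema_message(catalog_entry, bookmark_properties=None):
    # Hypothetical sketch: emit a SCHEMA message for this catalog entry.
    key_properties = common.get_key_properties(catalog_entry)  # assumed helper
    singer.write_message(singer.SchemaMessage(
        stream=catalog_entry.stream,
        schema=catalog_entry.schema.to_dict(),
        key_properties=key_properties,
        bookmark_properties=bookmark_properties or []))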
Example #2
def sync_binlog_stream(mysql_conn, config, binlog_streams, state):
    binlog_streams_map = generate_streams_map(binlog_streams)

    for tap_stream_id in binlog_streams_map.keys():
        common.whitelist_bookmark_keys(BOOKMARK_KEYS, tap_stream_id, state)

    log_file, log_pos = calculate_bookmark(mysql_conn, binlog_streams_map,
                                           state)

    verify_log_file_exists(mysql_conn, log_file, log_pos)

    if config.get("server_id"):
        server_id = int(config.get("server_id"))
        LOGGER.info("Using provided server_id=%s", server_id)
    else:
        server_id = fetch_server_id(mysql_conn)
        LOGGER.info("No server_id provided, will use global server_id=%s",
                    server_id)

    connection_wrapper = make_connection_wrapper(config)

    reader = BinLogStreamReader(
        connection_settings={},
        server_id=server_id,
        slave_uuid="stitch-slave-{}".format(server_id),
        log_file=log_file,
        log_pos=log_pos,
        resume_stream=True,
        only_events=[
            RotateEvent, WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent
        ],
        pymysql_wrapper=connection_wrapper,
    )

    LOGGER.info("Starting binlog replication with log_file=%s, log_pos=%s",
                log_file, log_pos)

    try:
        _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state)
    finally:
        # BinLogStreamReader doesn't implement the context manager protocol,
        # so close it explicitly. Constructing it before the try block keeps
        # `reader` defined when the finally clause runs.
        reader.close()

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #3
def sync_traditional_stream(conn_config, stream, state, sync_method, end_scn):
   LOGGER.info("Beginning sync of stream(%s) with sync method(%s)", stream.tap_stream_id, sync_method)
   md_map = metadata.to_map(stream.metadata)
   desired_columns = [c for c in stream.schema.properties.keys() if common.should_sync_column(md_map, c)]
   desired_columns.sort()
   if len(desired_columns) == 0:
      LOGGER.warning('There are no columns selected for stream %s, skipping it', stream.tap_stream_id)
      return state

   if sync_method == 'full':
      LOGGER.info("Stream %s is using full_table replication", stream.tap_stream_id)
      state = singer.set_currently_syncing(state, stream.tap_stream_id)
      common.send_schema_message(stream, [])
      if md_map.get((), {}).get('is-view'):
         state = full_table.sync_view(conn_config, stream, state, desired_columns)
      else:
         state = full_table.sync_table(conn_config, stream, state, desired_columns)
   elif sync_method == 'log_initial':
      #start off with full-table replication
      state = singer.set_currently_syncing(state, stream.tap_stream_id)
      LOGGER.info("stream %s is using log_miner. will use full table for first run", stream.tap_stream_id)

      state = singer.write_bookmark(state, stream.tap_stream_id, 'scn', end_scn)

      common.send_schema_message(stream, [])
      state = full_table.sync_table(conn_config, stream, state, desired_columns)
   elif sync_method == 'log_initial_interrupted':
      LOGGER.info("Initial stage of full table sync was interrupted. resuming...")
      state = singer.set_currently_syncing(state, stream.tap_stream_id)
      common.send_schema_message(stream, [])
      state = full_table.sync_table(conn_config, stream, state, desired_columns)
   elif sync_method == 'incremental':
      state = singer.set_currently_syncing(state, stream.tap_stream_id)
      state = do_sync_incremental(conn_config, stream, state, desired_columns)

   else:
      raise Exception("unknown sync method {} for stream {}".format(sync_method, stream.tap_stream_id))

   state = singer.set_currently_syncing(state, None)
   singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
   return state
Example #4
    def test_round_trip(self):

        record_message = singer.RecordMessage(
            record={'name': 'foo'},
            stream='users')

        schema_message = singer.SchemaMessage(
            stream='users',
            key_properties=['name'],
            schema={'type': 'object',
                    'properties': {
                        'name': {'type': 'string'}}})

        state_message = singer.StateMessage(value={'seq': 1})
        
        self.assertEqual(record_message,
                         singer.parse_message(singer.format_message(record_message)))
        self.assertEqual(schema_message,
                         singer.parse_message(singer.format_message(schema_message)))
        self.assertEqual(state_message,
                         singer.parse_message(singer.format_message(state_message)))
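
        # Sketch (an assumption, not part of the original test): the same
        # round-trip property should hold for ACTIVATE_VERSION messages.
        activate_message = singer.ActivateVersionMessage(
            stream='users',
            version=1)
        self.assertEqual(activate_message,
                         singer.parse_message(singer.format_message(activate_message)))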
Example #5
def sync_traditional_stream(conn_config, stream, state, sync_method):
    LOGGER.info("Beginning sync of stream(%s) with sync method(%s)", stream['tap_stream_id'], sync_method)
    md_map = metadata.to_map(stream['metadata'])
    conn_config['dbname'] = md_map.get(()).get('database-name')
    desired_columns = [c for c in stream['schema']['properties'].keys() if should_sync_column(md_map, c)]
    desired_columns.sort()

    if len(desired_columns) == 0:
        LOGGER.warning('There are no columns selected for stream %s, skipping it', stream['tap_stream_id'])
        return state

    register_type_adapters(conn_config)

    if sync_method == 'full':
        state = singer.set_currently_syncing(state, stream['tap_stream_id'])
        state = do_sync_full_table(conn_config, stream, state, desired_columns, md_map)
    elif sync_method == 'incremental':
        state = singer.set_currently_syncing(state, stream['tap_stream_id'])
        state = do_sync_incremental(conn_config, stream, state, desired_columns, md_map)
    elif sync_method == 'logical_initial':
        state = singer.set_currently_syncing(state, stream['tap_stream_id'])
        end_lsn = logical_replication.fetch_current_lsn(conn_config)
        LOGGER.info("Performing initial full table sync")
        state = singer.write_bookmark(state, stream['tap_stream_id'], 'lsn', end_lsn)

        sync_common.send_schema_message(stream, [])
        state = full_table.sync_table(conn_config, stream, state, desired_columns, md_map)
        state = singer.write_bookmark(state, stream['tap_stream_id'], 'xmin', None)
    elif sync_method == 'logical_initial_interrupted':
        state = singer.set_currently_syncing(state, stream['tap_stream_id'])
        LOGGER.info("Initial stage of full table sync was interrupted. resuming...")
        sync_common.send_schema_message(stream, [])
        state = full_table.sync_table(conn_config, stream, state, desired_columns, md_map)
    else:
        raise Exception("unknown sync method {} for stream {}".format(sync_method, stream['tap_stream_id']))

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state
Example #6
def sync_table(conn, catalog_entry, state):
    metadata = singer_metadata.to_map(catalog_entry.metadata)
    LOGGER.info("Beginning sync for {}.".format(catalog_entry.stream))

    entity = catalog_entry.tap_stream_id
    if not entity:
        return

    start = get_start(entity, state)
    formatted_start = dateutil.parser.parse(start).strftime(DATETIME_FMT)
    params = {
        'start': formatted_start,
    }

    with metrics.record_counter(None) as counter:
        counter.tags['app'] = singer_metadata.get(metadata, tuple(),
                                                  "tap-quickbase.app_id")
        counter.tags['table'] = catalog_entry.table

        extraction_time = singer_utils.now()
        for rows_saved, row in enumerate(
                gen_request(conn, catalog_entry, params)):
            counter.increment()
            schema_dict = catalog_entry.schema.to_dict()
            rec = transform_bools(row, schema_dict)
            rec = transform_datetimes(rec, schema_dict, catalog_entry.stream)
            rec = singer.transform(
                rec, schema_dict,
                singer.UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING)

            yield singer.RecordMessage(stream=catalog_entry.stream,
                                       record=rec,
                                       time_extracted=extraction_time)

            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          REPLICATION_KEY,
                                          rec[REPLICATION_KEY])
            if (rows_saved + 1) % 1000 == 0:
                yield singer.StateMessage(value=copy.deepcopy(state))
Example #7
def sync_table(conn, catalog_entry, state):
    LOGGER.info("Beginning sync for {}.{} table.".format(
        catalog_entry.database, catalog_entry.table
    ))

    entity = catalog_entry.tap_stream_id
    if not entity:
        return

    start = get_start(entity)
    formatted_start = dateutil.parser.parse(start).strftime(DATETIME_FMT)
    params = {
        'start': formatted_start,
    }

    with metrics.record_counter(None) as counter:
        counter.tags['database'] = catalog_entry.database
        counter.tags['table'] = catalog_entry.table

        extraction_time = singer_utils.now()
        for rows_saved, row in enumerate(gen_request(conn, catalog_entry, params)):
            counter.increment()
            transform_data(row, catalog_entry.schema)

            yield singer.RecordMessage(
                stream=catalog_entry.stream,
                record=row,
                time_extracted=extraction_time
            )

            state = singer.write_bookmark(
                state,
                catalog_entry.tap_stream_id,
                REPLICATION_KEY,
                row[REPLICATION_KEY]
            )
            if (rows_saved+1) % 1000 == 0:
                yield singer.StateMessage(value=copy.deepcopy(state))
Example #8
def overloaded_parse_message(msg):
    """Parse a message string into a Message object."""

    # Parse with simplejson's use_decimal=True so numeric values are
    # loaded as Decimals rather than binary floats, avoiding lossy
    # conversions of high-precision values.
    obj = simplejson.loads(msg, use_decimal=True)
    msg_type = _required_key(obj, 'type')

    if msg_type == 'RECORD':
        time_extracted = obj.get('time_extracted')
        if time_extracted:
            try:
                time_extracted = ciso8601.parse_datetime(time_extracted)
            except Exception:
                time_extracted = None
        return singer.RecordMessage(stream=_required_key(obj, 'stream'),
                                    record=_required_key(obj, 'record'),
                                    version=obj.get('version'),
                                    time_extracted=time_extracted)

    if msg_type == 'SCHEMA':
        return singer.SchemaMessage(
            stream=_required_key(obj, 'stream'),
            schema=_required_key(obj, 'schema'),
            key_properties=_required_key(obj, 'key_properties'),
            bookmark_properties=obj.get('bookmark_properties'))

    if msg_type == 'STATE':
        return singer.StateMessage(value=_required_key(obj, 'value'))

    if msg_type == 'ACTIVATE_VERSION':
        return singer.ActivateVersionMessage(
            stream=_required_key(obj, 'stream'),
            version=_required_key(obj, 'version'))
    return None
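A minimal usage sketch for the parser above, assuming singer.format_message from singer-python produces the serialized input:

state_message = singer.StateMessage(value={'bookmarks': {}})
parsed = overloaded_parse_message(singer.format_message(state_message))
assert isinstance(parsed, singer.StateMessage)
assert parsed.value == {'bookmarks': {}}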
Example #9
def generate_messages(conn, catalog, state):
    for catalog_entry in catalog.streams:

        if not catalog_entry.is_selected():
            continue

        # Emit a SCHEMA message before we sync any records
        yield singer.SchemaMessage(
            stream=catalog_entry.stream,
            schema=catalog_entry.schema.to_dict(),
            key_properties=catalog_entry.key_properties,
            bookmark_properties=[REPLICATION_KEY]
        )

        # Emit a RECORD message for each record in the result set
        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = catalog_entry.database
            timer.tags['table'] = catalog_entry.table
            for message in sync_table(conn, catalog_entry, state):
                yield message

        # Emit a state message
        yield singer.StateMessage(value=copy.deepcopy(state))
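A minimal driver sketch for the generator above, assuming the caller simply writes each yielded message to stdout via singer.write_message:

def do_sync(conn, catalog, state):
    LOGGER.info("Starting sync")
    for message in generate_messages(conn, catalog, state):
        singer.write_message(message)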
Example #10
def do_sync_incremental(mysql_conn,
                        catalog_entry,
                        state,
                        columns,
                        original_state_file=''):
    LOGGER.info("Stream %s is using incremental replication",
                catalog_entry.stream)

    md_map = metadata.to_map(catalog_entry.metadata)
    replication_key = md_map.get((), {}).get('replication-key')

    if not replication_key:
        raise Exception(
            "Cannot use INCREMENTAL replication for table ({}) without a replication key."
            .format(catalog_entry.stream))

    write_schema_message(catalog_entry=catalog_entry,
                         bookmark_properties=[replication_key])

    incremental.sync_table(mysql_conn, catalog_entry, state, columns,
                           original_state_file)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #11
def sync_tables(conn_info, logical_streams, state, end_lsn):
    start_lsn = min([
        get_bookmark(state, s['tap_stream_id'], 'lsn') for s in logical_streams
    ])
    time_extracted = utils.now()
    slot = locate_replication_slot(conn_info)
    last_lsn_processed = None
    # we are willing to poll for a total of 30 minutes without finding a record
    poll_total_seconds = conn_info['logical_poll_total_seconds'] or 60 * 30
    keep_alive_time = 10.0
    begin_ts = datetime.datetime.now()

    for s in logical_streams:
        sync_common.send_schema_message(s, ['lsn'])

    with post_db.open_connection(conn_info, True) as conn:
        with conn.cursor() as cur:
            LOGGER.info(
                "Starting Logical Replication for %s(%s): %s -> %s. poll_total_seconds: %s",
                list(map(lambda s: s['tap_stream_id'], logical_streams)), slot,
                start_lsn, end_lsn, poll_total_seconds)
            try:
                cur.start_replication(slot_name=slot,
                                      decode=True,
                                      start_lsn=start_lsn)
            except psycopg2.ProgrammingError:
                raise Exception(
                    "unable to start replication with logical replication slot {}"
                    .format(slot))

            rows_saved = 0
            while True:
                poll_duration = (datetime.datetime.now() -
                                 begin_ts).total_seconds()
                if poll_duration > poll_total_seconds:
                    LOGGER.info(
                        "breaking after %s seconds of polling with no data",
                        poll_duration)
                    break

                msg = cur.read_message()
                if msg:
                    begin_ts = datetime.datetime.now()
                    if msg.data_start > end_lsn:
                        LOGGER.info("gone past end_lsn %s for run. breaking",
                                    end_lsn)
                        break

                    state = consume_message(logical_streams, state, msg,
                                            time_extracted, conn_info, end_lsn)
                    # msg has been consumed and processed
                    last_lsn_processed = msg.data_start
                    rows_saved = rows_saved + 1
                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state)))

                else:
                    now = datetime.datetime.now()
                    timeout = keep_alive_time - (
                        now - cur.io_timestamp).total_seconds()
                    try:
                        sel = select([cur], [], [], max(0, timeout))
                        if not any(sel):
                            LOGGER.info(
                                "no data for %s seconds. sending feedback to server with NO flush_lsn. just a keep-alive",
                                timeout)
                            cur.send_feedback()

                    except InterruptedError:
                        pass  # recalculate timeout and continue

    if last_lsn_processed:
        for s in logical_streams:
            LOGGER.info(
                "updating bookmark for stream %s to last_lsn_processed %s",
                s['tap_stream_id'], last_lsn_processed)
            state = singer.write_bookmark(state, s['tap_stream_id'], 'lsn',
                                          last_lsn_processed)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state
Example #12
def sync_collection(client, stream, state, projection):
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting full table sync for %s', tap_stream_id)

    md_map = metadata.to_map(stream['metadata'])
    database_name = metadata.get(md_map, (), 'database-name')

    db = client[database_name]
    collection = db[stream['stream']]

    #before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'],
                                    'version') is None

    # last run was interrupted if there is a last_id_fetched bookmark
    was_interrupted = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched') is not None

    # if the last run was interrupted, resume with the bookmarked table
    # version; otherwise pick a new one
    if was_interrupted:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'],
                                             'version')
    else:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    if singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value'):
        # There is a bookmark
        max_id_value = singer.get_bookmark(state, stream['tap_stream_id'],
                                           'max_id_value')
        max_id_type = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'max_id_type')
        max_id_value = common.string_to_class(max_id_value, max_id_type)
    else:
        max_id_value = get_max_id_value(collection)

    last_id_fetched = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched')

    if max_id_value:
        # Write the bookmark if max_id_value is defined
        state = singer.write_bookmark(
            state, stream['tap_stream_id'], 'max_id_value',
            common.class_to_string(max_id_value,
                                   max_id_value.__class__.__name__))
        state = singer.write_bookmark(state, stream['tap_stream_id'],
                                      'max_id_type',
                                      max_id_value.__class__.__name__)

    find_filter = {'$lte': max_id_value}
    if last_id_fetched:
        last_id_fetched_type = singer.get_bookmark(state,
                                                   stream['tap_stream_id'],
                                                   'last_id_fetched_type')
        find_filter['$gte'] = common.string_to_class(last_id_fetched,
                                                     last_id_fetched_type)

    query_message = 'Querying {} with:\n\tFind Parameters: {}'.format(
        stream['tap_stream_id'], find_filter)
    if projection:
        query_message += '\n\tProjection: {}'.format(projection)
    # pylint: disable=logging-format-interpolation
    LOGGER.info(query_message)

    with collection.find({'_id': find_filter},
                         projection,
                         sort=[("_id", pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        time_extracted = utils.now()
        start_time = time.time()

        schema = stream['schema'] or {"type": "object", "properties": {}}
        for row in cursor:
            rows_saved += 1

            schema_build_start_time = time.time()
            if common.row_to_schema(schema, row):
                singer.write_message(
                    singer.SchemaMessage(
                        stream=common.calculate_destination_stream_name(
                            stream),
                        schema=schema,
                        key_properties=['_id']))
                common.SCHEMA_COUNT[stream['tap_stream_id']] += 1
            common.SCHEMA_TIMES[stream['tap_stream_id']] += time.time(
            ) - schema_build_start_time

            record_message = common.row_to_singer_record(
                stream, row, stream_version, time_extracted)

            singer.write_message(record_message)

            state = singer.write_bookmark(
                state, stream['tap_stream_id'], 'last_id_fetched',
                common.class_to_string(row['_id'],
                                       row['_id'].__class__.__name__))
            state = singer.write_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched_type',
                                          row['_id'].__class__.__name__)

            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[tap_stream_id] += rows_saved
        common.TIMES[tap_stream_id] += time.time() - start_time

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_value')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_type')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched')
    singer.clear_bookmark(state, stream['tap_stream_id'],
                          'last_id_fetched_type')

    state = singer.write_bookmark(state, stream['tap_stream_id'],
                                  'initial_full_table_complete', True)

    singer.write_message(activate_version_message)

    LOGGER.info('Synced %s records for %s', rows_saved, tap_stream_id)
Example #13
def sync_stream(client, stream, state):
    tap_stream_id = stream['tap_stream_id']

    common.COUNTS[tap_stream_id] = 0
    common.TIMES[tap_stream_id] = 0
    common.SCHEMA_COUNT[tap_stream_id] = 0
    common.SCHEMA_TIMES[tap_stream_id] = 0

    md_map = metadata.to_map(stream['metadata'])
    replication_method = metadata.get(md_map, (), 'replication-method')
    database_name = metadata.get(md_map, (), 'database-name')

    stream_projection = load_stream_projection(stream)

    # Emit a state message to indicate that we've started this stream
    state = clear_state_on_replication_change(stream, state)
    state = singer.set_currently_syncing(state, stream['tap_stream_id'])
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    write_schema_message(stream)
    common.SCHEMA_COUNT[tap_stream_id] += 1

    with metrics.job_timer('sync_table') as timer:
        timer.tags['database'] = database_name
        timer.tags['table'] = stream['table_name']

        if replication_method == 'LOG_BASED':
            if oplog.oplog_has_aged_out(client, state, tap_stream_id):
                # remove all state for stream
                # then it will do a full sync and start oplog again.
                LOGGER.info("Clearing state because Oplog has aged out")
                state.get('bookmarks', {}).pop(tap_stream_id)

            # make sure initial full table sync has been completed
            if not singer.get_bookmark(state, tap_stream_id,
                                       'initial_full_table_complete'):
                msg = 'Must complete full table sync before starting oplog replication for %s'
                LOGGER.info(msg, tap_stream_id)

                # only mark current ts in oplog on first sync so tap has a
                # starting point after the full table sync
                if singer.get_bookmark(state, tap_stream_id,
                                       'version') is None:
                    collection_oplog_ts = oplog.get_latest_ts(client)
                    oplog.update_bookmarks(state, tap_stream_id,
                                           collection_oplog_ts)

                full_table.sync_collection(client, stream, state,
                                           stream_projection)

            oplog.sync_collection(client, stream, state, stream_projection)

        elif replication_method == 'FULL_TABLE':
            full_table.sync_collection(client, stream, state,
                                       stream_projection)

        elif replication_method == 'INCREMENTAL':
            incremental.sync_collection(client, stream, state,
                                        stream_projection)
        else:
            raise Exception(
                "only FULL_TABLE, LOG_BASED, and INCREMENTAL replication \
                methods are supported (you passed {})".format(
                    replication_method))

    state = singer.set_currently_syncing(state, None)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #14
def _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state):
    time_extracted = utils.now()

    rows_saved = 0
    events_skipped = 0

    current_log_file, current_log_pos = fetch_current_log_file_and_pos(
        mysql_conn)

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state, binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(
                binlog_event.schema, binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            desired_columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped = events_skipped + 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.info(
                        "Skipped %s events so far as they were not for selected tables; %s rows extracted",
                        events_skipped, rows_saved)

            elif catalog_entry:
                if isinstance(binlog_event, WriteRowsEvent):
                    rows_saved = handle_write_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, UpdateRowsEvent):
                    rows_saved = handle_update_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, DeleteRowsEvent):
                    rows_saved = handle_delete_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)
                else:
                    LOGGER.info(
                        "Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                        binlog_event.schema, binlog_event.table)

        # NB: Flag 0x1 indicates that the binlog has been closed successfully, so we can rely on this being a complete log.
        if hasattr(binlog_event,
                   'flags') and binlog_event.flags & LOG_EVENT_BINLOG_IN_USE_F:
            state = update_bookmarks(state, binlog_streams_map,
                                     reader.log_file, reader.log_pos)

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one causing binlog replication to hang.
        if current_log_file == reader.log_file and reader.log_pos >= current_log_pos:
            break

        if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0) or
            (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            singer.write_message(
                singer.StateMessage(value=copy.deepcopy(state)))
Example #15
def sync_table(connection, config, catalog_entry, state, columns):
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, catalog_entry.tap_stream_id,
                                   state)

    log_file = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                   'log_file')

    log_pos = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                  'log_pos')

    verify_binlog_config(connection, catalog_entry)
    verify_log_file_exists(connection, catalog_entry, log_file, log_pos)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id,
                                               state)
    state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                  'version', stream_version)

    server_id = fetch_server_id(connection)

    connection_wrapper = make_connection_wrapper(config)

    reader = BinLogStreamReader(connection_settings={},
                                server_id=server_id,
                                log_file=log_file,
                                log_pos=log_pos,
                                resume_stream=True,
                                only_events=[
                                    RotateEvent, WriteRowsEvent,
                                    UpdateRowsEvent, DeleteRowsEvent
                                ],
                                pymysql_wrapper=connection_wrapper)

    table_path = (catalog_entry.database, catalog_entry.stream)

    time_extracted = utils.now()

    LOGGER.info("Starting binlog replication with log_file=%s, log_pos=%s",
                log_file, log_pos)

    rows_saved = 0

    for binlog_event in reader:
        if reader.log_file == log_file and reader.log_pos == log_pos:
            LOGGER.info(
                "Skipping event for log_file=%s and log_pos=%s as it was processed last sync",
                reader.log_file, reader.log_pos)
            continue

        if isinstance(binlog_event, RotateEvent):
            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'log_file', binlog_event.next_binlog)
            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'log_pos', binlog_event.position)

        elif (binlog_event.schema, binlog_event.table) == table_path:
            db_column_types = {c.name: c.type for c in binlog_event.columns}

            if isinstance(binlog_event, WriteRowsEvent):
                for row in binlog_event.rows:
                    filtered_vals = {
                        k: v
                        for k, v in row['values'].items() if k in columns
                    }

                    yield row_to_singer_record(catalog_entry, stream_version,
                                               db_column_types, filtered_vals,
                                               time_extracted)

                    rows_saved = rows_saved + 1

            elif isinstance(binlog_event, UpdateRowsEvent):
                for row in binlog_event.rows:
                    filtered_vals = {
                        k: v
                        for k, v in row['after_values'].items() if k in columns
                    }

                    yield row_to_singer_record(catalog_entry, stream_version,
                                               db_column_types, filtered_vals,
                                               time_extracted)

                    rows_saved = rows_saved + 1
            elif isinstance(binlog_event, DeleteRowsEvent):
                for row in binlog_event.rows:
                    event_ts = datetime.datetime.utcfromtimestamp(
                        binlog_event.timestamp).replace(tzinfo=pytz.UTC)

                    vals = row['values']
                    vals[SDC_DELETED_AT] = event_ts

                    filtered_vals = {
                        k: v
                        for k, v in vals.items() if k in columns
                    }

                    yield row_to_singer_record(catalog_entry, stream_version,
                                               db_column_types, filtered_vals,
                                               time_extracted)

                    rows_saved = rows_saved + 1

            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'log_file', reader.log_file)

            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'log_pos', reader.log_pos)

            if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                yield singer.StateMessage(value=copy.deepcopy(state))

    yield singer.StateMessage(value=copy.deepcopy(state))
Example #16
def sync_tables(conn_info, logical_streams, state, end_lsn, state_file):
    state_comitted = state
    lsn_comitted = min([
        get_bookmark(state_comitted, s['tap_stream_id'], 'lsn')
        for s in logical_streams
    ])
    start_lsn = lsn_comitted
    lsn_to_flush = None
    time_extracted = utils.now()
    slot = locate_replication_slot(conn_info)
    lsn_last_processed = None
    lsn_currently_processing = None
    lsn_received_timestamp = None
    lsn_processed_count = 0
    start_run_timestamp = datetime.datetime.utcnow()
    max_run_seconds = conn_info['max_run_seconds']
    break_at_end_lsn = conn_info['break_at_end_lsn']
    logical_poll_total_seconds = conn_info[
        'logical_poll_total_seconds'] or 10800  # 3 hours
    poll_interval = 10
    poll_timestamp = None

    for s in logical_streams:
        sync_common.send_schema_message(s, ['lsn'])

    version = get_pg_version(conn_info)

    # Create replication connection and cursor
    conn = post_db.open_connection(conn_info, True, True)
    cur = conn.cursor()

    # Set session wal_sender_timeout for PG12 and above
    if version >= 120000:
        wal_sender_timeout = 10800000  # 10800000ms = 3 hours
        LOGGER.info('Set session wal_sender_timeout = %i milliseconds',
                    wal_sender_timeout)
        cur.execute(f"SET SESSION wal_sender_timeout = {wal_sender_timeout}")

    try:
        LOGGER.info('Request wal streaming from %s to %s (slot %s)',
                    int_to_lsn(start_lsn), int_to_lsn(end_lsn), slot)
        # psycopg2 2.8.4 will send a keep-alive message to postgres every status_interval
        cur.start_replication(slot_name=slot,
                              decode=True,
                              start_lsn=start_lsn,
                              status_interval=poll_interval,
                              options={
                                  'write-in-chunks':
                                  1,
                                  'add-tables':
                                  streams_to_wal2json_tables(logical_streams)
                              })

    except psycopg2.ProgrammingError as ex:
        raise Exception(
            f"Unable to start replication with logical replication (slot {ex})"
        ) from ex

    lsn_received_timestamp = datetime.datetime.utcnow()
    poll_timestamp = datetime.datetime.utcnow()

    try:
        while True:
            # Disconnect when no data received for logical_poll_total_seconds
            # needs to be long enough to wait for the largest single wal payload to avoid unplanned timeouts
            poll_duration = (datetime.datetime.utcnow() -
                             lsn_received_timestamp).total_seconds()
            if poll_duration > logical_poll_total_seconds:
                LOGGER.info('Breaking - %i seconds of polling with no data',
                            poll_duration)
                break

            if datetime.datetime.utcnow() >= (
                    start_run_timestamp +
                    datetime.timedelta(seconds=max_run_seconds)):
                LOGGER.info('Breaking - reached max_run_seconds of %i',
                            max_run_seconds)
                break

            try:
                msg = cur.read_message()
            except Exception as e:
                LOGGER.error(e)
                raise

            if msg:
                if (break_at_end_lsn) and (msg.data_start > end_lsn):
                    LOGGER.info(
                        'Breaking - latest wal message %s is past end_lsn %s',
                        int_to_lsn(msg.data_start), int_to_lsn(end_lsn))
                    break

                state = consume_message(logical_streams, state, msg,
                                        time_extracted, conn_info)

                # When using wal2json with write-in-chunks, multiple messages can have the same lsn
                # This is to ensure we only flush to lsn that has completed entirely
                if lsn_currently_processing is None:
                    lsn_currently_processing = msg.data_start
                    LOGGER.info('First wal message received is %s',
                                int_to_lsn(lsn_currently_processing))

                    # Flush Postgres wal up to lsn comitted in previous run, or first lsn received in this run
                    lsn_to_flush = lsn_comitted
                    if lsn_currently_processing < lsn_to_flush:
                        lsn_to_flush = lsn_currently_processing
                    LOGGER.info('Confirming write up to %s, flush to %s',
                                int_to_lsn(lsn_to_flush),
                                int_to_lsn(lsn_to_flush))
                    cur.send_feedback(write_lsn=lsn_to_flush,
                                      flush_lsn=lsn_to_flush,
                                      reply=True,
                                      force=True)

                elif int(msg.data_start) > lsn_currently_processing:
                    lsn_last_processed = lsn_currently_processing
                    lsn_currently_processing = msg.data_start
                    lsn_received_timestamp = datetime.datetime.utcnow()
                    lsn_processed_count = lsn_processed_count + 1
                    if lsn_processed_count >= UPDATE_BOOKMARK_PERIOD:
                        LOGGER.debug(
                            'Updating bookmarks for all streams to lsn = %s (%s)',
                            lsn_last_processed, int_to_lsn(lsn_last_processed))
                        for s in logical_streams:
                            state = singer.write_bookmark(
                                state, s['tap_stream_id'], 'lsn',
                                lsn_last_processed)
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state)))
                        lsn_processed_count = 0
            else:
                try:
                    # Wait for a second unless a message arrives
                    select([cur], [], [], 1)
                except InterruptedError:
                    pass

            # Every poll_interval, update latest comitted lsn position from the state_file
            if datetime.datetime.utcnow() >= (
                    poll_timestamp +
                    datetime.timedelta(seconds=poll_interval)):
                if lsn_currently_processing is None:
                    LOGGER.info('Waiting for first wal message')
                else:
                    LOGGER.info('Latest wal message received was %s',
                                int_to_lsn(lsn_last_processed))
                    try:
                        with open(state_file, mode="r",
                                  encoding="utf-8") as fh:
                            state_comitted = json.load(fh)
                    except Exception:
                        LOGGER.debug('Unable to open and parse %s', state_file)
                    finally:
                        lsn_comitted = min([
                            get_bookmark(state_comitted, s['tap_stream_id'],
                                         'lsn') for s in logical_streams
                        ])
                        if (lsn_currently_processing > lsn_comitted) and (
                                lsn_comitted > lsn_to_flush):
                            lsn_to_flush = lsn_comitted
                            LOGGER.info(
                                'Confirming write up to %s, flush to %s',
                                int_to_lsn(lsn_to_flush),
                                int_to_lsn(lsn_to_flush))
                            cur.send_feedback(write_lsn=lsn_to_flush,
                                              flush_lsn=lsn_to_flush,
                                              reply=True,
                                              force=True)

                poll_timestamp = datetime.datetime.utcnow()

        # Close replication connection and cursor
        cur.close()
        conn.close()
    finally:
        if lsn_last_processed:
            if lsn_comitted > lsn_last_processed:
                lsn_last_processed = lsn_comitted
                LOGGER.info(
                    'Current lsn_last_processed %s is older than lsn_comitted %s',
                    int_to_lsn(lsn_last_processed), int_to_lsn(lsn_comitted))

            LOGGER.info('Updating bookmarks for all streams to lsn = %s (%s)',
                        lsn_last_processed, int_to_lsn(lsn_last_processed))

            for s in logical_streams:
                state = singer.write_bookmark(state, s['tap_stream_id'], 'lsn',
                                              lsn_last_processed)

        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    return state
Example #17
def do_sync(conn_config, catalog, default_replication_method, state):
    currently_syncing = singer.get_currently_syncing(state)
    streams = list(filter(is_selected_via_metadata, catalog.streams))
    streams.sort(key=lambda s: s.tap_stream_id)

    if currently_syncing:
        LOGGER.info("currently_syncing: %s", currently_syncing)
        currently_syncing_stream = list(
            filter(lambda s: s.tap_stream_id == currently_syncing, streams))
        other_streams = list(
            filter(lambda s: s.tap_stream_id != currently_syncing, streams))
        streams = currently_syncing_stream + other_streams
    else:
        LOGGER.info("NO currently_syncing")

    LOGGER.info("will sync: %s", list(map(lambda s: s.tap_stream_id, streams)))
    for stream in streams:
        md_map = metadata.to_map(stream.metadata)
        conn_config['dbname'] = md_map.get(()).get('database-name')
        state = singer.set_currently_syncing(state, stream.tap_stream_id)

        desired_columns = [
            c for c in stream.schema.properties.keys()
            if should_sync_column(md_map, c)
        ]
        desired_columns.sort()

        if len(desired_columns) == 0:
            LOGGER.warning(
                'There are no columns selected for stream %s, skipping it',
                stream.tap_stream_id)
            continue

        replication_method = md_map.get(
            (), {}).get('replication-method', default_replication_method)
        if replication_method not in set(
            ['LOG_BASED', 'FULL_TABLE', 'INCREMENTAL']):
            raise Exception("Unrecognized replication_method {}".format(
                replication_method))

        replication_key = md_map.get((), {}).get('replication-key')

        state = clear_state_on_replication_change(state, stream.tap_stream_id,
                                                  replication_key,
                                                  replication_method)

        if replication_method == 'LOG_BASED' and md_map.get(
            (), {}).get('is-view'):
            LOGGER.warning(
                'Logical Replication is NOT supported for views. skipping stream %s',
                stream.tap_stream_id)
            continue

        if replication_method == 'LOG_BASED':
            state = do_sync_logical_replication(conn_config, stream, state,
                                                desired_columns, md_map)
        elif replication_method == 'FULL_TABLE':
            state = do_sync_full_table(conn_config, stream, state,
                                       desired_columns, md_map)
        elif replication_method == 'INCREMENTAL':
            state = do_sync_incremental(conn_config, stream, state,
                                        desired_columns, md_map)

        state = singer.set_currently_syncing(state, None)
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #18
def sync_collection(
    collection: Collection,
    stream: Dict,
    state: Optional[Dict],
) -> None:
    """
    Syncs the stream records incrementally
    Args:
        collection: MongoDB collection instance
        stream: stream dictionary
        state: state dictionary if exists
    """
    LOGGER.info('Starting incremental sync for %s', stream['tap_stream_id'])

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'],
                                    'version') is None

    # pick a new table version if last run wasn't interrupted
    if first_run:
        stream_version = int(time.time() * 1000)
    else:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'],
                                             'version')

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  stream_version)

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    # get replication key, and bookmarked value/type
    stream_state = state.get('bookmarks', {}).get(stream['tap_stream_id'], {})

    replication_key_name = metadata.to_map(stream['metadata']).get(
        ()).get('replication-key')

    # write state message
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    # create query
    find_filter = {}

    if stream_state.get('replication_key_value'):
        find_filter[replication_key_name] = {}
        find_filter[replication_key_name]['$gt'] = common.string_to_class(
            stream_state.get('replication_key_value'),
            stream_state.get('replication_key_type'))

    # log query
    LOGGER.info('Querying %s with: %s', stream['tap_stream_id'],
                dict(find=find_filter))

    with collection.find(find_filter,
                         sort=[(replication_key_name, pymongo.ASCENDING)
                               ]) as cursor:
        rows_saved = 0
        start_time = time.time()

        for row in cursor:

            singer.write_message(
                common.row_to_singer_record(stream=stream,
                                            row=row,
                                            time_extracted=utils.now(),
                                            time_deleted=None,
                                            version=stream_version))
            rows_saved += 1

            update_bookmark(row, state, stream['tap_stream_id'],
                            replication_key_name)

            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[stream['tap_stream_id']] += rows_saved
        common.TIMES[stream['tap_stream_id']] += time.time() - start_time

    singer.write_message(activate_version_message)

    LOGGER.info('Synced %s records for %s', rows_saved, stream['tap_stream_id'])
Example #19
def sync_query(cursor, catalog_entry, state, select_sql, columns,
               stream_version, params):
    """..."""
    replication_key = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                          'replication_key')

    time_extracted = utils.now()

    LOGGER.info('Running %s', select_sql)
    cursor.execute(select_sql, params)

    row = cursor.fetchone()
    rows_saved = 0

    database_name = get_database_name(catalog_entry)

    with metrics.record_counter(None) as counter:
        counter.tags['database'] = database_name
        counter.tags['table'] = catalog_entry.table

        while row:
            counter.increment()
            rows_saved += 1
            record_message = row_to_singer_record(catalog_entry,
                                                  stream_version, row, columns,
                                                  time_extracted)
            singer.write_message(record_message)

            md_map = metadata.to_map(catalog_entry.metadata)
            stream_metadata = md_map.get((), {})
            replication_method = stream_metadata.get('replication-method')

            if replication_method == 'FULL_TABLE':
                key_properties = get_key_properties(catalog_entry)

                max_pk_values = singer.get_bookmark(
                    state, catalog_entry.tap_stream_id, 'max_pk_values')

                if max_pk_values:
                    last_pk_fetched = {
                        k: v
                        for k, v in record_message.record.items()
                        if k in key_properties
                    }

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'last_pk_fetched',
                                                  last_pk_fetched)

            elif replication_method == 'INCREMENTAL':
                if replication_key is not None:
                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'replication_key',
                                                  replication_key)

                    state = singer.write_bookmark(
                        state, catalog_entry.tap_stream_id,
                        'replication_key_value',
                        record_message.record[replication_key])
            if rows_saved % 1000 == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

            row = cursor.fetchone()

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #20
def sync_table(connection, catalog_entry, state):
    columns = list(catalog_entry.schema.properties.keys())
    start_date = CONFIG.get('start_date')
    formatted_start_date = None

    if not columns:
        LOGGER.warning(
            'There are no columns selected for table {}, skipping it'.format(
                catalog_entry.table))
        return

    tap_stream_id = catalog_entry.tap_stream_id
    LOGGER.info('Beginning sync for {} table'.format(tap_stream_id))
    with connection.cursor() as cursor:
        schema, table = catalog_entry.table.split('.')
        select = 'SELECT {} FROM {}.{}'.format(
            ','.join('"{}"'.format(c) for c in columns), '"{}"'.format(schema),
            '"{}"'.format(table))
        params = {}

        if start_date is not None:
            formatted_start_date = datetime.datetime.strptime(
                start_date, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=pytz.UTC)

        replication_key = metadata.to_map(catalog_entry.metadata).get(
            (), {}).get('replication-key')
        replication_key_value = None
        bookmark_is_empty = state.get('bookmarks',
                                      {}).get(tap_stream_id) is None
        stream_version = get_stream_version(tap_stream_id, state)
        state = singer.write_bookmark(state, tap_stream_id, 'version',
                                      stream_version)
        activate_version_message = singer.ActivateVersionMessage(
            stream=catalog_entry.stream, version=stream_version)

        # If there's a replication key, we want to emit an ACTIVATE_VERSION
        # message at the beginning so the records show up right away. If
        # there's no bookmark at all for this stream, assume it's the very
        # first replication. That is, clients have never seen rows for this
        # stream before, so they can immediately acknowledge the present
        # version.
        if replication_key or bookmark_is_empty:
            yield activate_version_message

        if replication_key:
            replication_key_value = singer.get_bookmark(
                state, tap_stream_id,
                'replication_key_value') or formatted_start_date.isoformat()

        if replication_key_value is not None:
            entry_schema = catalog_entry.schema

            if entry_schema.properties[replication_key].format == 'date-time':
                replication_key_value = pendulum.parse(replication_key_value)

            select += ' WHERE {} >= %(replication_key_value)s ORDER BY {} ' \
                      'ASC'.format(replication_key, replication_key)
            params['replication_key_value'] = replication_key_value

        elif replication_key is not None:
            select += ' ORDER BY {} ASC'.format(replication_key)

        time_extracted = utils.now()
        query_string = cursor.mogrify(select, params)
        LOGGER.info('Running {}'.format(query_string))
        cursor.execute(select, params)
        row = cursor.fetchone()
        rows_saved = 0

        with metrics.record_counter(None) as counter:
            counter.tags['database'] = catalog_entry.database
            counter.tags['table'] = catalog_entry.table
            while row:
                counter.increment()
                rows_saved += 1
                record_message = row_to_record(catalog_entry, stream_version,
                                               row, columns, time_extracted)
                yield record_message

                if replication_key is not None:
                    state = singer.write_bookmark(
                        state, tap_stream_id, 'replication_key_value',
                        record_message.record[replication_key])
                if rows_saved % 1000 == 0:
                    yield singer.StateMessage(value=copy.deepcopy(state))
                row = cursor.fetchone()

        if not replication_key:
            yield activate_version_message
            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'version', None)

        yield singer.StateMessage(value=copy.deepcopy(state))
예제 #21
0
def sync_tables_logminer(cur, streams, state, start_scn, end_scn):

   time_extracted = utils.now()

   start_logmnr_sql = """BEGIN
                         DBMS_LOGMNR.START_LOGMNR(
                                 startScn => {},
                                 endScn => {},
                                 OPTIONS => DBMS_LOGMNR.DICT_FROM_ONLINE_CATALOG +
                                            DBMS_LOGMNR.COMMITTED_DATA_ONLY +
                                            DBMS_LOGMNR.CONTINUOUS_MINE);
                         END;""".format(start_scn, end_scn)

   LOGGER.info("Starting LogMiner for %s: %s -> %s", list(map(lambda s: s.tap_stream_id, streams)), start_scn, end_scn)
   LOGGER.info("%s",start_logmnr_sql)
   cur.execute(start_logmnr_sql)

   #mine changes
   for stream in streams:
      md_map = metadata.to_map(stream.metadata)
      desired_columns = [c for c in stream.schema.properties.keys() if common.should_sync_column(md_map, c)]
      redo_value_sql_clause = ",\n ".join(["""DBMS_LOGMNR.MINE_VALUE(REDO_VALUE, :{})""".format(idx+1)
                                           for idx,c in enumerate(desired_columns)])
      undo_value_sql_clause = ",\n ".join(["""DBMS_LOGMNR.MINE_VALUE(UNDO_VALUE, :{})""".format(idx+1)
                                           for idx,c in enumerate(desired_columns)])

      schema_name = md_map.get(()).get('schema-name')
      stream_version = get_stream_version(stream.tap_stream_id, state)
      mine_sql = """
      SELECT OPERATION, SQL_REDO, SCN, CSCN, COMMIT_TIMESTAMP,  {}, {} from v$logmnr_contents where table_name = :table_name AND seg_owner = :seg_owner AND operation in ('INSERT', 'UPDATE', 'DELETE')
      """.format(redo_value_sql_clause, undo_value_sql_clause)
      binds = [orc_db.fully_qualified_column_name(schema_name, stream.table, c) for c in desired_columns] + \
              [orc_db.fully_qualified_column_name(schema_name, stream.table, c) for c in desired_columns] + \
              [stream.table] + [schema_name]


      rows_saved = 0
      columns_for_record = desired_columns + ['scn', '_sdc_deleted_at']
      with metrics.record_counter(None) as counter:
         LOGGER.info("Examing log for table %s", stream.tap_stream_id)
         common.send_schema_message(stream, ['lsn'])
         LOGGER.info("mine_sql=%s", mine_sql)
         for op, redo, scn, cscn, commit_ts, *col_vals in cur.execute(mine_sql, binds):
            redo_vals = col_vals[0:len(desired_columns)]
            undo_vals = col_vals[len(desired_columns):]
            if op in ('INSERT', 'UPDATE'):
               redo_vals += [cscn, None]
               record_message = row_to_singer_message(stream, redo_vals, stream_version, columns_for_record, time_extracted)
            elif op == 'DELETE':
               undo_vals += [cscn, singer.utils.strftime(commit_ts.replace(tzinfo=pytz.UTC))]
               record_message = row_to_singer_message(stream, undo_vals, stream_version, columns_for_record, time_extracted)
            else:
               raise Exception("unrecognized logminer operation: {}".format(op))

            singer.write_message(record_message)
            rows_saved = rows_saved + 1
            counter.increment()
            state = singer.write_bookmark(state,
                                          stream.tap_stream_id,
                                          'scn',
                                          int(cscn))


            if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
               singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

   for s in streams:
      LOGGER.info("updating bookmark for stream %s to end_lsn %s", s.tap_stream_id, end_scn)
      state = singer.write_bookmark(state, s.tap_stream_id, 'scn', end_scn)
      singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

   return state
def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'],
                                    'version') is None

    # pick a new table version IFF we do not have an xmin in our state
    # the presence of an xmin indicates that we were interrupted last time through
    if singer.get_bookmark(state, stream['tap_stream_id'], 'xmin') is None:
        nascent_stream_version = int(time.time() * 1000)
    else:
        nascent_stream_version = singer.get_bookmark(state,
                                                     stream['tap_stream_id'],
                                                     'version')

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get('schema-name')

    escaped_columns = map(
        partial(post_db.prepare_columns_for_select_sql, md_map=md_map),
        desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    hstore_available = post_db.hstore_available(conn_info)
    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:

            # Client side character encoding defaults to the value in postgresql.conf under client_encoding.
            # The server / db can also have its own configured encoding.
            with conn.cursor() as cur:
                cur.execute("show server_encoding")
                LOGGER.info("Current Server Encoding: %s", cur.fetchone()[0])
                cur.execute("show client_encoding")
                LOGGER.info("Current Client Encoding: %s", cur.fetchone()[0])

            if hstore_available:
                LOGGER.info("hstore is available")
                psycopg2.extras.register_hstore(conn)
            else:
                LOGGER.info("hstore is UNavailable")

            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor,
                             name='stitch_cursor') as cur:
                cur.itersize = post_db.CURSOR_ITER_SIZE

                fq_table_name = post_db.fully_qualified_table_name(
                    schema_name, stream['table_name'])
                xmin = singer.get_bookmark(state, stream['tap_stream_id'],
                                           'xmin')
                if xmin:
                    LOGGER.info(
                        "Resuming Full Table replication %s from xmin %s",
                        nascent_stream_version, xmin)
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {} where age(xmin::xid) <= age('{}'::xid)
                                     ORDER BY xmin::text ASC""".format(
                        ','.join(escaped_columns), fq_table_name, xmin)
                else:
                    LOGGER.info("Beginning new Full Table replication %s",
                                nascent_stream_version)
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {}
                                     ORDER BY xmin::text ASC""".format(
                        ','.join(escaped_columns), fq_table_name)

                LOGGER.info("select %s with itersize %s", select_sql,
                            cur.itersize)
                cur.execute(select_sql)

                rows_saved = 0
                for rec in cur:
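                    # the trailing column selected is xmin; keep it for the resumable
                    # bookmark and drop it from the emitted record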
                    xmin = rec['xmin']
                    rec = rec[:-1]
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, nascent_stream_version, desired_columns,
                        time_extracted, md_map)
                    singer.write_message(record_message)
                    state = singer.write_bookmark(state,
                                                  stream['tap_stream_id'],
                                                  'xmin', xmin)
                    rows_saved = rows_saved + 1
                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    # once we have completed the full table replication, discard the xmin bookmark.
    # the xmin bookmark only comes into play when a full table replication is interrupted
    state = singer.write_bookmark(state, stream['tap_stream_id'], 'xmin', None)

    # always send the activate version whether first run or subsequent
    singer.write_message(activate_version_message)

    return state
예제 #23
0
def sync_query(cursor, catalog_entry, state, select_sql, columns,
               stream_version, params):
    replication_key = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                          "replication_key")

    # query_string = cursor.mogrify(select_sql, params)

    time_extracted = utils.now()
    cursor.execute(select_sql, params)

    row = cursor.fetchone()
    rows_saved = 0

    database_name = get_database_name(catalog_entry)

    with metrics.record_counter(None) as counter:
        counter.tags["database"] = database_name
        counter.tags["table"] = catalog_entry.table

        while row:
            counter.increment()
            rows_saved += 1
            record_message = row_to_singer_record(catalog_entry,
                                                  stream_version, row, columns,
                                                  time_extracted)
            singer.write_message(record_message)
            md_map = metadata.to_map(catalog_entry.metadata)
            stream_metadata = md_map.get((), {})
            replication_method = stream_metadata.get("replication-method")

            if replication_method in {"FULL_TABLE", "LOG_BASED"}:
                key_properties = get_key_properties(catalog_entry)

                max_pk_values = singer.get_bookmark(
                    state, catalog_entry.tap_stream_id, "max_pk_values")

                if max_pk_values:
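                    # remember the primary key of the last row emitted so an interrupted
                    # full-table / log-based initial sync can resume from this point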
                    last_pk_fetched = {
                        k: v
                        for k, v in record_message.record.items()
                        if k in key_properties
                    }

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  "last_pk_fetched",
                                                  last_pk_fetched)

            elif replication_method == "INCREMENTAL":
                if replication_key is not None:
                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  "replication_key",
                                                  replication_key)

                    state = singer.write_bookmark(
                        state,
                        catalog_entry.tap_stream_id,
                        "replication_key_value",
                        record_message.record[replication_key],
                    )
            if rows_saved % 1000 == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

            row = cursor.fetchone()

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
예제 #24
0
def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    stream_version = singer.get_bookmark(state, stream['tap_stream_id'],
                                         'version')
    if stream_version is None:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get('schema-name')

    escaped_columns = map(
        partial(post_db.prepare_columns_for_select_sql, md_map=md_map),
        desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        version=stream_version)

    singer.write_message(activate_version_message)

    replication_key = md_map.get((), {}).get('replication-key')
    replication_key_value = singer.get_bookmark(state, stream['tap_stream_id'],
                                                'replication_key_value')
    replication_key_sql_datatype = md_map.get(
        ('properties', replication_key)).get('sql-datatype')

    hstore_available = post_db.hstore_available(conn_info)
    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:

            # Client side character encoding defaults to the value in postgresql.conf under client_encoding.
            # The server / db can also have its own configured encoding.
            with conn.cursor() as cur:
                cur.execute("show server_encoding")
                LOGGER.info("Current Server Encoding: %s", cur.fetchone()[0])
                cur.execute("show client_encoding")
                LOGGER.info("Current Client Encoding: %s", cur.fetchone()[0])

            if hstore_available:
                LOGGER.info("hstore is available")
                psycopg2.extras.register_hstore(conn)
            else:
                LOGGER.info("hstore is UNavailable")

            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor,
                             name='pipelinewise') as cur:
                cur.itersize = post_db.CURSOR_ITER_SIZE
                LOGGER.info("Beginning new incremental replication sync %s",
                            stream_version)
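                # resume from the bookmarked value, cast to the replication key's SQL
                # datatype; >= means the last bookmarked row is re-selected rather than skipped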
                if replication_key_value:
                    select_sql = """SELECT {}
                                    FROM {}
                                    WHERE {} >= '{}'::{}
                                    ORDER BY {} ASC""".format(
                        ','.join(escaped_columns),
                        post_db.fully_qualified_table_name(
                            schema_name, stream['table_name']),
                        post_db.prepare_columns_sql(replication_key),
                        replication_key_value, replication_key_sql_datatype,
                        post_db.prepare_columns_sql(replication_key))
                else:
                    #if not replication_key_value
                    select_sql = """SELECT {}
                                    FROM {}
                                    ORDER BY {} ASC""".format(
                        ','.join(escaped_columns),
                        post_db.fully_qualified_table_name(
                            schema_name, stream['table_name']),
                        post_db.prepare_columns_sql(replication_key))

                LOGGER.info('select statement: %s with itersize %s',
                            select_sql, cur.itersize)
                cur.execute(select_sql)

                rows_saved = 0

                for rec in cur:
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, stream_version, desired_columns,
                        time_extracted, md_map)

                    singer.write_message(record_message)
                    rows_saved = rows_saved + 1

                    # Picking a replication_key with NULL values will result in it ALWAYS being synced, which is not great.
                    # Even worse would be allowing the NULL value to enter into the state.
                    if record_message.record[replication_key] is not None:
                        state = singer.write_bookmark(
                            state, stream['tap_stream_id'],
                            'replication_key_value',
                            record_message.record[replication_key])

                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    return state
예제 #25
0
def sync_collection(client, stream, state, projection):
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting incremental sync for %s', tap_stream_id)

    stream_metadata = metadata.to_map(stream['metadata']).get(())
    collection = client[stream_metadata['database-name']][stream['stream']]

    #before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None

    # pick a new table version only on the first run (i.e. when no version bookmark exists yet)
    if first_run:
        stream_version = int(time.time() * 1000)
    else:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version')

    state = singer.write_bookmark(state,
                                  stream['tap_stream_id'],
                                  'version',
                                  stream_version)

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version
    )


    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    # get replication key, and bookmarked value/type
    stream_state = state.get('bookmarks', {}).get(tap_stream_id, {})

    replication_key_name = stream_metadata.get('replication-key')
    replication_key_value_bookmark = stream_state.get('replication_key_value')

    # write state message
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    # create query
    find_filter = {}
    if replication_key_value_bookmark:
        find_filter[replication_key_name] = {}
        find_filter[replication_key_name]['$gte'] = \
            common.string_to_class(replication_key_value_bookmark,
                                   stream_state.get('replication_key_type'))

    # log query
    query_message = 'Querying {} with:\n\tFind Parameters: {}'.format(tap_stream_id, find_filter)
    if projection:
        query_message += '\n\tProjection: {}'.format(projection)
    LOGGER.info(query_message)


    # query collection
    schema = {"type": "object", "properties": {}}
    with collection.find(find_filter,
                         projection,
                         sort=[(replication_key_name, pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        time_extracted = utils.now()
        start_time = time.time()

        for row in cursor:
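            # documents are schemaless, so the JSON schema is grown row by row and
            # re-emitted whenever a new field or type is observed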
            schema_build_start_time = time.time()
            if common.row_to_schema(schema, row):
                singer.write_message(singer.SchemaMessage(
                    stream=common.calculate_destination_stream_name(stream),
                    schema=schema,
                    key_properties=['_id']))
                common.SCHEMA_COUNT[tap_stream_id] += 1
            common.SCHEMA_TIMES[tap_stream_id] += time.time() - schema_build_start_time


            record_message = common.row_to_singer_record(stream,
                                                         row,
                                                         stream_version,
                                                         time_extracted)

            # gen_schema = common.row_to_schema_message(schema, record_message.record, row)
            # if DeepDiff(schema, gen_schema, ignore_order=True) != {}:
            #   emit gen_schema
            #   schema = gen_schema
            singer.write_message(record_message)
            rows_saved += 1

            update_bookmark(row, state, tap_stream_id, replication_key_name)

            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))


        common.COUNTS[tap_stream_id] += rows_saved
        common.TIMES[tap_stream_id] += time.time()-start_time

    singer.write_message(activate_version_message)

    LOGGER.info('Synced %s records for %s', rows_saved, tap_stream_id)
def sync_tables(conn_info, logical_streams, state, end_lsn):
    start_lsn = 0
    time_extracted = utils.now()
    slot = locate_replication_slot(conn_info)
    last_lsn_processed = None
    poll_total_seconds = conn_info[
        'logical_poll_total_seconds'] or 60 * 5  # we are willing to poll for a total of 5 minutes without finding a record
    keep_alive_time = 0.1
    begin_ts = datetime.datetime.now()
    add_tables = []

    for s in logical_streams:
        sync_common.send_schema_message(s, ['lsn'])

    for s in logical_streams:
        add_tables.append("{}.{}".format(
            s["metadata"][0]["metadata"]["schema-name"], s["table_name"]))

    if "lsn_to_flush" in state:
        start_lsn = state["lsn_to_flush"]

    conn = post_db.open_connection(conn_info, True)
    with conn.cursor() as cur:
        LOGGER.info(
            "Starting Logical Replication from slot %s: %s -> %s. poll_total_seconds: %s",
            slot, start_lsn, end_lsn, poll_total_seconds)
        LOGGER.info("Starting at bookmark: %s",
                    list(map(lambda s: s['tap_stream_id'], logical_streams)))

        replication_params = {
            "slot_name": slot,
            "decode": True,
            "start_lsn": start_lsn,
            "options": {
                "add-tables": ", ".join(add_tables)
            }
        }
        message_format = conn_info.get("wal2json_message_format") or "1"
        if message_format == "2":
            LOGGER.info("Using wal2json format-version 2")
            replication_params["options"]["format-version"] = 2
            replication_params["options"]["include-timestamp"] = True

        try:
            cur.start_replication(**replication_params)
        except psycopg2.ProgrammingError:
            raise Exception(
                "unable to start replication with logical replication slot {}".
                format(slot))

        # initial flush lsn from the previous run
        if "lsn_to_flush" in state:
            LOGGER.info("Flushing lsn %s from from previous job run!",
                        str(state["lsn_to_flush"]))
            cur.send_feedback(flush_lsn=state["lsn_to_flush"])

        rows_saved = 0
        idle_count = 0
        while True:
            poll_duration = (datetime.datetime.now() -
                             begin_ts).total_seconds()
            if poll_duration > poll_total_seconds:
                LOGGER.info(
                    "breaking after %s seconds of polling with no data",
                    poll_duration)
                if not last_lsn_processed:
                    state["lsn_to_flush"] = end_lsn
                break

            read_message_start = datetime.datetime.now()
            msg = cur.read_message()
            COUNTER['read_message'] += (datetime.datetime.now() -
                                        read_message_start).total_seconds()
            if msg:
                idle_count = 0
                begin_ts = datetime.datetime.now()
                if msg.data_start > end_lsn:
                    LOGGER.info("gone past end_lsn %s for run. breaking",
                                end_lsn)
                    break

                state = consume_message(logical_streams,
                                        state,
                                        msg,
                                        time_extracted,
                                        conn_info,
                                        end_lsn,
                                        message_format=message_format)
                # message consumed and processed; record it as the last lsn handled
                last_lsn_processed = msg.data_start
                rows_saved = rows_saved + 1
                if rows_saved % COUNTER_PRINT_PERIOD == 0:
                    LOGGER.info(
                        "Rows saved = %s, Processed messages counter: %s",
                        str(rows_saved), str(COUNTER))
                if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.debug("Sending state to loader: %s", str(state))
                    singer.write_message(
                        singer.StateMessage(value=copy.deepcopy(state)))
            else:
                idle_count += 1
                if idle_count > 100:
                    idle_count = 0
                    tmp_poll_duration = (datetime.datetime.now() -
                                         begin_ts).total_seconds()
                    LOGGER.info(
                        "No data for ~10 seconds (%s seconds from start). sending feedback to server with NO flush_lsn. just a keep-alive",
                        tmp_poll_duration)
                    cur.send_feedback()
                else:
                    time.sleep(keep_alive_time)

    bookmark_lsn = last_lsn_processed if last_lsn_processed else end_lsn
    LOGGER.info("Finished processing messages - counter: %s", str(COUNTER))
    for s in logical_streams:
        LOGGER.info("updating bookmark for stream %s to last_lsn_processed %s",
                    s['tap_stream_id'], bookmark_lsn)
        state = singer.write_bookmark(state, s['tap_stream_id'], 'lsn',
                                      bookmark_lsn)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state
def sync_table(conn_config, stream, state, desired_columns):
    connection = orc_db.open_connection(conn_config)
    connection.outputtypehandler = common.OutputTypeHandler

    cur = connection.cursor()
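    # normalize the session so DATE/TIMESTAMP values come back as UTC ISO-8601-style strings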
    cur.execute("ALTER SESSION SET TIME_ZONE = '00:00'")
    cur.execute(
        """ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS."00+00:00"'"""
    )
    cur.execute(
        """ALTER SESSION SET NLS_TIMESTAMP_FORMAT='YYYY-MM-DD"T"HH24:MI:SSXFF"+00:00"'"""
    )
    cur.execute(
        """ALTER SESSION SET NLS_TIMESTAMP_TZ_FORMAT  = 'YYYY-MM-DD"T"HH24:MI:SS.FFTZH:TZM'"""
    )
    time_extracted = utils.now()

    stream_version = singer.get_bookmark(state, stream.tap_stream_id,
                                         'version')
    # If there was no bookmark for stream_version, it is the first time
    # this table is being sync'd, so get a new version, write to
    # state
    if stream_version is None:
        stream_version = int(time.time() * 1000)
        state = singer.write_bookmark(state, stream.tap_stream_id, 'version',
                                      stream_version)
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    activate_version_message = singer.ActivateVersionMessage(
        stream=stream.stream, version=stream_version)
    singer.write_message(activate_version_message)

    md = metadata.to_map(stream.metadata)
    schema_name = md.get(()).get('schema-name')

    escaped_columns = map(lambda c: common.prepare_columns_sql(stream, c),
                          desired_columns)
    escaped_schema = schema_name
    escaped_table = stream.table

    replication_key = md.get((), {}).get('replication-key')
    #escaped_replication_key = common.prepare_columns_sql(stream, replication_key)
    replication_key_value = singer.get_bookmark(state, stream.tap_stream_id,
                                                'replication_key_value')
    replication_key_sql_datatype = md.get(
        ('properties', replication_key)).get('sql-datatype')

    with metrics.record_counter(None) as counter:
        if replication_key_value:
            LOGGER.info("Resuming Incremental replication from %s = %s",
                        replication_key, replication_key_value)
            casted_where_clause_arg = common.prepare_where_clause_arg(
                replication_key_value, replication_key_sql_datatype)

            select_sql = """SELECT {}
                                FROM {}.{}
                               WHERE {} >= {}
                               ORDER BY {} ASC
                                """.format(','.join(escaped_columns),
                                           escaped_schema, escaped_table,
                                           replication_key,
                                           casted_where_clause_arg,
                                           replication_key)
        else:
            select_sql = """SELECT {}
                                FROM {}.{}
                               ORDER BY {} ASC
                               """.format(','.join(escaped_columns),
                                          escaped_schema, escaped_table,
                                          replication_key)

        rows_saved = 0
        LOGGER.info("select %s", select_sql)
        for row in cur.execute(select_sql):
            record_message = common.row_to_singer_message(
                stream, row, stream_version, desired_columns, time_extracted)

            singer.write_message(record_message)
            rows_saved = rows_saved + 1

            # Picking a replication_key with NULL values will result in it ALWAYS being synced, which is not great.
            # Even worse would be allowing the NULL value to enter into the state.
            if record_message.record[replication_key] is not None:
                state = singer.write_bookmark(
                    state, stream.tap_stream_id, 'replication_key_value',
                    record_message.record[replication_key])

            if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

            counter.increment()

    cur.close()
    connection.close()
    return state
예제 #28
0
def sync_collection(client, stream, state, stream_projection):
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting oplog sync for %s', tap_stream_id)

    md_map = metadata.to_map(stream['metadata'])
    database_name = metadata.get(md_map, (), 'database-name')
    collection_name = stream.get("table_name")
    stream_state = state.get('bookmarks', {}).get(tap_stream_id)

    oplog_ts = timestamp.Timestamp(stream_state['oplog_ts_time'],
                                   stream_state['oplog_ts_inc'])

    # Write activate version message
    version = common.get_stream_version(tap_stream_id, state)
    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=version)
    singer.write_message(activate_version_message)

    time_extracted = utils.now()
    rows_saved = 0
    start_time = time.time()

    oplog_query = {'ts': {'$gte': oplog_ts}}

    projection = transform_projection(stream_projection)

    oplog_replay = stream_projection is None

    LOGGER.info(
        'Querying %s with:\n\tFind Parameters: %s\n\tProjection: %s\n\toplog_replay: %s',
        tap_stream_id, oplog_query, projection, oplog_replay)

    update_buffer = set()
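    # oplog update entries only carry the changed fields, so the _ids of updated
    # documents are buffered here and the full documents are fetched later via flush_buffer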

    # consider adding oplog_replay, but this would require removing the projection
    # default behavior is a non-tailable cursor, but we might want a tailable one
    # regardless of whether it's long-lived or not.
    with client.local.oplog.rs.find(oplog_query,
                                    projection,
                                    sort=[('$natural', pymongo.ASCENDING)],
                                    oplog_replay=oplog_replay) as cursor:
        for row in cursor:
            # assertions that mongo is respecting the ts query and sort order
            if row.get('ts') and row.get('ts') < oplog_ts:
                raise common.MongoAssertionException(
                    "Mongo is not honoring the query param")
            if row.get('ts') and row.get('ts') < timestamp.Timestamp(
                    stream_state['oplog_ts_time'],
                    stream_state['oplog_ts_inc']):
                raise common.MongoAssertionException(
                    "Mongo is not honoring the sort ascending param")

            if row.get('ns') != '{}.{}'.format(database_name, collection_name):
                if row.get('ts'):
                    state = update_bookmarks(state, tap_stream_id, row['ts'])
                continue

            row_op = row['op']
            if row_op == 'i':

                record_message = common.row_to_singer_record(
                    stream, row['o'], version, time_extracted)
                singer.write_message(record_message)

                rows_saved += 1

            elif row_op == 'u':
                update_buffer.add(row['o2']['_id'])

            elif row_op == 'd':

                # remove update from buffer if that document has been deleted
                if row['o']['_id'] in update_buffer:
                    update_buffer.remove(row['o']['_id'])

                # Delete ops only contain the _id of the row deleted
                row['o'][SDC_DELETED_AT] = row['ts']

                record_message = common.row_to_singer_record(
                    stream, row['o'], version, time_extracted)
                singer.write_message(record_message)

                rows_saved += 1

            state = update_bookmarks(state, tap_stream_id, row['ts'])

            # flush buffer if it has filled up
            if len(update_buffer) >= MAX_UPDATE_BUFFER_LENGTH:
                for buffered_row in flush_buffer(client, update_buffer,
                                                 stream_projection,
                                                 database_name,
                                                 collection_name):
                    record_message = common.row_to_singer_record(
                        stream, buffered_row, version, time_extracted)
                    singer.write_message(record_message)

                    rows_saved += 1
                update_buffer = set()

            # write state every UPDATE_BOOKMARK_PERIOD messages
            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                # flush buffer before writing state
                for buffered_row in flush_buffer(client, update_buffer,
                                                 stream_projection,
                                                 database_name,
                                                 collection_name):
                    record_message = common.row_to_singer_record(
                        stream, buffered_row, version, time_extracted)
                    singer.write_message(record_message)

                    rows_saved += 1
                update_buffer = set()

                # write state
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

        # flush buffer if finished with oplog
        for buffered_row in flush_buffer(client, update_buffer,
                                         stream_projection, database_name,
                                         collection_name):
            record_message = common.row_to_singer_record(
                stream, buffered_row, version, time_extracted)

            singer.write_message(record_message)
            rows_saved += 1

    common.COUNTS[tap_stream_id] += rows_saved
    common.TIMES[tap_stream_id] += time.time() - start_time
    LOGGER.info('Synced %s records for %s', rows_saved, tap_stream_id)
예제 #29
0
def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    #before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream.tap_stream_id,
                                    'version') is None

    #pick a new table version IFF we do not have an xmin in our state
    #the presence of an xmin indicates that we were interrupted last time through
    if singer.get_bookmark(state, stream.tap_stream_id, 'xmin') is None:
        nascent_stream_version = int(time.time() * 1000)
    else:
        nascent_stream_version = singer.get_bookmark(state,
                                                     stream.tap_stream_id,
                                                     'version')

    state = singer.write_bookmark(state, stream.tap_stream_id, 'version',
                                  nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get('schema-name')

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=stream.stream, version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                xmin = singer.get_bookmark(state, stream.tap_stream_id, 'xmin')
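                # age(xid) grows as a transaction gets older, so the predicate below
                # keeps only rows whose xmin is newer than the bookmarked xmin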
                if xmin:
                    LOGGER.info(
                        "Resuming Full Table replication %s from xmin %s",
                        nascent_stream_version, xmin)
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {} where age(xmin::xid) < age('{}'::xid)
                                     ORDER BY xmin::text ASC""".format(
                        ','.join(escaped_columns),
                        post_db.fully_qualified_table_name(
                            schema_name, stream.table), xmin)
                else:
                    LOGGER.info("Beginning new Full Table replication %s",
                                nascent_stream_version)
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {}
                                     ORDER BY xmin::text ASC""".format(
                        ','.join(escaped_columns),
                        post_db.fully_qualified_table_name(
                            schema_name, stream.table))

                LOGGER.info("select %s", select_sql)
                cur.execute(select_sql)

                rows_saved = 0
                rec = cur.fetchone()
                while rec is not None:
                    xmin = rec['xmin']
                    rec = rec[:-1]
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, nascent_stream_version, desired_columns,
                        time_extracted, md_map)
                    singer.write_message(record_message)
                    state = singer.write_bookmark(state, stream.tap_stream_id,
                                                  'xmin', xmin)
                    rows_saved = rows_saved + 1
                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()
                    rec = cur.fetchone()

    #once we have completed the full table replication, discard the xmin bookmark.
    #the xmin bookmark only comes into play when a full table replication is interrupted
    state = singer.write_bookmark(state, stream.tap_stream_id, 'xmin', None)

    #always send the activate version whether first run or subsequent
    singer.write_message(activate_version_message)

    return state
def sync_tables(conn_info, logical_streams, state, end_lsn, state_file):
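    # start streaming from the lowest lsn committed across the selected streams
    # so that no stream misses changes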
    lsn_comitted = min([
        get_bookmark(state, s['tap_stream_id'], 'lsn') for s in logical_streams
    ])
    start_lsn = lsn_comitted
    lsn_to_flush = None
    time_extracted = utils.now()
    slot = locate_replication_slot(conn_info)
    lsn_last_processed = None
    lsn_currently_processing = None
    lsn_received_timestamp = None
    lsn_processed_count = 0
    logical_poll_total_seconds = conn_info['logical_poll_total_seconds'] or 300
    poll_interval = 10
    poll_timestamp = None

    selected_tables = []
    for s in logical_streams:
        selected_tables.append("{}.{}".format(
            s['metadata'][0]['metadata']['schema-name'], s['table_name']))

    for s in logical_streams:
        sync_common.send_schema_message(s, ['lsn'])

    # Create replication connection and cursor
    conn = post_db.open_connection(conn_info, True)
    cur = conn.cursor()

    try:
        LOGGER.info("{} : Starting log streaming at {} to {} (slot {})".format(
            datetime.datetime.utcnow(), int_to_lsn(start_lsn),
            int_to_lsn(end_lsn), slot))
        cur.start_replication(slot_name=slot,
                              decode=True,
                              start_lsn=start_lsn,
                              options={
                                  'write-in-chunks': 1,
                                  'add-tables': ','.join(selected_tables)
                              })
    except psycopg2.ProgrammingError:
        raise Exception(
            "Unable to start replication with logical replication (slot {})".
            format(slot))

    # Emulate some behaviour of pg_recvlogical
    LOGGER.info("{} : Confirming write up to 0/0, flush to 0/0".format(
        datetime.datetime.utcnow()))
    cur.send_feedback(write_lsn=0, flush_lsn=0, reply=True)
    time.sleep(1)

    lsn_received_timestamp = datetime.datetime.utcnow()
    poll_timestamp = datetime.datetime.utcnow()

    while True:
        # Disconnect when no data received for logical_poll_total_seconds
        # needs to be long enough to wait for the largest single wal payload to avoid unplanned timeouts
        poll_duration = (datetime.datetime.utcnow() -
                         lsn_received_timestamp).total_seconds()
        if poll_duration > logical_poll_total_seconds:
            LOGGER.info(
                "{} : Breaking - {} seconds of polling with no data".format(
                    datetime.datetime.utcnow(), poll_duration))
            break

        try:
            msg = cur.read_message()
        except Exception as e:
            LOGGER.error("{} : {}".format(datetime.datetime.utcnow(), e))
            raise

        if msg:
            if msg.data_start > end_lsn:
                LOGGER.info(
                    "{} : Breaking - current {} is past end_lsn {}".format(
                        datetime.datetime.utcnow(), int_to_lsn(msg.data_start),
                        int_to_lsn(end_lsn)))
                break

            state = consume_message(logical_streams, state, msg,
                                    time_extracted, conn_info, end_lsn)

            # When using wal2json with write-in-chunks, multiple messages can have the same lsn
            # This is to ensure we only flush to lsn that has completed entirely
            if lsn_currently_processing is None:
                lsn_currently_processing = msg.data_start
                LOGGER.info("{} : First message received is {} at {}".format(
                    datetime.datetime.utcnow(),
                    int_to_lsn(lsn_currently_processing),
                    datetime.datetime.utcnow()))

                # Flush Postgres wal up to lsn comitted in previous run, or first lsn received in this run
                lsn_to_flush = lsn_comitted
                if lsn_currently_processing < lsn_to_flush:
                    lsn_to_flush = lsn_currently_processing
                LOGGER.info(
                    "{} : Confirming write up to {}, flush to {}".format(
                        datetime.datetime.utcnow(), int_to_lsn(lsn_to_flush),
                        int_to_lsn(lsn_to_flush)))
                cur.send_feedback(write_lsn=lsn_to_flush,
                                  flush_lsn=lsn_to_flush,
                                  reply=True)

            elif (int(msg.data_start) > lsn_currently_processing):
                lsn_last_processed = lsn_currently_processing
                lsn_currently_processing = msg.data_start
                lsn_received_timestamp = datetime.datetime.utcnow()
                lsn_processed_count = lsn_processed_count + 1
                if lsn_processed_count >= UPDATE_BOOKMARK_PERIOD:
                    singer.write_message(
                        singer.StateMessage(value=copy.deepcopy(state)))
                    lsn_processed_count = 0

        # When data is received, and when data is not received, a keep-alive poll needs to be returned to PostgreSQL
        if datetime.datetime.utcnow() >= (
                poll_timestamp + datetime.timedelta(seconds=poll_interval)):
            if lsn_currently_processing is None:
                LOGGER.info(
                    "{} : Sending keep-alive message to source server (last message received was {} at {})"
                    .format(datetime.datetime.utcnow(),
                            int_to_lsn(lsn_last_processed),
                            lsn_received_timestamp))
                cur.send_feedback()
            elif state_file is None:
                LOGGER.info(
                    "{} : Sending keep-alive message to source server (last message received was {} at {})"
                    .format(datetime.datetime.utcnow(),
                            int_to_lsn(lsn_last_processed),
                            lsn_received_timestamp))
                cur.send_feedback()
            else:
                # Read lsn_comitted currently captured in state file on disk
                lsn_comitted = min([
                    get_bookmark(utils.load_json(state_file),
                                 s['tap_stream_id'], 'lsn')
                    for s in logical_streams
                ])
                lsn_to_flush = lsn_comitted
                if lsn_currently_processing < lsn_to_flush:
                    lsn_to_flush = lsn_currently_processing
                LOGGER.info(
                    "{} : Confirming write up to {}, flush to {} (last message received was {} at {})"
                    .format(datetime.datetime.utcnow(),
                            int_to_lsn(lsn_to_flush), int_to_lsn(lsn_to_flush),
                            int_to_lsn(lsn_last_processed),
                            lsn_received_timestamp))
                cur.send_feedback(write_lsn=lsn_to_flush,
                                  flush_lsn=lsn_to_flush,
                                  reply=True)
            poll_timestamp = datetime.datetime.utcnow()

    # Close replication connection and cursor
    cur.close()
    conn.close()

    if lsn_last_processed:
        if lsn_comitted > lsn_last_processed:
            LOGGER.info(
                "Current lsn_last_processed {} is older than lsn_comitted {}".
                format(int_to_lsn(lsn_last_processed),
                       int_to_lsn(lsn_comitted)))
            lsn_last_processed = lsn_comitted
        for s in logical_streams:
            LOGGER.info(
                "updating bookmark for stream {} to lsn = {} ({})".format(
                    s['tap_stream_id'], lsn_last_processed,
                    int_to_lsn(lsn_last_processed)))
            state = singer.write_bookmark(state, s['tap_stream_id'], 'lsn',
                                          lsn_last_processed)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state