def locate_replication_slot(conn_info):
    with post_db.open_connection(conn_info, False) as conn:
        with conn.cursor() as cur:
            db_specific_slot = "stitch_{}".format(conn_info["dbname"])
            cur.execute(
                "SELECT * FROM pg_replication_slots WHERE slot_name = %s AND plugin = %s",
                (db_specific_slot, "wal2json"),
            )
            if len(cur.fetchall()) == 1:
                LOGGER.info("using pg_replication_slot %s", db_specific_slot)
                return db_specific_slot

            cur.execute(
                """SELECT * FROM pg_replication_slots
                   WHERE slot_name = 'stitch' AND plugin = 'wal2json'"""
            )
            if len(cur.fetchall()) == 1:
                LOGGER.info("using pg_replication_slot 'stitch'")
                return "stitch"

            raise Exception(
                "Unable to find replication slot ('stitch' or {}) with wal2json".format(
                    db_specific_slot
                )
            )

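# Illustrative only (not part of the tap): locate_replication_slot() expects a wal2json
# slot named either 'stitch_<dbname>' or the generic 'stitch'. Such a slot would typically
# be created on the source database ahead of time, for example:
#
#   SELECT * FROM pg_create_logical_replication_slot('stitch_mydb', 'wal2json');
#
# where 'mydb' is a placeholder matching conn_info["dbname"]. Only the two slot-name
# patterns checked above are recognized here.
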
def row_to_singer_message(stream, row, version, columns, time_extracted, md_map, conn_info):
    row_to_persist = ()
    md_map[("properties", "_sdc_deleted_at")] = {"sql-datatype": "timestamp with time zone"}
    md_map[("properties", "_sdc_lsn")] = {"sql-datatype": "character varying"}

    for idx, elem in enumerate(row):
        sql_datatype = md_map.get(("properties", columns[idx])).get("sql-datatype")

        if not sql_datatype:
            LOGGER.info("No sql-datatype found for stream %s: %s", stream, columns[idx])
            raise Exception("Unable to find sql-datatype for stream {}".format(stream))

        cleaned_elem = selected_value_to_singer_value(elem, sql_datatype, conn_info)
        row_to_persist += (cleaned_elem,)

    rec = dict(zip(columns, row_to_persist))

    return singer.RecordMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        record=rec,
        version=version,
        time_extracted=time_extracted,
    )

def open_connection(conn_config, logical_replication=False):
    cfg = {
        "host": conn_config["host"],
        "dbname": conn_config["dbname"],
        "user": conn_config["user"],
        "password": conn_config["password"],
        "port": conn_config["port"],
        "connect_timeout": 30,
    }

    if conn_config.get("sslmode"):
        cfg["sslmode"] = conn_config["sslmode"]
        if cfg["sslmode"] == "verify-ca":
            cfg["sslrootcert"] = conn_config.get("sslrootcert")
            cfg["sslcert"] = conn_config.get("sslcert")
            cfg["sslkey"] = conn_config.get("sslkey")

    if logical_replication:
        cfg["connection_factory"] = psycopg2.extras.LogicalReplicationConnection

    # avoid leaking credentials: mask the password when logging the connection config
    LOGGER.info(
        "Trying to connect with config = %s",
        {k: ("****" if k == "password" else v) for k, v in cfg.items()},
    )
    conn = psycopg2.connect(**cfg)
    LOGGER.info("Connected with sslmode = %s", conn.get_dsn_parameters().get("sslmode", "unknown"))

    return conn

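# A minimal sketch of the conn_config dict consumed above (values are placeholders, not
# part of the source): host/dbname/user/password/port are required; sslmode is optional,
# and 'verify-ca' additionally pulls in sslrootcert, sslcert and sslkey.
#
#   conn_config = {
#       "host": "db.example.com",
#       "dbname": "mydb",
#       "user": "replication_user",
#       "password": "...",
#       "port": 5432,
#       "sslmode": "verify-ca",          # optional
#       "sslrootcert": "/path/ca.crt",   # only read when sslmode == "verify-ca"
#       "sslcert": "/path/client.crt",
#       "sslkey": "/path/client.key",
#   }
#   conn = open_connection(conn_config, logical_replication=False)
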
def do_sync_incremental(conn_config, stream, state, desired_columns, md_map):
    replication_key = md_map.get((), {}).get("replication-key")
    LOGGER.info(
        "Stream %s is using incremental replication with replication key %s",
        stream["tap_stream_id"],
        replication_key,
    )

    # default to an empty dict so a stream with no bookmark yet does not blow up on .keys()
    stream_state = state.get("bookmarks", {}).get(stream["tap_stream_id"], {})
    illegal_bk_keys = set(stream_state.keys()).difference(
        set(["replication_key", "replication_key_value", "version", "last_replication_method"])
    )
    if len(illegal_bk_keys) != 0:
        raise Exception("invalid keys found in state: {}".format(illegal_bk_keys))

    state = singer.write_bookmark(state, stream["tap_stream_id"], "replication_key", replication_key)

    sync_common.send_schema_message(stream, [replication_key])
    state = incremental.sync_table(conn_config, stream, state, desired_columns, md_map)

    return state

def sync_view(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream["tap_stream_id"], "version") is None

    nascent_stream_version = int(time.time() * 1000)
    state = singer.write_bookmark(state, stream["tap_stream_id"], "version", nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get("schema-name")

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        version=nascent_stream_version,
    )

    if first_run:
        singer.write_message(activate_version_message)

    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor, name="stitch_cursor") as cur:
                cur.itersize = post_db.cursor_iter_size

                select_sql = "SELECT {} FROM {}".format(
                    ",".join(escaped_columns),
                    post_db.fully_qualified_table_name(schema_name, stream["table_name"]),
                )

                LOGGER.info("select %s with itersize %s", select_sql, cur.itersize)
                cur.execute(select_sql)

                rows_saved = 0
                for rec in cur:
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, nascent_stream_version, desired_columns, time_extracted, md_map
                    )
                    singer.write_message(record_message)
                    rows_saved = rows_saved + 1
                    if rows_saved % conn_info["emit_state_every_n_rows"] == 0:
                        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    # always send the activate version whether first run or subsequent
    singer.write_message(activate_version_message)

    return state

def get_pg_version(cur):
    cur.execute("SELECT version()")
    res = cur.fetchone()[0]
    version_match = re.match(r"PostgreSQL (\d+)", res)
    if not version_match:
        raise Exception("unable to determine PostgreSQL version from {}".format(res))

    version = int(version_match.group(1))
    LOGGER.info("Detected PostgreSQL version: %s", version)
    return version

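# Illustrative: SELECT version() returns a banner such as
#   "PostgreSQL 12.7 on x86_64-pc-linux-gnu, compiled by gcc ..."
# so re.match(r"PostgreSQL (\d+)", ...) captures only the major version ("12" -> 12).
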
def do_sync_full_table(conn_config, stream, state, desired_columns, md_map):
    LOGGER.info("Stream %s is using full_table replication", stream["tap_stream_id"])
    sync_common.send_schema_message(stream, [])
    if md_map.get((), {}).get("is-view"):
        state = full_table.sync_view(conn_config, stream, state, desired_columns, md_map)
    else:
        state = full_table.sync_table(conn_config, stream, state, desired_columns, md_map)
    return state

def add_automatic_properties(stream, conn_config):
    stream["schema"]["properties"]["_sdc_deleted_at"] = {
        "type": ["null", "string"],
        "format": "date-time",
    }
    if conn_config.get("debug_lsn"):
        LOGGER.info("debug_lsn is ON")
        stream["schema"]["properties"]["_sdc_lsn"] = {"type": ["null", "string"]}
    else:
        LOGGER.info("debug_lsn is OFF")

    return stream

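# For illustration: with debug_lsn enabled, the stream schema gains two nullable
# string properties (the stream itself is whatever catalog entry is passed in):
#
#   stream["schema"]["properties"]["_sdc_deleted_at"]
#       == {"type": ["null", "string"], "format": "date-time"}
#   stream["schema"]["properties"]["_sdc_lsn"]
#       == {"type": ["null", "string"]}
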
def fetch_max_replication_key(conn_config, replication_key, schema_name, table_name):
    with post_db.open_connection(conn_config, False) as conn:
        with conn.cursor() as cur:
            max_key_sql = """SELECT max({}) FROM {}""".format(
                post_db.prepare_columns_sql(replication_key),
                post_db.fully_qualified_table_name(schema_name, table_name),
            )
            LOGGER.info("determine max replication key value: %s", max_key_sql)
            cur.execute(max_key_sql)
            max_key = cur.fetchone()[0]
            LOGGER.info("max replication key value: %s", max_key)
            return max_key

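# Sketch of the generated SQL for a hypothetical table public.orders with replication
# key updated_at (identifier quoting is whatever prepare_columns_sql and
# fully_qualified_table_name produce):
#
#   SELECT max("updated_at") FROM "public"."orders"
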
def do_sync(conn_config, catalog, default_replication_method, state):
    currently_syncing = singer.get_currently_syncing(state)
    streams = list(filter(is_selected_via_metadata, catalog["streams"]))
    streams.sort(key=lambda s: s["tap_stream_id"])
    LOGGER.info("Selected streams: %s ", list(map(lambda s: s["tap_stream_id"], streams)))

    if any_logical_streams(streams, default_replication_method):
        LOGGER.info("Use of logical replication requires fetching an lsn...")
        end_lsn = logical_replication.fetch_current_lsn(conn_config)
        LOGGER.info("End LSN: %s ", end_lsn)
    else:
        end_lsn = None

    sync_method_lookup, traditional_streams, logical_streams = sync_method_for_streams(
        streams, state, default_replication_method
    )

    if currently_syncing:
        LOGGER.info("found currently_syncing: %s", currently_syncing)
        currently_syncing_stream = list(
            filter(lambda s: s["tap_stream_id"] == currently_syncing, traditional_streams)
        )

        # list(filter(...)) is never None, so check for an empty result instead
        if not currently_syncing_stream:
            LOGGER.warning(
                "Can't find currently_syncing(%s) in selected traditional streams(%s). Will ignore",
                currently_syncing,
                list(map(lambda s: s["tap_stream_id"], traditional_streams)),
            )

        other_streams = list(
            filter(lambda s: s["tap_stream_id"] != currently_syncing, traditional_streams)
        )
        traditional_streams = currently_syncing_stream + other_streams
    else:
        LOGGER.info("No currently_syncing found")

    for stream in traditional_streams:
        state = sync_traditional_stream(
            conn_config, stream, state, sync_method_lookup[stream["tap_stream_id"]], end_lsn
        )

    logical_streams.sort(
        key=lambda s: metadata.to_map(s["metadata"]).get(()).get("database-name")
    )
    for dbname, streams in itertools.groupby(
        logical_streams, lambda s: metadata.to_map(s["metadata"]).get(()).get("database-name")
    ):
        conn_config["dbname"] = dbname
        state = sync_logical_streams(conn_config, list(streams), state, end_lsn)

    return state

def attempt_connection_to_db(conn_config, dbname):
    nascent_config = copy.deepcopy(conn_config)
    nascent_config["dbname"] = dbname
    LOGGER.info("(%s) Testing connectivity...", dbname)
    try:
        conn = post_db.open_connection(nascent_config)
        LOGGER.info("(%s) connectivity verified", dbname)
        conn.close()
        return True
    except Exception as err:
        LOGGER.warning(
            'Unable to connect to %s. This is OK if you do not replicate from this database: "%s"',
            dbname,
            err,
        )
        return False

def sync_logical_streams(conn_config, logical_streams, state, end_lsn):
    if logical_streams:
        LOGGER.info(
            "Pure Logical Replication up to lsn %s for (%s)",
            end_lsn,
            list(map(lambda s: s["tap_stream_id"], logical_streams)),
        )

        logical_streams = list(
            map(
                lambda s: logical_replication.add_automatic_properties(s, conn_config),
                logical_streams,
            )
        )

        state = logical_replication.sync_tables(conn_config, logical_streams, state, end_lsn)

    return state

def do_discovery(conn_config):
    all_streams = []

    with post_db.open_connection(conn_config) as conn:
        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor, name="stitch_cursor") as cur:
            cur.itersize = post_db.cursor_iter_size
            sql = """SELECT datname FROM pg_database
                     WHERE datistemplate = false
                       AND datname != 'rdsadmin'"""

            if conn_config.get("filter_dbs"):
                sql = post_db.filter_dbs_sql_clause(sql, conn_config["filter_dbs"])

            LOGGER.info("Running DB discovery: %s with itersize %s", sql, cur.itersize)

            cur.execute(sql)
            found_dbs = (row[0] for row in cur.fetchall())
            filter_dbs = filter(
                lambda dbname: attempt_connection_to_db(conn_config, dbname), found_dbs
            )

    for db_row in filter_dbs:
        dbname = db_row
        LOGGER.info("Discovering db %s", dbname)
        conn_config["dbname"] = dbname
        with post_db.open_connection(conn_config) as conn:
            db_streams = discover_db(conn)
            all_streams = all_streams + db_streams

    if len(all_streams) == 0:
        raise RuntimeError("0 tables were discovered across the entire cluster")

    dump_catalog(all_streams)
    return all_streams

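# Illustrative: discovery skips template databases and 'rdsadmin'; an optional
# conn_config["filter_dbs"] value (assumed here to be a comma-separated list of database
# names, as consumed by filter_dbs_sql_clause) narrows the pg_database query further, e.g.
#
#   conn_config["filter_dbs"] = "mydb,analytics"
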
def open_connection(conn_config, logical_replication=False):
    cfg = {
        "host": conn_config["host"],
        "dbname": conn_config["dbname"],
        "user": conn_config["user"],
        "password": conn_config["password"],
        "port": conn_config["port"],
        "connect_timeout": 30,
    }

    if conn_config.get("sslmode"):
        cfg["sslmode"] = conn_config["sslmode"]

    if logical_replication:
        cfg["connection_factory"] = psycopg2.extras.LogicalReplicationConnection

    conn = psycopg2.connect(**cfg)
    LOGGER.info(
        "Connected with sslmode = %s",
        conn.get_dsn_parameters().get("sslmode", "unknown"),
    )

    return conn

def sync_traditional_stream(conn_config, stream, state, sync_method, end_lsn):
    LOGGER.info(
        "Beginning sync of stream(%s) with sync method(%s)",
        stream["tap_stream_id"],
        sync_method,
    )
    md_map = metadata.to_map(stream["metadata"])
    conn_config["dbname"] = md_map.get(()).get("database-name")
    desired_columns = [
        c for c in stream["schema"]["properties"].keys()
        if sync_common.should_sync_column(md_map, c)
    ]
    desired_columns.sort()

    if len(desired_columns) == 0:
        LOGGER.warning(
            "There are no columns selected for stream %s, skipping it",
            stream["tap_stream_id"],
        )
        return state

    register_type_adapters(conn_config)

    if sync_method == "full":
        state = singer.set_currently_syncing(state, stream["tap_stream_id"])
        state = do_sync_full_table(conn_config, stream, state, desired_columns, md_map)
    elif sync_method == "incremental":
        state = singer.set_currently_syncing(state, stream["tap_stream_id"])
        state = do_sync_incremental(conn_config, stream, state, desired_columns, md_map)
    elif sync_method == "logical_initial":
        state = singer.set_currently_syncing(state, stream["tap_stream_id"])
        LOGGER.info("Performing initial full table sync")
        state = singer.write_bookmark(state, stream["tap_stream_id"], "lsn", end_lsn)

        sync_common.send_schema_message(stream, [])
        state = full_table.sync_table(conn_config, stream, state, desired_columns, md_map)
        state = singer.write_bookmark(state, stream["tap_stream_id"], "xmin", None)
    elif sync_method == "logical_initial_interrupted":
        state = singer.set_currently_syncing(state, stream["tap_stream_id"])
        LOGGER.info("Initial stage of full table sync was interrupted. resuming...")
        sync_common.send_schema_message(stream, [])
        state = full_table.sync_table(conn_config, stream, state, desired_columns, md_map)
    else:
        raise Exception(
            "unknown sync method {} for stream {}".format(sync_method, stream["tap_stream_id"])
        )

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state

def sync_tables(conn_info, logical_streams, state, end_lsn):
    start_lsn = min(
        [get_bookmark(state, s["tap_stream_id"], "lsn") for s in logical_streams]
    )
    time_extracted = utils.now()
    slot = locate_replication_slot(conn_info)
    last_lsn_processed = None
    poll_total_seconds = (
        conn_info["logical_poll_total_seconds"] or 60 * 30
    )  # we are willing to poll for a total of 30 minutes without finding a record
    keep_alive_time = 10.0
    begin_ts = datetime.datetime.now()

    for s in logical_streams:
        sync_common.send_schema_message(s, ["lsn"])

    with post_db.open_connection(conn_info, True) as conn:
        with conn.cursor() as cur:
            LOGGER.info(
                "Starting Logical Replication for %s(%s): %s -> %s. poll_total_seconds: %s",
                list(map(lambda s: s["tap_stream_id"], logical_streams)),
                slot,
                start_lsn,
                end_lsn,
                poll_total_seconds,
            )
            try:
                cur.start_replication(slot_name=slot, decode=True, start_lsn=start_lsn)
            except psycopg2.ProgrammingError:
                raise Exception(
                    "unable to start replication with logical replication slot {}".format(slot)
                )

            rows_saved = 0
            while True:
                poll_duration = (datetime.datetime.now() - begin_ts).total_seconds()
                if poll_duration > poll_total_seconds:
                    LOGGER.info(
                        "breaking after %s seconds of polling with no data", poll_duration
                    )
                    break

                msg = cur.read_message()
                if msg:
                    begin_ts = datetime.datetime.now()
                    if msg.data_start > end_lsn:
                        LOGGER.info("gone past end_lsn %s for run. breaking", end_lsn)
                        break

                    state = consume_message(
                        logical_streams, state, msg, time_extracted, conn_info, end_lsn
                    )
                    # msg has been consumed. it has been processed
                    last_lsn_processed = msg.data_start
                    rows_saved = rows_saved + 1
                    if rows_saved % conn_info["emit_state_every_n_rows"] == 0:
                        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
                else:
                    now = datetime.datetime.now()
                    timeout = keep_alive_time - (now - cur.io_timestamp).total_seconds()
                    try:
                        sel = select([cur], [], [], max(0, timeout))
                        if not any(sel):
                            LOGGER.info(
                                "no data for %s seconds. sending feedback to server with NO "
                                "flush_lsn. just a keep-alive",
                                timeout,
                            )
                            cur.send_feedback()
                    except InterruptedError:
                        pass  # recalculate timeout and continue

    if last_lsn_processed:
        for s in logical_streams:
            LOGGER.info(
                "updating bookmark for stream %s to last_lsn_processed %s",
                s["tap_stream_id"],
                last_lsn_processed,
            )
            state = singer.write_bookmark(state, s["tap_stream_id"], "lsn", last_lsn_processed)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state

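# Illustrative config knobs read by sync_tables() (values are placeholders): the loop
# polls for at most logical_poll_total_seconds without data (falling back to 30 minutes
# when the value is falsy) and emits a STATE message every emit_state_every_n_rows rows.
#
#   conn_info["logical_poll_total_seconds"] = 600   # give up after 10 minutes of silence
#   conn_info["emit_state_every_n_rows"] = 10000
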
def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    stream_version = singer.get_bookmark(state, stream["tap_stream_id"], "version")
    if stream_version is None:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream["tap_stream_id"], "version", stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get("schema-name")

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        version=stream_version,
    )

    singer.write_message(activate_version_message)

    replication_key = md_map.get((), {}).get("replication-key")
    replication_key_value = singer.get_bookmark(
        state, stream["tap_stream_id"], "replication_key_value"
    )
    replication_key_sql_datatype = md_map.get(("properties", replication_key)).get("sql-datatype")

    hstore_available = post_db.hstore_available(conn_info)
    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:
            # Client side character encoding defaults to client_encoding in postgresql.conf.
            # The server / db can also have its own configured encoding.
            with conn.cursor() as cur:
                cur.execute("show server_encoding")
                LOGGER.info("Current Server Encoding: %s", cur.fetchone()[0])
                cur.execute("show client_encoding")
                LOGGER.info("Current Client Encoding: %s", cur.fetchone()[0])

            if hstore_available:
                LOGGER.info("hstore is available")
                psycopg2.extras.register_hstore(conn)
            else:
                LOGGER.info("hstore is UNavailable")

            with conn.cursor(
                cursor_factory=psycopg2.extras.DictCursor, name="stitch_cursor"
            ) as cur:
                cur.itersize = post_db.cursor_iter_size
                LOGGER.info("Beginning new incremental replication sync %s", stream_version)
                if replication_key_value:
                    select_sql = """SELECT {}
                                    FROM {}
                                    WHERE {} > '{}'::{}
                                    ORDER BY {} ASC""".format(
                        ",".join(escaped_columns),
                        post_db.fully_qualified_table_name(schema_name, stream["table_name"]),
                        post_db.prepare_columns_sql(replication_key),
                        replication_key_value,
                        replication_key_sql_datatype,
                        post_db.prepare_columns_sql(replication_key),
                    )
                else:
                    # if not replication_key_value
                    select_sql = """SELECT {}
                                    FROM {}
                                    ORDER BY {} ASC""".format(
                        ",".join(escaped_columns),
                        post_db.fully_qualified_table_name(schema_name, stream["table_name"]),
                        post_db.prepare_columns_sql(replication_key),
                    )

                LOGGER.info("select statement: %s with itersize %s", select_sql, cur.itersize)
                cur.execute(select_sql)
                LOGGER.info("Query returned - processing results")

                rows_saved = 0
                for rec in cur:
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, stream_version, desired_columns, time_extracted, md_map
                    )
                    singer.write_message(record_message)
                    rows_saved = rows_saved + 1

                    # Picking a replication_key with NULL values will result in it ALWAYS being
                    # synced which is not great. Even worse would be allowing the NULL value to
                    # enter into the state.
                    if record_message.record[replication_key] is not None:
                        state = singer.write_bookmark(
                            state,
                            stream["tap_stream_id"],
                            "replication_key_value",
                            record_message.record[replication_key],
                        )

                    if rows_saved % conn_info["emit_state_every_n_rows"] == 0:
                        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    LOGGER.info("Incremental table tap complete")
    return state

def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream["tap_stream_id"], "version") is None

    # pick a new table version IFF we do not have an xmin in our state
    # the presence of an xmin indicates that we were interrupted last time through
    if singer.get_bookmark(state, stream["tap_stream_id"], "xmin") is None:
        nascent_stream_version = int(time.time() * 1000)
    else:
        nascent_stream_version = singer.get_bookmark(state, stream["tap_stream_id"], "version")

    state = singer.write_bookmark(state, stream["tap_stream_id"], "version", nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get("schema-name")

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        version=nascent_stream_version,
    )

    if first_run:
        singer.write_message(activate_version_message)

    hstore_available = post_db.hstore_available(conn_info)
    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:
            # Client side character encoding defaults to client_encoding in postgresql.conf.
            # The server / db can also have its own configured encoding.
            with conn.cursor() as cur:
                cur.execute("show server_encoding")
                LOGGER.info("Current Server Encoding: %s", cur.fetchone()[0])
                cur.execute("show client_encoding")
                LOGGER.info("Current Client Encoding: %s", cur.fetchone()[0])

            if hstore_available:
                LOGGER.info("hstore is available")
                psycopg2.extras.register_hstore(conn)
            else:
                LOGGER.info("hstore is UNavailable")

            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor, name="stitch_cursor") as cur:
                cur.itersize = post_db.cursor_iter_size

                xmin = singer.get_bookmark(state, stream["tap_stream_id"], "xmin")
                if xmin:
                    LOGGER.info(
                        "Resuming Full Table replication %s from xmin %s",
                        nascent_stream_version,
                        xmin,
                    )
                    select_sql = """SELECT {}, xmin::text::bigint
                                    FROM {}
                                    WHERE age(xmin::xid) <= age('{}'::xid)
                                    ORDER BY xmin::text::bigint ASC""".format(
                        ",".join(escaped_columns),
                        post_db.fully_qualified_table_name(schema_name, stream["table_name"]),
                        xmin,
                    )
                else:
                    LOGGER.info("Beginning new Full Table replication %s", nascent_stream_version)
                    select_sql = """SELECT {}, xmin::text::bigint
                                    FROM {}
                                    ORDER BY xmin::text::bigint ASC""".format(
                        ",".join(escaped_columns),
                        post_db.fully_qualified_table_name(schema_name, stream["table_name"]),
                    )

                LOGGER.info("select %s with itersize %s", select_sql, cur.itersize)
                cur.execute(select_sql)
                LOGGER.info("Query returned - processing results")

                rows_saved = 0
                for rec in cur:
                    xmin = rec["xmin"]
                    rec = rec[:-1]
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, nascent_stream_version, desired_columns, time_extracted, md_map
                    )
                    singer.write_message(record_message)
                    state = singer.write_bookmark(state, stream["tap_stream_id"], "xmin", xmin)
                    rows_saved = rows_saved + 1
                    if rows_saved % conn_info["emit_state_every_n_rows"] == 0:
                        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    LOGGER.info("Processing complete - saved {} rows".format(rows_saved))

    # once we have completed the full table replication, discard the xmin bookmark.
    # the xmin bookmark only comes into play when a full table replication is interrupted
    LOGGER.info("Writing bookmark")
    state = singer.write_bookmark(state, stream["tap_stream_id"], "xmin", None)

    # always send the activate version whether first run or subsequent
    LOGGER.info("Sending activate version message")
    singer.write_message(activate_version_message)

    LOGGER.info("Full table tap complete")
    return state
