Example #1
def open_connection(conn_config, logical_replication=False):
    cfg = {
        "host": conn_config["host"],
        "dbname": conn_config["dbname"],
        "user": conn_config["user"],
        "password": conn_config["password"],
        "port": conn_config["port"],
        "connect_timeout": 30,
    }

    if conn_config.get("sslmode"):
        cfg["sslmode"] = conn_config["sslmode"]
        if cfg["sslmode"] == "verify-ca":
            cfg["sslrootcert"] = conn_config.get("sslrootcert")
            cfg["sslcert"] = conn_config.get("sslcert")
            cfg["sslkey"] = conn_config.get("sslkey")

    if logical_replication:
        cfg["connection_factory"] = psycopg2.extras.LogicalReplicationConnection

    LOGGER.info("Trying to connect with config = %s", str(cfg))
    conn = psycopg2.connect(**cfg)
    LOGGER.info("Connected with sslmode = %s",
                conn.get_dsn_parameters().get("sslmode", "unknown"))

    return conn
Example #2
def locate_replication_slot(conn_info):
    with post_db.open_connection(conn_info, False) as conn:
        with conn.cursor() as cur:
            db_specific_slot = "stitch_{}".format(conn_info["dbname"])
            cur.execute(
                "SELECT * FROM pg_replication_slots WHERE slot_name = %s AND plugin = %s",
                (db_specific_slot, "wal2json"),
            )
            if len(cur.fetchall()) == 1:
                LOGGER.info("using pg_replication_slot %s", db_specific_slot)
                return db_specific_slot

            cur.execute(
                """SELECT * FROM pg_replication_slots
                WHERE slot_name = 'stitch'
                AND plugin = 'wal2json'"""
            )
            if len(cur.fetchall()) == 1:
                LOGGER.info("using pg_replication_slot 'stitch'")
                return "stitch"

            raise Exception(
                "Unable to find replication slot (stitch || {} with wal2json".format(
                    db_specific_slot
                )
            )
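locate_replication_slot only finds an existing slot; it never creates one. A minimal sketch of creating the db-specific wal2json slot it looks for is shown below (the helper name is hypothetical; pg_create_logical_replication_slot is the standard PostgreSQL function):

def create_stitch_replication_slot(conn_info):
    # Hypothetical helper: create the wal2json slot that locate_replication_slot expects.
    slot_name = "stitch_{}".format(conn_info["dbname"])
    with post_db.open_connection(conn_info, False) as conn:
        with conn.cursor() as cur:
            cur.execute(
                "SELECT * FROM pg_create_logical_replication_slot(%s, 'wal2json')",
                (slot_name,),
            )
            return slot_name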
Example #3
def do_sync_incremental(conn_config, stream, state, desired_columns, md_map):
    replication_key = md_map.get((), {}).get("replication-key")
    LOGGER.info(
        "Stream %s is using incremental replication with replication key %s",
        stream["tap_stream_id"],
        replication_key,
    )

    stream_state = state.get("bookmarks", {}).get(stream["tap_stream_id"], {})
    illegal_bk_keys = set(stream_state.keys()).difference(
        set([
            "replication_key", "replication_key_value", "version",
            "last_replication_method"
        ]))
    if len(illegal_bk_keys) != 0:
        raise Exception(
            "invalid keys found in state: {}".format(illegal_bk_keys))

    state = singer.write_bookmark(state, stream["tap_stream_id"],
                                  "replication_key", replication_key)

    sync_common.send_schema_message(stream, [replication_key])
    state = incremental.sync_table(conn_config, stream, state, desired_columns,
                                   md_map)

    return state
Example #4
def row_to_singer_message(
    stream, row, version, columns, time_extracted, md_map, conn_info
):
    row_to_persist = ()
    md_map[("properties", "_sdc_deleted_at")] = {
        "sql-datatype": "timestamp with time zone"
    }
    md_map[("properties", "_sdc_lsn")] = {"sql-datatype": "character varying"}

    for idx, elem in enumerate(row):
        sql_datatype = md_map.get(("properties", columns[idx]), {}).get("sql-datatype")

        if not sql_datatype:
            LOGGER.info("No sql-datatype found for stream %s: %s", stream, columns[idx])
            raise Exception("Unable to find sql-datatype for stream {}".format(stream))

        cleaned_elem = selected_value_to_singer_value(elem, sql_datatype, conn_info)
        row_to_persist += (cleaned_elem,)

    rec = dict(zip(columns, row_to_persist))

    return singer.RecordMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        record=rec,
        version=version,
        time_extracted=time_extracted,
    )
Example #5
def numeric_scale(c):
    if c.numeric_scale is None:
        return MAX_SCALE
    if c.numeric_scale > MAX_SCALE:
        LOGGER.warning("capping decimal scale to 38.  THIS MAY CAUSE TRUNCATION")
        return MAX_SCALE

    return c.numeric_scale
Example #6
def numeric_precision(c):
    if c.numeric_precision is None:
        return MAX_PRECISION

    if c.numeric_precision > MAX_PRECISION:
        LOGGER.warning("capping decimal precision to 100.  THIS MAY CAUSE TRUNCATION")
        return MAX_PRECISION

    return c.numeric_precision
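One plausible use of the two caps above is deriving a JSON schema for a NUMERIC column; the schema shape below is a sketch under that assumption, not this module's actual output:

def numeric_column_schema(c):
    # Sketch only: turn the capped precision/scale into JSON-schema bounds.
    scale = numeric_scale(c)
    precision = numeric_precision(c)
    return {
        "type": ["null", "number"],
        "multipleOf": 10 ** (0 - scale),
        "minimum": -(10 ** (precision - scale)),
        "maximum": 10 ** (precision - scale),
        "exclusiveMinimum": True,
        "exclusiveMaximum": True,
    }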
Example #7
def sync_view(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream["tap_stream_id"],
                                    "version") is None
    nascent_stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream["tap_stream_id"], "version",
                                  nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get("schema-name")

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        version=nascent_stream_version,
    )

    if first_run:
        singer.write_message(activate_version_message)

    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor,
                             name="stitch_cursor") as cur:
                cur.itersize = post_db.cursor_iter_size
                select_sql = "SELECT {} FROM {}".format(
                    ",".join(escaped_columns),
                    post_db.fully_qualified_table_name(schema_name,
                                                       stream["table_name"]),
                )

                LOGGER.info("select %s with itersize %s", select_sql,
                            cur.itersize)
                cur.execute(select_sql)

                rows_saved = 0
                for rec in cur:
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, nascent_stream_version, desired_columns,
                        time_extracted, md_map)
                    singer.write_message(record_message)
                    rows_saved = rows_saved + 1
                    if rows_saved % conn_info["emit_state_every_n_rows"] == 0:
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    # always send the activate version whether first run or subsequent
    singer.write_message(activate_version_message)

    return state
Example #8
def get_pg_version(cur):
    cur.execute("SELECT version()")
    res = cur.fetchone()[0]
    version_match = re.match(r"PostgreSQL (\d+)", res)
    if not version_match:
        raise Exception("unable to determine PostgreSQL version from {}".format(res))

    version = int(version_match.group(1))
    LOGGER.info("Detected PostgresSQL version: %s", version)
    return version
Example #9
def do_sync_full_table(conn_config, stream, state, desired_columns, md_map):
    LOGGER.info("Stream %s is using full_table replication",
                stream["tap_stream_id"])
    sync_common.send_schema_message(stream, [])
    if md_map.get((), {}).get("is-view"):
        state = full_table.sync_view(conn_config, stream, state,
                                     desired_columns, md_map)
    else:
        state = full_table.sync_table(conn_config, stream, state,
                                      desired_columns, md_map)
    return state
Example #10
def add_automatic_properties(stream, conn_config):
    stream["schema"]["properties"]["_sdc_deleted_at"] = {
        "type": ["null", "string"],
        "format": "date-time",
    }
    if conn_config.get("debug_lsn"):
        LOGGER.info("debug_lsn is ON")
        stream["schema"]["properties"]["_sdc_lsn"] = {"type": ["null", "string"]}
    else:
        LOGGER.info("debug_lsn is OFF")

    return stream
Example #11
def fetch_max_replication_key(conn_config, replication_key, schema_name, table_name):
    with post_db.open_connection(conn_config, False) as conn:
        with conn.cursor() as cur:
            max_key_sql = """SELECT max({}) FROM {}""".format(
                post_db.prepare_columns_sql(replication_key),
                post_db.fully_qualified_table_name(schema_name, table_name),
            )
            LOGGER.info("determine max replication key value: %s", max_key_sql)
            cur.execute(max_key_sql)
            max_key = cur.fetchone()[0]
            LOGGER.info("max replication key value: %s", max_key)
            return max_key
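For a table public.orders with replication key updated_at (illustrative names), the statement built above comes out roughly as:

# Approximate shape of the generated SQL; exact quoting depends on
# prepare_columns_sql and fully_qualified_table_name:
#   SELECT max("updated_at") FROM "public"."orders"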
Example #12
def main_impl():
    args = utils.parse_args(REQUIRED_CONFIG_KEYS)
    conn_config = {
        "host": args.config["host"],
        "user": args.config["user"],
        "password": args.config["password"],
        "port": args.config["port"],
        "dbname": args.config["dbname"],
        "filter_dbs": args.config.get("filter_dbs"),
        "emit_state_every_n_rows": int(args.config.get("emit_state_every_n_rows", 1000)),
        "debug_lsn": args.config.get("debug_lsn") == "true",
        "logical_poll_total_seconds": float(args.config.get("logical_poll_total_seconds", 0)),
        "sslmode": args.config.get("sslmode"),
        "sslrootcert": args.config.get("sslrootcert"),
        "sslcert": args.config.get("sslcert"),
        "sslkey": args.config.get("sslkey"),
    }

    LOGGER.setLevel(
        getattr(logging,
                args.config.get("loglevel", "INFO").upper(), logging.INFO))

    if args.config.get("ssl") == "true":
        conn_config["sslmode"] = "require"

    post_db.cursor_iter_size = int(args.config.get("itersize", "20000"))

    post_db.include_schemas_in_destination_stream_name = (args.config.get(
        "include_schemas_in_destination_stream_name") == "true")

    if args.discover:
        do_discovery(conn_config)
    elif args.properties:
        state = args.state
        do_sync(conn_config, args.properties,
                args.config.get("default_replication_method"), state)
    else:
        LOGGER.error("No properties were selected")
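For reference, a config satisfying the keys read by main_impl above might look like the following; every value here is illustrative, not a default of this tap:

EXAMPLE_CONFIG = {
    "host": "localhost",
    "port": "5432",
    "dbname": "mydb",
    "user": "replication_user",
    "password": "secret",
    "default_replication_method": "LOG_BASED",
    "debug_lsn": "false",
    "logical_poll_total_seconds": "300",
    "emit_state_every_n_rows": "5000",
    "itersize": "20000",
    "ssl": "true",
    "include_schemas_in_destination_stream_name": "true",
    "loglevel": "INFO",
}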
Example #13
def sync_logical_streams(conn_config, logical_streams, state, end_lsn):
    if logical_streams:
        LOGGER.info(
            "Pure Logical Replication upto lsn %s for (%s)",
            end_lsn,
            list(map(lambda s: s["tap_stream_id"], logical_streams)),
        )
        logical_streams = list(
            map(
                lambda s: logical_replication.add_automatic_properties(
                    s, conn_config),
                logical_streams,
            ))

        state = logical_replication.sync_tables(conn_config, logical_streams,
                                                state, end_lsn)

    return state
Example #14
def sync_traditional_stream(conn_config, stream, state, sync_method, end_lsn):
    LOGGER.info("Beginning sync of stream(%s) with sync method(%s)",
                stream["tap_stream_id"], sync_method)
    md_map = metadata.to_map(stream["metadata"])
    conn_config["dbname"] = md_map.get(()).get("database-name")
    desired_columns = [
        c for c in stream["schema"]["properties"].keys()
        if sync_common.should_sync_column(md_map, c)
    ]
    desired_columns.sort()

    if len(desired_columns) == 0:
        LOGGER.warning(
            "There are no columns selected for stream %s, skipping it",
            stream["tap_stream_id"])
        return state

    register_type_adapters(conn_config)

    if sync_method == "full":
        state = singer.set_currently_syncing(state, stream["tap_stream_id"])
        state = do_sync_full_table(conn_config, stream, state, desired_columns,
                                   md_map)
    elif sync_method == "incremental":
        state = singer.set_currently_syncing(state, stream["tap_stream_id"])
        state = do_sync_incremental(conn_config, stream, state,
                                    desired_columns, md_map)
    elif sync_method == "logical_initial":
        state = singer.set_currently_syncing(state, stream["tap_stream_id"])
        LOGGER.info("Performing initial full table sync")
        state = singer.write_bookmark(state, stream["tap_stream_id"], "lsn",
                                      end_lsn)

        sync_common.send_schema_message(stream, [])
        state = full_table.sync_table(conn_config, stream, state,
                                      desired_columns, md_map)
        state = singer.write_bookmark(state, stream["tap_stream_id"], "xmin",
                                      None)
    elif sync_method == "logical_initial_interrupted":
        state = singer.set_currently_syncing(state, stream["tap_stream_id"])
        LOGGER.info(
            "Initial stage of full table sync was interrupted. resuming...")
        sync_common.send_schema_message(stream, [])
        state = full_table.sync_table(conn_config, stream, state,
                                      desired_columns, md_map)
    else:
        raise Exception("unknown sync method {} for stream {}".format(
            sync_method, stream["tap_stream_id"]))

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state
Example #15
def do_discovery(conn_config):
    all_streams = []

    with post_db.open_connection(conn_config) as conn:
        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor,
                         name="stitch_cursor") as cur:
            cur.itersize = post_db.cursor_iter_size
            sql = """SELECT datname
            FROM pg_database
            WHERE datistemplate = false
              AND datname != 'rdsadmin'"""

            if conn_config.get("filter_dbs"):
                sql = post_db.filter_dbs_sql_clause(sql,
                                                    conn_config["filter_dbs"])

            LOGGER.info("Running DB discovery: %s with itersize %s", sql,
                        cur.itersize)
            cur.execute(sql)
            found_dbs = (row[0] for row in cur.fetchall())

    filter_dbs = filter(
        lambda dbname: attempt_connection_to_db(conn_config, dbname),
        found_dbs)

    for db_row in filter_dbs:
        dbname = db_row
        LOGGER.info("Discovering db %s", dbname)
        conn_config["dbname"] = dbname
        with post_db.open_connection(conn_config) as conn:
            db_streams = discover_db(conn)
            all_streams = all_streams + db_streams

    if len(all_streams) == 0:
        raise RuntimeError(
            "0 tables were discovered across the entire cluster")

    dump_catalog(all_streams)
    return all_streams
Example #16
def open_connection(conn_config, logical_replication=False):
    cfg = {
        "host": conn_config["host"],
        "dbname": conn_config["dbname"],
        "user": conn_config["user"],
        "password": conn_config["password"],
        "port": conn_config["port"],
        "connect_timeout": 30,
    }

    if conn_config.get("sslmode"):
        cfg["sslmode"] = conn_config["sslmode"]

    if logical_replication:
        cfg["connection_factory"] = psycopg2.extras.LogicalReplicationConnection

    conn = psycopg2.connect(**cfg)
    LOGGER.info(
        "Connected with sslmode = %s",
        conn.get_dsn_parameters().get("sslmode", "unknown"),
    )

    return conn
Example #17
def attempt_connection_to_db(conn_config, dbname):
    nascent_config = copy.deepcopy(conn_config)
    nascent_config["dbname"] = dbname
    LOGGER.info("(%s) Testing connectivity...", dbname)
    try:
        conn = post_db.open_connection(nascent_config)
        LOGGER.info("(%s) connectivity verified", dbname)
        conn.close()
        return True
    except Exception as err:
        LOGGER.warning(
            'Unable to connect to %s. This is OK if you do not replicate from this database: "%s"',
            dbname,
            err,
        )
        return False
Example #18
def do_sync(conn_config, catalog, default_replication_method, state):
    currently_syncing = singer.get_currently_syncing(state)
    streams = list(filter(is_selected_via_metadata, catalog["streams"]))
    streams.sort(key=lambda s: s["tap_stream_id"])
    LOGGER.info("Selected streams: %s ",
                list(map(lambda s: s["tap_stream_id"], streams)))
    if any_logical_streams(streams, default_replication_method):
        LOGGER.info("Use of logical replication requires fetching an lsn...")
        end_lsn = logical_replication.fetch_current_lsn(conn_config)
        LOGGER.info("End LSN: %s ", end_lsn)
    else:
        end_lsn = None

    sync_method_lookup, traditional_streams, logical_streams = sync_method_for_streams(
        streams, state, default_replication_method)

    if currently_syncing:
        LOGGER.info("found currently_syncing: %s", currently_syncing)
        currently_syncing_stream = list(
            filter(lambda s: s["tap_stream_id"] == currently_syncing,
                   traditional_streams))
        if not currently_syncing_stream:
            LOGGER.warning(
                "Can't find currently_syncing(%s) in selected traditional streams(%s). Will ignore",
                currently_syncing,
                list(map(lambda s: s["tap_stream_id"], traditional_streams)),
            )
        other_streams = list(
            filter(lambda s: s["tap_stream_id"] != currently_syncing,
                   traditional_streams))
        traditional_streams = currently_syncing_stream + other_streams
    else:
        LOGGER.info("No currently_syncing found")

    for stream in traditional_streams:
        state = sync_traditional_stream(
            conn_config, stream, state,
            sync_method_lookup[stream["tap_stream_id"]], end_lsn)

    logical_streams.sort(key=lambda s: metadata.to_map(s["metadata"]).get(
        ()).get("database-name"))
    for dbname, streams in itertools.groupby(
            logical_streams, lambda s: metadata.to_map(s["metadata"]).get(
                ()).get("database-name")):
        conn_config["dbname"] = dbname
        state = sync_logical_streams(conn_config, list(streams), state,
                                     end_lsn)
    return state
Example #19
def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    stream_version = singer.get_bookmark(state, stream["tap_stream_id"], "version")
    if stream_version is None:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream["tap_stream_id"], "version", stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get("schema-name")

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map), version=stream_version
    )

    singer.write_message(activate_version_message)

    replication_key = md_map.get((), {}).get("replication-key")
    replication_key_value = singer.get_bookmark(
        state, stream["tap_stream_id"], "replication_key_value"
    )
    replication_key_sql_datatype = md_map.get(("properties", replication_key)).get("sql-datatype")

    hstore_available = post_db.hstore_available(conn_info)
    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:

            # Client side character encoding defaults to client_encoding in postgresql.conf.
            # The server / db can also have its own configured encoding.
            with conn.cursor() as cur:
                cur.execute("show server_encoding")
                LOGGER.info("Current Server Encoding: %s", cur.fetchone()[0])
                cur.execute("show client_encoding")
                LOGGER.info("Current Client Encoding: %s", cur.fetchone()[0])

            if hstore_available:
                LOGGER.info("hstore is available")
                psycopg2.extras.register_hstore(conn)
            else:
                LOGGER.info("hstore is UNavailable")

            with conn.cursor(
                cursor_factory=psycopg2.extras.DictCursor, name="stitch_cursor"
            ) as cur:
                cur.itersize = post_db.cursor_iter_size
                LOGGER.info("Beginning new incremental replication sync %s", stream_version)
                if replication_key_value:
                    select_sql = """SELECT {}
                                    FROM {}
                                    WHERE {} > '{}'::{}
                                    ORDER BY {} ASC""".format(
                        ",".join(escaped_columns),
                        post_db.fully_qualified_table_name(schema_name, stream["table_name"]),
                        post_db.prepare_columns_sql(replication_key),
                        replication_key_value,
                        replication_key_sql_datatype,
                        post_db.prepare_columns_sql(replication_key),
                    )
                else:
                    # if not replication_key_value
                    select_sql = """SELECT {}
                                    FROM {}
                                    ORDER BY {} ASC""".format(
                        ",".join(escaped_columns),
                        post_db.fully_qualified_table_name(schema_name, stream["table_name"]),
                        post_db.prepare_columns_sql(replication_key),
                    )

                LOGGER.info("select statement: %s with itersize %s", select_sql, cur.itersize)
                cur.execute(select_sql)
                LOGGER.info("Query returned - processing results")

                rows_saved = 0

                for rec in cur:
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, stream_version, desired_columns, time_extracted, md_map
                    )
                    singer.write_message(record_message)
                    rows_saved = rows_saved + 1

                    # Picking a replication_key with NULL values will result in it ALWAYS being
                    # synced which is not great. Even worse would be allowing the NULL value to
                    # enter into the state.
                    if record_message.record[replication_key] is not None:
                        state = singer.write_bookmark(
                            state,
                            stream["tap_stream_id"],
                            "replication_key_value",
                            record_message.record[replication_key],
                        )

                    if rows_saved % conn_info["emit_state_every_n_rows"] == 0:
                        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    LOGGER.info("Incremental table tap complete")
    return state
Example #20
def sync_method_for_streams(streams, state, default_replication_method):
    lookup = {}
    traditional_streams = []
    logical_streams = []

    for stream in streams:
        stream_metadata = metadata.to_map(stream["metadata"])
        replication_method = stream_metadata.get(
            (), {}).get("replication-method", default_replication_method)
        replication_key = stream_metadata.get((), {}).get("replication-key")

        state = clear_state_on_replication_change(state,
                                                  stream["tap_stream_id"],
                                                  replication_key,
                                                  replication_method)

        if replication_method not in set(
            ["LOG_BASED", "FULL_TABLE", "INCREMENTAL"]):
            raise Exception(
                "Unrecognized replication_method {} for stream {}".format(
                    replication_method, stream["tap_stream_id"]))

        md_map = metadata.to_map(stream["metadata"])
        desired_columns = [
            c for c in stream["schema"]["properties"].keys()
            if sync_common.should_sync_column(md_map, c)
        ]
        desired_columns.sort()

        if len(desired_columns) == 0:
            LOGGER.warning(
                "There are no columns selected for stream %s, skipping it",
                stream["tap_stream_id"])
            continue

        if replication_method == "LOG_BASED" and stream_metadata.get(
            (), {}).get("is-view"):
            raise Exception(
                "Logical Replication is NOT supported for views."
                "Please change the replication method for {}".format(
                    stream["tap_stream_id"]))

        if replication_method == "FULL_TABLE":
            lookup[stream["tap_stream_id"]] = "full"
            traditional_streams.append(stream)
        elif replication_method == "INCREMENTAL":
            lookup[stream["tap_stream_id"]] = "incremental"
            traditional_streams.append(stream)

        elif get_bookmark(state,
                          stream["tap_stream_id"], "xmin") and get_bookmark(
                              state, stream["tap_stream_id"], "lsn"):
            # finishing previously interrupted full-table (first stage of logical replication)
            lookup[stream["tap_stream_id"]] = "logical_initial_interrupted"
            traditional_streams.append(stream)

        # inconsistent state
        elif get_bookmark(
                state, stream["tap_stream_id"], "xmin") and not get_bookmark(
                    state, stream["tap_stream_id"], "lsn"):
            raise Exception(
                "Xmin found in state for stream {} implying full-table replication "
                "but no lsn is present".format(stream["tap_stream_id"])
            )

        elif not get_bookmark(state, stream["tap_stream_id"],
                              "xmin") and not get_bookmark(
                                  state, stream["tap_stream_id"], "lsn"):
            # initial full-table phase of logical replication
            lookup[stream["tap_stream_id"]] = "logical_initial"
            traditional_streams.append(stream)

        else:  # no xmin but we have an lsn
            # initial stage of logical replication(full-table) has been completed.
            # Moving onto pure logical replication
            lookup[stream["tap_stream_id"]] = "pure_logical"
            logical_streams.append(stream)

    return lookup, traditional_streams, logical_streams
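For LOG_BASED streams the xmin/lsn branching above reduces to the following decision table (a restatement of the logic above, nothing new):

# xmin bookmark | lsn bookmark | chosen sync method
# --------------+--------------+------------------------------------------------------
# present       | present      | logical_initial_interrupted (resume full-table stage)
# present       | absent       | raise (inconsistent state)
# absent        | absent       | logical_initial (start full-table stage)
# absent        | present      | pure_logical (stream WAL changes only)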
Example #21
def sync_tables(conn_info, logical_streams, state, end_lsn):
    start_lsn = min(
        [get_bookmark(state, s["tap_stream_id"], "lsn") for s in logical_streams]
    )
    time_extracted = utils.now()
    slot = locate_replication_slot(conn_info)
    last_lsn_processed = None
    poll_total_seconds = (
        conn_info["logical_poll_total_seconds"] or 60 * 30
    )  # we are willing to poll for a total of 30 minutes without finding a record
    keep_alive_time = 10.0
    begin_ts = datetime.datetime.now()

    for s in logical_streams:
        sync_common.send_schema_message(s, ["lsn"])

    with post_db.open_connection(conn_info, True) as conn:
        with conn.cursor() as cur:
            LOGGER.info(
                "Starting Logical Replication for %s(%s): %s -> %s. poll_total_seconds: %s",
                list(map(lambda s: s["tap_stream_id"], logical_streams)),
                slot,
                start_lsn,
                end_lsn,
                poll_total_seconds,
            )
            try:
                cur.start_replication(slot_name=slot, decode=True, start_lsn=start_lsn)
            except psycopg2.ProgrammingError:
                raise Exception(
                    "unable to start replication with logical replication slot {}".format(
                        slot
                    )
                )

            rows_saved = 0
            while True:
                poll_duration = (datetime.datetime.now() - begin_ts).total_seconds()
                if poll_duration > poll_total_seconds:
                    LOGGER.info(
                        "breaking after %s seconds of polling with no data",
                        poll_duration,
                    )
                    break

                msg = cur.read_message()
                if msg:
                    begin_ts = datetime.datetime.now()
                    if msg.data_start > end_lsn:
                        LOGGER.info("gone past end_lsn %s for run. breaking", end_lsn)
                        break

                    state = consume_message(
                        logical_streams, state, msg, time_extracted, conn_info, end_lsn
                    )
                    # msg has been consumed. it has been processed
                    last_lsn_processed = msg.data_start
                    rows_saved = rows_saved + 1
                    if rows_saved % conn_info["emit_state_every_n_rows"] == 0:
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state))
                        )

                else:
                    now = datetime.datetime.now()
                    timeout = keep_alive_time - (now - cur.io_timestamp).total_seconds()
                    try:
                        sel = select([cur], [], [], max(0, timeout))
                        if not any(sel):
                            LOGGER.info(
                                (
                                    "no data for {} seconds. sending feedback to server with NO "
                                    "flush_lsn. just a keep-alive"
                                ).format(timeout)
                            )
                            cur.send_feedback()

                    except InterruptedError:
                        pass  # recalculate timeout and continue

    if last_lsn_processed:
        for s in logical_streams:
            LOGGER.info(
                "updating bookmark for stream %s to last_lsn_processed %s",
                s["tap_stream_id"],
                last_lsn_processed,
            )
            state = singer.write_bookmark(
                state, s["tap_stream_id"], "lsn", last_lsn_processed
            )

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state
Example #22
def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream["tap_stream_id"],
                                    "version") is None

    # pick a new table version IFF we do not have an xmin in our state
    # the presence of an xmin indicates that we were interrupted last time through
    if singer.get_bookmark(state, stream["tap_stream_id"], "xmin") is None:
        nascent_stream_version = int(time.time() * 1000)
    else:
        nascent_stream_version = singer.get_bookmark(state,
                                                     stream["tap_stream_id"],
                                                     "version")

    state = singer.write_bookmark(state, stream["tap_stream_id"], "version",
                                  nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get("schema-name")

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        version=nascent_stream_version,
    )

    if first_run:
        singer.write_message(activate_version_message)

    hstore_available = post_db.hstore_available(conn_info)
    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:

            # Client side character encoding defaults to client_encoding in postgresql.conf.
            # The server / db can also have its own configured encoding.
            with conn.cursor() as cur:
                cur.execute("show server_encoding")
                LOGGER.info("Current Server Encoding: %s", cur.fetchone()[0])
                cur.execute("show client_encoding")
                LOGGER.info("Current Client Encoding: %s", cur.fetchone()[0])

            if hstore_available:
                LOGGER.info("hstore is available")
                psycopg2.extras.register_hstore(conn)
            else:
                LOGGER.info("hstore is UNavailable")

            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor,
                             name="stitch_cursor") as cur:
                cur.itersize = post_db.cursor_iter_size

                xmin = singer.get_bookmark(state, stream["tap_stream_id"],
                                           "xmin")
                if xmin:
                    LOGGER.info(
                        "Resuming Full Table replication %s from xmin %s",
                        nascent_stream_version,
                        xmin,
                    )
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {} where age(xmin::xid) <= age('{}'::xid)
                                     ORDER BY xmin::text::bigint ASC""".format(
                        ",".join(escaped_columns),
                        post_db.fully_qualified_table_name(
                            schema_name, stream["table_name"]),
                        xmin,
                    )
                else:
                    LOGGER.info("Beginning new Full Table replication %s",
                                nascent_stream_version)
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {}
                                     ORDER BY xmin::text::bigint ASC""".format(
                        ",".join(escaped_columns),
                        post_db.fully_qualified_table_name(
                            schema_name, stream["table_name"]),
                    )

                LOGGER.info("select %s with itersize %s", select_sql,
                            cur.itersize)
                cur.execute(select_sql)
                LOGGER.info("Query returned - processing results")

                rows_saved = 0
                for rec in cur:
                    xmin = rec["xmin"]
                    rec = rec[:-1]
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, nascent_stream_version, desired_columns,
                        time_extracted, md_map)
                    singer.write_message(record_message)
                    state = singer.write_bookmark(state,
                                                  stream["tap_stream_id"],
                                                  "xmin", xmin)
                    rows_saved = rows_saved + 1
                    if rows_saved % conn_info["emit_state_every_n_rows"] == 0:
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

                LOGGER.info(
                    "Processing complete - saved {} rows".format(rows_saved))

    # once we have completed the full table replication, discard the xmin bookmark.
    # the xmin bookmark only comes into play when a full table replication is interrupted
    LOGGER.info("Writing bookmark")
    state = singer.write_bookmark(state, stream["tap_stream_id"], "xmin", None)

    # always send the activate version whether first run or subsequent
    LOGGER.info("Sending activate version message")
    singer.write_message(activate_version_message)

    LOGGER.info("Full table tap complete")
    return state
Example #23
def consume_message(streams, state, msg, time_extracted, conn_info, end_lsn):
    payload = json.loads(msg.payload)
    lsn = msg.data_start

    streams_lookup = {}
    for s in streams:
        streams_lookup[s["tap_stream_id"]] = s

    for c in payload["change"]:
        tap_stream_id = post_db.compute_tap_stream_id(
            conn_info["dbname"], c["schema"], c["table"]
        )
        if streams_lookup.get(tap_stream_id) is None:
            continue

        target_stream = streams_lookup[tap_stream_id]
        stream_version = get_stream_version(target_stream["tap_stream_id"], state)
        stream_md_map = metadata.to_map(target_stream["metadata"])

        desired_columns = [
            c
            for c in target_stream["schema"]["properties"].keys()
            if sync_common.should_sync_column(stream_md_map, c)
        ]

        if c["kind"] == "insert":
            col_names = []
            col_vals = []
            for idx, col in enumerate(c["columnnames"]):
                if col in set(desired_columns):
                    col_names.append(col)
                    col_vals.append(c["columnvalues"][idx])

            col_names = col_names + ["_sdc_deleted_at"]
            col_vals = col_vals + [None]
            if conn_info.get("debug_lsn"):
                col_names = col_names + ["_sdc_lsn"]
                col_vals = col_vals + [str(lsn)]
            record_message = row_to_singer_message(
                target_stream,
                col_vals,
                stream_version,
                col_names,
                time_extracted,
                stream_md_map,
                conn_info,
            )

        elif c["kind"] == "update":
            col_names = []
            col_vals = []
            for idx, col in enumerate(c["columnnames"]):
                if col in set(desired_columns):
                    col_names.append(col)
                    col_vals.append(c["columnvalues"][idx])

            col_names = col_names + ["_sdc_deleted_at"]
            col_vals = col_vals + [None]

            if conn_info.get("debug_lsn"):
                col_vals = col_vals + [str(lsn)]
                col_names = col_names + ["_sdc_lsn"]
            record_message = row_to_singer_message(
                target_stream,
                col_vals,
                stream_version,
                col_names,
                time_extracted,
                stream_md_map,
                conn_info,
            )

        elif c["kind"] == "delete":
            col_names = []
            col_vals = []
            for idx, col in enumerate(c["oldkeys"]["keynames"]):
                if col in set(desired_columns):
                    col_names.append(col)
                    col_vals.append(c["oldkeys"]["keyvalues"][idx])

            col_names = col_names + ["_sdc_deleted_at"]
            col_vals = col_vals + [singer.utils.strftime(time_extracted)]
            if conn_info.get("debug_lsn"):
                col_vals = col_vals + [str(lsn)]
                col_names = col_names + ["_sdc_lsn"]
            record_message = row_to_singer_message(
                target_stream,
                col_vals,
                stream_version,
                col_names,
                time_extracted,
                stream_md_map,
                conn_info,
            )

        else:
            raise Exception("unrecognized replication operation: {}".format(c["kind"]))

        singer.write_message(record_message)
        state = singer.write_bookmark(state, target_stream["tap_stream_id"], "lsn", lsn)
        LOGGER.debug("sending feedback to server with NO flush_lsn. just a keep-alive")
        msg.cursor.send_feedback()

    LOGGER.debug("sending feedback to server. flush_lsn = %s", msg.data_start)
    if msg.data_start > end_lsn:
        raise Exception(
            "incorrectly attempting to flush an lsn({}) > end_lsn({})".format(
                msg.data_start, end_lsn
            )
        )

    msg.cursor.send_feedback(flush_lsn=msg.data_start)

    return state
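consume_message parses wal2json output; an abbreviated payload of the shape it expects might look like this (table and values are illustrative):

EXAMPLE_WAL2JSON_PAYLOAD = {
    "change": [
        {
            "kind": "insert",
            "schema": "public",
            "table": "orders",
            "columnnames": ["id", "status"],
            "columntypes": ["integer", "text"],
            "columnvalues": [42, "shipped"],
        },
        {
            "kind": "delete",
            "schema": "public",
            "table": "orders",
            "oldkeys": {"keynames": ["id"], "keyvalues": [41]},
        },
    ]
}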
Example #24
def main():
    try:
        main_impl()
    except Exception as exc:
        LOGGER.critical(exc)
        raise exc