Example #1
def row_to_singer_message(stream, row, version, columns, time_extracted,
                          md_map, conn_info):
    row_to_persist = ()
    md_map[('properties', '_sdc_deleted_at')] = {
        'sql-datatype': 'timestamp with time zone'
    }
    md_map[('properties', '_sdc_lsn')] = {'sql-datatype': "character varying"}

    for idx, elem in enumerate(row):
        sql_datatype = md_map.get(
            ('properties', columns[idx]), {}).get('sql-datatype')

        if not sql_datatype:
            LOGGER.info("No sql-datatype found for stream %s: %s", stream,
                        columns[idx])
            raise Exception(
                "Unable to find sql-datatype for stream {}".format(stream))

        cleaned_elem = selected_value_to_singer_value(elem, sql_datatype,
                                                      conn_info)
        row_to_persist += (cleaned_elem, )

    rec = dict(zip(columns, row_to_persist))

    return singer.RecordMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        record=rec,
        version=version,
        time_extracted=time_extracted)
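
A toy illustration of the record shaping above (plain stand-in values, no tap context needed): each cleaned value is accumulated into a tuple, and dict(zip(...)) pairs it with the column names to form the RECORD payload.

columns = ['id', 'name']
row_to_persist = ()
for elem in (1, 'alice'):
    row_to_persist += (elem,)   # accumulate one cleaned value per column

rec = dict(zip(columns, row_to_persist))
assert rec == {'id': 1, 'name': 'alice'}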
Example #2
def send_schema_message(stream, bookmark_properties):
    s_md = metadata.to_map(stream['metadata'])
    if s_md.get((), {}).get('is-view'):
        key_properties = s_md.get((), {}).get('view-key-properties', [])
    else:
        key_properties = s_md.get((), {}).get('table-key-properties', [])

    sanitized_stream = stream['schema']
    sanitized_key_properties = [
        x.replace("_sdc_", "_orig_sdc_") for x in key_properties
    ]
    sanitized_bookmark_properties = [
        x.replace("_sdc_", "_orig_sdc_") for x in bookmark_properties
    ]

    # iterate over a snapshot of the keys: renaming entries while looping
    # over the live dict view raises RuntimeError in Python 3
    for x in list(sanitized_stream['properties']):
        if x.startswith(("_sdc_", "__sdc_")):
            sanitized_stream['properties'][x.replace(
                "_sdc_", "_orig_sdc_")] = sanitized_stream['properties'].pop(x)

    schema_message = {
        'type': 'SCHEMA',
        'stream': post_db.calculate_destination_stream_name(stream, s_md),
        'schema': sanitized_stream,
        'key_properties': sanitized_key_properties,
        'bookmark_properties': sanitized_bookmark_properties
    }

    write_schema_message(schema_message)
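
A self-contained sketch of the _sdc_ -> _orig_sdc_ renaming above, run against a toy properties dict (hypothetical property names). The list() snapshot matters: renaming keys while iterating the live dict raises a RuntimeError in Python 3.

properties = {
    'id': {'type': 'integer'},
    '_sdc_deleted_at': {'type': 'string'},
    '__sdc_row_id': {'type': 'integer'},
}

for key in list(properties):   # snapshot of the keys before mutating
    if key.startswith(('_sdc_', '__sdc_')):
        properties[key.replace('_sdc_', '_orig_sdc_')] = properties.pop(key)

assert sorted(properties) == ['__orig_sdc_row_id', '_orig_sdc_deleted_at', 'id']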
Example #3
def sync_view(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream["tap_stream_id"],
                                    "version") is None
    nascent_stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream["tap_stream_id"], "version",
                                  nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get("schema-name")

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        version=nascent_stream_version,
    )

    if first_run:
        singer.write_message(activate_version_message)

    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor,
                             name="stitch_cursor") as cur:
                cur.itersize = post_db.cursor_iter_size
                select_sql = "SELECT {} FROM {}".format(
                    ",".join(escaped_columns),
                    post_db.fully_qualified_table_name(schema_name,
                                                       stream["table_name"]),
                )

                LOGGER.info("select %s with itersize %s", select_sql,
                            cur.itersize)
                cur.execute(select_sql)

                rows_saved = 0
                for rec in cur:
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, nascent_stream_version, desired_columns,
                        time_extracted, md_map)
                    singer.write_message(record_message)
                    rows_saved = rows_saved + 1
                    if rows_saved % conn_info["emit_state_every_n_rows"] == 0:
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    # always send the activate version whether first run or subsequent
    singer.write_message(activate_version_message)

    return state
Example #4
def sync_view(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'],
                                    'version') is None
    nascent_stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get('schema-name')

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor,
                             name='stitch_cursor') as cur:
                cur.itersize = post_db.CURSOR_ITER_SIZE
                select_sql = f"SELECT {','.join(escaped_columns)} FROM " \
                             f"{post_db.fully_qualified_table_name(schema_name,stream['table_name'])}"

                LOGGER.info("select %s with itersize %s", select_sql,
                            cur.itersize)
                cur.execute(select_sql)

                rows_saved = 0
                for rec in cur:
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, nascent_stream_version, desired_columns,
                        time_extracted, md_map)
                    singer.write_message(record_message)
                    rows_saved += 1
                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    # always send the activate version whether first run or subsequent
    singer.write_message(activate_version_message)

    return state
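
One subtlety shared by Examples #3 and #4: escaped_columns = map(...) yields a one-shot iterator, so it can be joined into the SELECT exactly once. A small sketch with a stand-in quoting function (hypothetical; the real helper is post_db.prepare_columns_sql):

def prepare_columns_sql(name):
    return '"{}"'.format(name)   # stand-in for the tap's identifier quoting

escaped_columns = map(prepare_columns_sql, ['id', 'name'])
assert ','.join(escaped_columns) == '"id","name"'
assert ','.join(escaped_columns) == ''   # the iterator is now exhausted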
Example #5
def send_schema_message(stream, bookmark_properties):
    s_md = metadata.to_map(stream['metadata'])
    if s_md.get((), {}).get('is-view'):
        key_properties = s_md.get((), {}).get('view-key-properties', [])
    else:
        key_properties = s_md.get((), {}).get('table-key-properties', [])

    schema_message = {
        'type': 'SCHEMA',
        'stream': post_db.calculate_destination_stream_name(stream, s_md),
        'schema': stream['schema'],
        'key_properties': key_properties,
        'bookmark_properties': bookmark_properties
    }

    write_schema_message(schema_message)
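
The s_md lookups above rely on singer-python's metadata map, which keys each metadata entry by its breadcrumb tuple; stream-level metadata lives under the empty tuple (). A minimal sketch, assuming the singer-python package is installed:

from singer import metadata

raw_metadata = [{'breadcrumb': [], 'metadata': {'is-view': True,
                                                'view-key-properties': ['id']}}]
s_md = metadata.to_map(raw_metadata)

assert s_md.get((), {}).get('is-view') is True
assert s_md.get((), {}).get('view-key-properties', []) == ['id']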
Example #6
def row_to_singer_message(stream, row, version, columns, time_extracted, md_map, conn_info):
    row_to_persist = ()
    md_map[('properties', '_sdc_deleted_at')] = {'sql-datatype': 'timestamp with time zone'}

    for idx, elem in enumerate(row):
        sql_datatype = md_map.get(('properties', columns[idx]))['sql-datatype']
        cleaned_elem = selected_value_to_singer_value(elem, sql_datatype, conn_info)
        row_to_persist += (cleaned_elem,)

    rec = dict(zip(columns, row_to_persist))

    return singer.RecordMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        record=rec,
        version=version,
        time_extracted=time_extracted)
Example #7
def send_schema_message(stream, bookmark_properties):
    s_md = metadata.to_map(stream["metadata"])
    if s_md.get((), {}).get("is-view"):
        key_properties = s_md.get((), {}).get("view-key-properties", [])
    else:
        key_properties = s_md.get((), {}).get("table-key-properties", [])

    schema_message = {
        "type": "SCHEMA",
        "stream": post_db.calculate_destination_stream_name(stream, s_md),
        "schema": stream["schema"],
        "key_properties": key_properties,
        "bookmark_properties": bookmark_properties,
    }

    write_schema_message(schema_message)
Example #8
def send_schema_message(stream, bookmark_properties):
    s_md = metadata.to_map(stream['metadata'])
    if s_md.get((), {}).get('is-view'):
        key_properties = s_md.get((), {}).get('view-key-properties', [])
    else:
        key_properties = s_md.get((), {}).get('table-key-properties', [])

    filtered_stream = copy.deepcopy(stream)

    for column_name in stream['schema']['properties']:
        if not should_sync_column(s_md, column_name):
            del filtered_stream['schema']['properties'][column_name]

    schema_message = {
        'type': 'SCHEMA',
        'stream': post_db.calculate_destination_stream_name(stream),
        'schema': filtered_stream['schema'],
        'key_properties': key_properties,
        'bookmark_properties': bookmark_properties
    }

    write_schema_message(schema_message)
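
should_sync_column is not shown in these examples. A plausible sketch of such a predicate, following common Singer field-selection conventions (an assumption, not the tap's actual helper): automatic columns are always emitted, unsupported ones never, and available ones only when selected.

def should_sync_column(md_map, field_name):
    # hypothetical reconstruction of the selection rules
    field_md = md_map.get(('properties', field_name), {})
    if field_md.get('inclusion') == 'automatic':
        return True
    if field_md.get('inclusion') == 'unsupported':
        return False
    return field_md.get('selected') is True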
Example #9
def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    stream_version = singer.get_bookmark(state, stream['tap_stream_id'],
                                         'version')
    if stream_version is None:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get('schema-name')

    escaped_columns = map(
        partial(post_db.prepare_columns_for_select_sql, md_map=md_map),
        desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        version=stream_version)

    singer.write_message(activate_version_message)

    replication_key = md_map.get((), {}).get('replication-key')
    replication_key_value = singer.get_bookmark(state, stream['tap_stream_id'],
                                                'replication_key_value')
    replication_key_sql_datatype = md_map.get(
        ('properties', replication_key)).get('sql-datatype')

    hstore_available = post_db.hstore_available(conn_info)
    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:

            # Client side character encoding defaults to the value in postgresql.conf under client_encoding.
            # The server / db can also have its own configured encoding.
            with conn.cursor() as cur:
                cur.execute("show server_encoding")
                LOGGER.info("Current Server Encoding: %s", cur.fetchone()[0])
                cur.execute("show client_encoding")
                LOGGER.info("Current Client Encoding: %s", cur.fetchone()[0])

            if hstore_available:
                LOGGER.info("hstore is available")
                psycopg2.extras.register_hstore(conn)
            else:
                LOGGER.info("hstore is UNavailable")

            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor,
                             name='pipelinewise') as cur:
                cur.itersize = post_db.CURSOR_ITER_SIZE
                LOGGER.info("Beginning new incremental replication sync %s",
                            stream_version)
                if replication_key_value:
                    select_sql = """SELECT {}
                                    FROM {}
                                    WHERE {} >= '{}'::{}
                                    ORDER BY {} ASC""".format(
                        ','.join(escaped_columns),
                        post_db.fully_qualified_table_name(
                            schema_name, stream['table_name']),
                        post_db.prepare_columns_sql(replication_key),
                        replication_key_value, replication_key_sql_datatype,
                        post_db.prepare_columns_sql(replication_key))
                else:
                    # if not replication_key_value
                    select_sql = """SELECT {}
                                    FROM {}
                                    ORDER BY {} ASC""".format(
                        ','.join(escaped_columns),
                        post_db.fully_qualified_table_name(
                            schema_name, stream['table_name']),
                        post_db.prepare_columns_sql(replication_key))

                LOGGER.info('select statement: %s with itersize %s',
                            select_sql, cur.itersize)
                cur.execute(select_sql)

                rows_saved = 0

                for rec in cur:
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, stream_version, desired_columns,
                        time_extracted, md_map)

                    singer.write_message(record_message)
                    rows_saved = rows_saved + 1

                    # Picking a replication_key with NULL values will result in it ALWAYS being synced, which is not great;
                    # even worse would be allowing the NULL value to enter into the state
                    if record_message.record[replication_key] is not None:
                        state = singer.write_bookmark(
                            state, stream['tap_stream_id'],
                            'replication_key_value',
                            record_message.record[replication_key])

                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    return state
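
The version and replication_key_value bookkeeping above round-trips through singer-python's bookmark helpers. A minimal self-contained sketch, assuming the singer-python package and a hypothetical tap_stream_id:

import singer

state = {}
state = singer.write_bookmark(state, 'public-users', 'version', 1700000000000)
state = singer.write_bookmark(state, 'public-users',
                              'replication_key_value', '2024-01-01T00:00:00Z')

assert singer.get_bookmark(state, 'public-users', 'version') == 1700000000000
# state is now:
# {'bookmarks': {'public-users': {'version': 1700000000000,
#                                 'replication_key_value': '2024-01-01T00:00:00Z'}}}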
Example #10
def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'],
                                    'version') is None

    # pick a new table version IFF we do not have an xmin in our state
    # the presence of an xmin indicates that we were interrupted last time through
    if singer.get_bookmark(state, stream['tap_stream_id'], 'xmin') is None:
        nascent_stream_version = int(time.time() * 1000)
    else:
        nascent_stream_version = singer.get_bookmark(state,
                                                     stream['tap_stream_id'],
                                                     'version')

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get('schema-name')

    escaped_columns = map(
        partial(post_db.prepare_columns_for_select_sql, md_map=md_map),
        desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    hstore_available = post_db.hstore_available(conn_info)
    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:

            # Client side character encoding defaults to the value in postgresql.conf under client_encoding.
            # The server / db can also have its own configured encoding.
            with conn.cursor() as cur:
                cur.execute("show server_encoding")
                LOGGER.info("Current Server Encoding: %s", cur.fetchone()[0])
                cur.execute("show client_encoding")
                LOGGER.info("Current Client Encoding: %s", cur.fetchone()[0])

            if hstore_available:
                LOGGER.info("hstore is available")
                psycopg2.extras.register_hstore(conn)
            else:
                LOGGER.info("hstore is UNavailable")

            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor,
                             name='stitch_cursor') as cur:
                cur.itersize = post_db.CURSOR_ITER_SIZE

                fq_table_name = post_db.fully_qualified_table_name(
                    schema_name, stream['table_name'])
                xmin = singer.get_bookmark(state, stream['tap_stream_id'],
                                           'xmin')
                if xmin:
                    LOGGER.info(
                        "Resuming Full Table replication %s from xmin %s",
                        nascent_stream_version, xmin)
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {} where age(xmin::xid) <= age('{}'::xid)
                                     ORDER BY xmin::text ASC""".format(
                        ','.join(escaped_columns), fq_table_name, xmin)
                else:
                    LOGGER.info("Beginning new Full Table replication %s",
                                nascent_stream_version)
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {}
                                     ORDER BY xmin::text ASC""".format(
                        ','.join(escaped_columns), fq_table_name)

                LOGGER.info("select %s with itersize %s", select_sql,
                            cur.itersize)
                cur.execute(select_sql)

                rows_saved = 0
                for rec in cur:
                    xmin = rec['xmin']
                    rec = rec[:-1]
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, nascent_stream_version, desired_columns,
                        time_extracted, md_map)
                    singer.write_message(record_message)
                    state = singer.write_bookmark(state,
                                                  stream['tap_stream_id'],
                                                  'xmin', xmin)
                    rows_saved = rows_saved + 1
                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    # once we have completed the full table replication, discard the xmin bookmark.
    # the xmin bookmark only comes into play when a full table replication is interrupted
    state = singer.write_bookmark(state, stream['tap_stream_id'], 'xmin', None)

    # always send the activate version whether first run or subsequent
    singer.write_message(activate_version_message)

    return state
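
The rec = rec[:-1] step above works because psycopg2's DictRow supports both name lookup and slicing: the query appends xmin as the last selected column, the bookmark is read by name, and the slice drops it before the row is mapped onto desired_columns. A toy illustration with plain Python stand-ins:

rec = {'id': 1, 'name': 'alice', 'xmin': 123456}   # stand-in for a DictRow
values = list(rec.values())

xmin_bookmark = rec['xmin']   # read the resume bookmark by name
row = values[:-1]             # drop the trailing xmin column
assert row == [1, 'alice']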
Example #11
def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    stream_version = singer.get_bookmark(state, stream["tap_stream_id"], "version")
    if stream_version is None:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream["tap_stream_id"], "version", stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get("schema-name")

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map), version=stream_version
    )

    singer.write_message(activate_version_message)

    replication_key = md_map.get((), {}).get("replication-key")
    replication_key_value = singer.get_bookmark(
        state, stream["tap_stream_id"], "replication_key_value"
    )
    replication_key_sql_datatype = md_map.get(("properties", replication_key)).get("sql-datatype")

    hstore_available = post_db.hstore_available(conn_info)
    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:

            # Client side character encoding defaults to client_encoding in postgresql.conf.
            # The server / db can also have its own configured encoding.
            with conn.cursor() as cur:
                cur.execute("show server_encoding")
                LOGGER.info("Current Server Encoding: %s", cur.fetchone()[0])
                cur.execute("show client_encoding")
                LOGGER.info("Current Client Encoding: %s", cur.fetchone()[0])

            if hstore_available:
                LOGGER.info("hstore is available")
                psycopg2.extras.register_hstore(conn)
            else:
                LOGGER.info("hstore is UNavailable")

            with conn.cursor(
                cursor_factory=psycopg2.extras.DictCursor, name="stitch_cursor"
            ) as cur:
                cur.itersize = post_db.cursor_iter_size
                LOGGER.info("Beginning new incremental replication sync %s", stream_version)
                if replication_key_value:
                    select_sql = """SELECT {}
                                    FROM {}
                                    WHERE {} > '{}'::{}
                                    ORDER BY {} ASC""".format(
                        ",".join(escaped_columns),
                        post_db.fully_qualified_table_name(schema_name, stream["table_name"]),
                        post_db.prepare_columns_sql(replication_key),
                        replication_key_value,
                        replication_key_sql_datatype,
                        post_db.prepare_columns_sql(replication_key),
                    )
                else:
                    # if not replication_key_value
                    select_sql = """SELECT {}
                                    FROM {}
                                    ORDER BY {} ASC""".format(
                        ",".join(escaped_columns),
                        post_db.fully_qualified_table_name(schema_name, stream["table_name"]),
                        post_db.prepare_columns_sql(replication_key),
                    )

                LOGGER.info("select statement: %s with itersize %s", select_sql, cur.itersize)
                cur.execute(select_sql)
                LOGGER.info("Query returned - processing results")

                rows_saved = 0

                for rec in cur:
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, stream_version, desired_columns, time_extracted, md_map
                    )
                    singer.write_message(record_message)
                    rows_saved = rows_saved + 1

                    # Picking a replication_key with NULL values will result in it ALWAYS being
                    # synced which is not great. Even worse would be allowing the NULL value to
                    # enter into the state.
                    if record_message.record[replication_key] is not None:
                        state = singer.write_bookmark(
                            state,
                            stream["tap_stream_id"],
                            "replication_key_value",
                            record_message.record[replication_key],
                        )

                    if rows_saved % conn_info["emit_state_every_n_rows"] == 0:
                        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    LOGGER.info("Incremental table tap complete")
    return state
Example #12
def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream["tap_stream_id"],
                                    "version") is None

    # pick a new table version IFF we do not have an xmin in our state
    # the presence of an xmin indicates that we were interrupted last time through
    if singer.get_bookmark(state, stream["tap_stream_id"], "xmin") is None:
        nascent_stream_version = int(time.time() * 1000)
    else:
        nascent_stream_version = singer.get_bookmark(state,
                                                     stream["tap_stream_id"],
                                                     "version")

    state = singer.write_bookmark(state, stream["tap_stream_id"], "version",
                                  nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get("schema-name")

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        version=nascent_stream_version,
    )

    if first_run:
        singer.write_message(activate_version_message)

    hstore_available = post_db.hstore_available(conn_info)
    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:

            # Client side character encoding defaults to client_encoding in postgresql.conf.
            # The server / db can also have its own configured encoding.
            with conn.cursor() as cur:
                cur.execute("show server_encoding")
                LOGGER.info("Current Server Encoding: %s", cur.fetchone()[0])
                cur.execute("show client_encoding")
                LOGGER.info("Current Client Encoding: %s", cur.fetchone()[0])

            if hstore_available:
                LOGGER.info("hstore is available")
                psycopg2.extras.register_hstore(conn)
            else:
                LOGGER.info("hstore is UNavailable")

            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor,
                             name="stitch_cursor") as cur:
                cur.itersize = post_db.cursor_iter_size

                xmin = singer.get_bookmark(state, stream["tap_stream_id"],
                                           "xmin")
                if xmin:
                    LOGGER.info(
                        "Resuming Full Table replication %s from xmin %s",
                        nascent_stream_version,
                        xmin,
                    )
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {} where age(xmin::xid) <= age('{}'::xid)
                                     ORDER BY xmin::text::bigint ASC""".format(
                        ",".join(escaped_columns),
                        post_db.fully_qualified_table_name(
                            schema_name, stream["table_name"]),
                        xmin,
                    )
                else:
                    LOGGER.info(
                        "Beginning new Full Table replication %s",
                        nascent_stream_version,
                    )
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {}
                                     ORDER BY xmin::text::bigint ASC""".format(
                        ",".join(escaped_columns),
                        post_db.fully_qualified_table_name(
                            schema_name, stream["table_name"]),
                    )

                LOGGER.info("select %s with itersize %s", select_sql,
                            cur.itersize)
                cur.execute(select_sql)
                LOGGER.info("Query returned - processing results")

                rows_saved = 0
                for rec in cur:
                    xmin = rec["xmin"]
                    rec = rec[:-1]
                    record_message = post_db.selected_row_to_singer_message(
                        stream,
                        rec,
                        nascent_stream_version,
                        desired_columns,
                        time_extracted,
                        md_map,
                    )
                    singer.write_message(record_message)
                    state = singer.write_bookmark(state,
                                                  stream["tap_stream_id"],
                                                  "xmin", xmin)
                    rows_saved = rows_saved + 1
                    if rows_saved % conn_info["emit_state_every_n_rows"] == 0:
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

                LOGGER.info("Processing complete - saved %s rows", rows_saved)

    # once we have completed the full table replication, discard the xmin bookmark.
    # the xmin bookmark only comes into play when a full table replication is interrupted
    LOGGER.info("Writing bookmark")
    state = singer.write_bookmark(state, stream["tap_stream_id"], "xmin", None)

    # always send the activate version whether first run or subsequent
    LOGGER.info("Sending activate version message")
    singer.write_message(activate_version_message)

    LOGGER.info("Full table tap complete")
    return state
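
Finally, a minimal sketch of the periodic state emission pattern that every sync loop above shares, assuming the singer-python package (EMIT_EVERY stands in for UPDATE_BOOKMARK_PERIOD or conn_info['emit_state_every_n_rows']):

import copy
import singer

EMIT_EVERY = 1000                    # hypothetical bookmark period
state = {'bookmarks': {}}

rows_saved = 0
for _ in range(2500):                # stand-in for the server-side cursor
    rows_saved += 1
    if rows_saved % EMIT_EVERY == 0:
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))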