def consume_message(streams, state, msg, time_extracted, conn_info):
    payload = json.loads(msg.payload)
    lsn = msg.data_start

    streams_lookup = {}
    for s in streams:
        streams_lookup[s['tap_stream_id']] = s

    for c in payload['change']:
        tap_stream_id = post_db.compute_tap_stream_id(conn_info['dbname'],
                                                      c['schema'], c['table'])
        if streams_lookup.get(tap_stream_id) is None:
            continue

        target_stream = streams_lookup[tap_stream_id]
        stream_version = get_stream_version(target_stream['tap_stream_id'], state)
        stream_md_map = metadata.to_map(target_stream['metadata'])

        if c['kind'] == 'insert':
            col_vals = c['columnvalues'] + [None]
            col_names = c['columnnames'] + ['_sdc_deleted_at']
            record_message = row_to_singer_message(target_stream, col_vals,
                                                   stream_version, col_names,
                                                   time_extracted, stream_md_map,
                                                   conn_info)
        elif c['kind'] == 'update':
            col_vals = c['columnvalues'] + [None]
            col_names = c['columnnames'] + ['_sdc_deleted_at']
            record_message = row_to_singer_message(target_stream, col_vals,
                                                   stream_version, col_names,
                                                   time_extracted, stream_md_map,
                                                   conn_info)
        elif c['kind'] == 'delete':
            col_names = c['oldkeys']['keynames'] + ['_sdc_deleted_at']
            col_vals = c['oldkeys']['keyvalues'] + [singer.utils.strftime(time_extracted)]
            record_message = row_to_singer_message(target_stream, col_vals,
                                                   stream_version, col_names,
                                                   time_extracted, stream_md_map,
                                                   conn_info)
        else:
            raise Exception("unrecognized replication operation: {}".format(c['kind']))

        sync_common.send_schema_message(target_stream, ['lsn'])
        singer.write_message(record_message)
        state = singer.write_bookmark(state, target_stream['tap_stream_id'], 'lsn', lsn)

    LOGGER.debug("sending feedback to server with NO flush_lsn. just a keep-alive")
    msg.cursor.send_feedback()

    LOGGER.debug("sending feedback to server. flush_lsn = %s", msg.data_start)
    msg.cursor.send_feedback(flush_lsn=msg.data_start)

    return state
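# For reference, a wal2json format-version 1 payload groups every change in a
# transaction under a single 'change' array. The shape below is illustrative
# (table and values are made up) but matches the keys consume_message reads:
#
# {
#   "change": [
#     {"kind": "insert",
#      "schema": "public",
#      "table": "users",
#      "columnnames": ["id", "name"],
#      "columnvalues": [1, "alice"]},
#     {"kind": "delete",
#      "schema": "public",
#      "table": "users",
#      "oldkeys": {"keynames": ["id"], "keyvalues": [1]}}
#   ]
# }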
def consume_message_format_2(payload, conn_info, streams_lookup, state, time_extracted, lsn):
    ## Action Types:
    # I = Insert
    # U = Update
    # D = Delete
    # B = Begin Transaction
    # C = Commit Transaction
    # M = Message
    # T = Truncate
    action = payload['action']
    if action not in ['U', 'I', 'D']:
        LOGGER.debug("Skipping message of type %s", action)
        yield None
    else:
        tap_stream_id = post_db.compute_tap_stream_id(conn_info['dbname'],
                                                      payload['schema'], payload['table'])
        if streams_lookup.get(tap_stream_id) is None:
            yield None
        else:
            target_stream = streams_lookup[tap_stream_id]
            stream_version = get_stream_version(target_stream['tap_stream_id'], state)
            stream_md_map = metadata.to_map(target_stream['metadata'])

            desired_columns = [col for col in target_stream['schema']['properties'].keys()
                               if sync_common.should_sync_column(stream_md_map, col)]

            col_names = []
            col_vals = []
            if payload['action'] in ['I', 'U']:
                for column in payload['columns']:
                    if column['name'] in set(desired_columns):
                        col_names.append(column['name'])
                        col_vals.append(column['value'])

                col_names = col_names + ['_sdc_deleted_at']
                col_vals = col_vals + [None]

                if conn_info.get('debug_lsn'):
                    col_names = col_names + ['_sdc_lsn']
                    col_vals = col_vals + [str(lsn)]
            elif payload['action'] == 'D':
                for column in payload['identity']:
                    if column['name'] in set(desired_columns):
                        col_names.append(column['name'])
                        col_vals.append(column['value'])

                col_names = col_names + ['_sdc_deleted_at']
                col_vals = col_vals + [singer.utils.strftime(
                    singer.utils.strptime_to_utc(payload['timestamp']))]

                if conn_info.get('debug_lsn'):
                    col_names = col_names + ['_sdc_lsn']
                    col_vals = col_vals + [str(lsn)]

            # Yield 1 record to match the API of V1
            yield row_to_singer_message(target_stream, col_vals, stream_version,
                                        col_names, time_extracted, stream_md_map,
                                        conn_info)

            state = singer.write_bookmark(state, target_stream['tap_stream_id'], 'lsn', lsn)
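# A minimal sketch of how a caller might dispatch between the two wal2json
# message formats: format-version 2 payloads carry a top-level 'action' key
# and describe one change each, while format-version 1 payloads carry a
# 'change' array. This dispatcher is an assumption for illustration, not the
# module's actual entry point.
def _dispatch_payload(payload, conn_info, streams_lookup, state, time_extracted, lsn):
    if 'action' in payload:
        # e.g. {"action": "I", "schema": "public", "table": "users",
        #       "columns": [{"name": "id", "type": "integer", "value": 1}]}
        yield from consume_message_format_2(payload, conn_info, streams_lookup,
                                            state, time_extracted, lsn)
    else:
        yield from consume_message_format_1(payload, conn_info, streams_lookup,
                                            state, time_extracted, lsn)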
def discover_columns(connection, table_info):
    """ Generates more info about columns of the given table """
    entries = []
    for schema_name in table_info.keys():
        for table_name in table_info[schema_name].keys():
            mdata = {}
            columns = table_info[schema_name][table_name]['columns']
            table_pks = [col_name for col_name, col_info in columns.items()
                         if col_info.is_primary_key]

            with connection.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                cur.execute("SELECT current_database()")
                database_name = cur.fetchone()[0]

            metadata.write(mdata, (), 'table-key-properties', table_pks)
            metadata.write(mdata, (), 'schema-name', schema_name)
            metadata.write(mdata, (), 'database-name', database_name)
            metadata.write(mdata, (), 'row-count',
                           table_info[schema_name][table_name]['row_count'])
            metadata.write(mdata, (), 'is-view',
                           table_info[schema_name][table_name].get('is_view'))

            column_schemas = {col_name: schema_for_column(col_info)
                              for col_name, col_info in columns.items()}

            schema = {'type': 'object',
                      'properties': column_schemas,
                      'definitions': {}}

            schema = include_array_schemas(columns, schema)

            for c_name in column_schemas.keys():
                mdata = write_sql_data_type_md(mdata, columns[c_name])

                if column_schemas[c_name].get('type') is None:
                    mdata = metadata.write(mdata, ('properties', c_name),
                                           'inclusion', 'unsupported')
                    mdata = metadata.write(mdata, ('properties', c_name),
                                           'selected-by-default', False)
                elif table_info[schema_name][table_name]['columns'][c_name].is_primary_key:
                    mdata = metadata.write(mdata, ('properties', c_name),
                                           'inclusion', 'automatic')
                    mdata = metadata.write(mdata, ('properties', c_name),
                                           'selected-by-default', True)
                else:
                    mdata = metadata.write(mdata, ('properties', c_name),
                                           'inclusion', 'available')
                    mdata = metadata.write(mdata, ('properties', c_name),
                                           'selected-by-default', True)

            entry = {'table_name': table_name,
                     'stream': table_name,
                     'metadata': metadata.to_list(mdata),
                     'tap_stream_id': post_db.compute_tap_stream_id(schema_name, table_name),
                     'schema': schema}

            entries.append(entry)

    return entries
def consume_message(streams, state, msg, time_extracted, conn_info, end_lsn):
    payload = json.loads(msg.payload)
    lsn = msg.data_start

    streams_lookup = {}
    for s in streams:
        streams_lookup[s['tap_stream_id']] = s

    for c in payload['change']:
        tap_stream_id = post_db.compute_tap_stream_id(conn_info['dbname'],
                                                      c['schema'], c['table'])
        if streams_lookup.get(tap_stream_id) is None:
            continue

        target_stream = streams_lookup[tap_stream_id]
        stream_version = get_stream_version(target_stream['tap_stream_id'], state)
        stream_md_map = metadata.to_map(target_stream['metadata'])

        desired_columns = [col for col in target_stream['schema']['properties'].keys()
                           if sync_common.should_sync_column(stream_md_map, col)]

        if c['kind'] == 'insert':
            col_names = []
            col_vals = []
            for idx, col in enumerate(c['columnnames']):
                if col in set(desired_columns):
                    col_names.append(col)
                    col_vals.append(c['columnvalues'][idx])

            col_names = col_names + ['_sdc_deleted_at']
            col_vals = col_vals + [None]

            if conn_info.get('debug_lsn'):
                col_names = col_names + ['_sdc_lsn']
                col_vals = col_vals + [str(lsn)]

            record_message = row_to_singer_message(target_stream, col_vals,
                                                   stream_version, col_names,
                                                   time_extracted, stream_md_map,
                                                   conn_info)
        elif c['kind'] == 'update':
            col_names = []
            col_vals = []
            for idx, col in enumerate(c['columnnames']):
                if col in set(desired_columns):
                    col_names.append(col)
                    col_vals.append(c['columnvalues'][idx])

            col_names = col_names + ['_sdc_deleted_at']
            col_vals = col_vals + [None]

            if conn_info.get('debug_lsn'):
                col_names = col_names + ['_sdc_lsn']
                col_vals = col_vals + [str(lsn)]

            record_message = row_to_singer_message(target_stream, col_vals,
                                                   stream_version, col_names,
                                                   time_extracted, stream_md_map,
                                                   conn_info)
        elif c['kind'] == 'delete':
            col_names = []
            col_vals = []
            for idx, col in enumerate(c['oldkeys']['keynames']):
                if col in set(desired_columns):
                    col_names.append(col)
                    col_vals.append(c['oldkeys']['keyvalues'][idx])

            col_names = col_names + ['_sdc_deleted_at']
            col_vals = col_vals + [singer.utils.strftime(time_extracted)]

            if conn_info.get('debug_lsn'):
                col_names = col_names + ['_sdc_lsn']
                col_vals = col_vals + [str(lsn)]

            record_message = row_to_singer_message(target_stream, col_vals,
                                                   stream_version, col_names,
                                                   time_extracted, stream_md_map,
                                                   conn_info)
        else:
            raise Exception("unrecognized replication operation: {}".format(c['kind']))

        singer.write_message(record_message)
        state = singer.write_bookmark(state, target_stream['tap_stream_id'], 'lsn', lsn)

    LOGGER.debug("sending feedback to server with NO flush_lsn. just a keep-alive")
    msg.cursor.send_feedback()

    LOGGER.debug("sending feedback to server. flush_lsn = %s", msg.data_start)
    if msg.data_start > end_lsn:
        raise Exception("incorrectly attempting to flush an lsn({}) > end_lsn({})".format(
            msg.data_start, end_lsn))

    msg.cursor.send_feedback(flush_lsn=msg.data_start)

    return state
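# LSNs arrive from psycopg2 as 64-bit integers (msg.data_start above). A small
# helper like the sketch below, assuming the standard pg_lsn "X/Y" notation
# (high 32 bits / low 32 bits, both hex), makes log output comparable with what
# Postgres reports in pg_replication_slots. Hypothetical helper, for
# illustration only.
def int_to_lsn_str(lsn_int):
    # e.g. (1 << 32) + 0x10 -> '1/10'
    return '{:X}/{:X}'.format(lsn_int >> 32, lsn_int & 0xFFFFFFFF)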
def consume_message(streams, state, msg, time_extracted, conn_info):
    # Strip leading comma generated by write-in-chunks and parse valid JSON
    try:
        payload = json.loads(msg.payload.lstrip(','))
    except Exception:
        return state

    lsn = msg.data_start

    streams_lookup = {s['tap_stream_id']: s for s in streams}

    tap_stream_id = post_db.compute_tap_stream_id(payload['schema'], payload['table'])
    if streams_lookup.get(tap_stream_id) is None:
        return state

    target_stream = streams_lookup[tap_stream_id]

    if payload['kind'] not in {'insert', 'update', 'delete'}:
        raise UnsupportedPayloadKindError(
            f"unrecognized replication operation: {payload['kind']}")

    # Get the additional fields in the payload that are not in the schema
    # properties: only inserts and updates carry the list of columns, which
    # can be used to detect any difference in columns
    diff = set()
    if payload['kind'] in {'insert', 'update'}:
        diff = set(payload['columnnames']).difference(
            target_stream['schema']['properties'].keys())

    # If the payload contains new columns that are not in the schema
    # properties, refresh the stream schema
    if diff:
        LOGGER.info('Detected new columns "%s", refreshing schema of stream %s',
                    diff, target_stream['stream'])
        # encountered a column that is not in the schema
        # refresh the stream schema and metadata by running discovery
        refresh_streams_schema(conn_info, [target_stream])

        # add the automatic properties back to the stream
        add_automatic_properties(target_stream, conn_info.get('debug_lsn', False))

        # publish new schema
        sync_common.send_schema_message(target_stream, ['lsn'])

    stream_version = get_stream_version(target_stream['tap_stream_id'], state)
    stream_md_map = metadata.to_map(target_stream['metadata'])

    desired_columns = {c for c in target_stream['schema']['properties'].keys()
                       if sync_common.should_sync_column(stream_md_map, c)}

    if payload['kind'] in {'insert', 'update'}:
        col_names = []
        col_vals = []

        for idx, col in enumerate(payload['columnnames']):
            if col in desired_columns:
                col_names.append(col)
                col_vals.append(payload['columnvalues'][idx])

        col_names.append('_sdc_deleted_at')
        col_vals.append(None)

        if conn_info.get('debug_lsn'):
            col_names.append('_sdc_lsn')
            col_vals.append(str(lsn))

        record_message = row_to_singer_message(target_stream, col_vals,
                                               stream_version, col_names,
                                               time_extracted, stream_md_map,
                                               conn_info)
    elif payload['kind'] == 'delete':
        col_names = []
        col_vals = []

        for idx, col in enumerate(payload['oldkeys']['keynames']):
            if col in desired_columns:
                col_names.append(col)
                col_vals.append(payload['oldkeys']['keyvalues'][idx])

        col_names.append('_sdc_deleted_at')
        col_vals.append(singer.utils.strftime(time_extracted))

        if conn_info.get('debug_lsn'):
            col_names.append('_sdc_lsn')
            col_vals.append(str(lsn))

        record_message = row_to_singer_message(target_stream, col_vals,
                                               stream_version, col_names,
                                               time_extracted, stream_md_map,
                                               conn_info)

    singer.write_message(record_message)
    state = singer.write_bookmark(state, target_stream['tap_stream_id'], 'lsn', lsn)

    return state
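# add_automatic_properties (called above) is expected to re-attach the
# tap-generated columns that discovery does not know about. A minimal sketch,
# assuming the schema shapes used elsewhere in this module; the real
# implementation may differ.
def add_automatic_properties_sketch(stream, debug_lsn=False):
    # _sdc_deleted_at records when a delete was observed in the WAL stream
    stream['schema']['properties']['_sdc_deleted_at'] = {
        'type': ['null', 'string'],
        'format': 'date-time',
    }
    if debug_lsn:
        # _sdc_lsn carries the WAL position of the change, for debugging
        stream['schema']['properties']['_sdc_lsn'] = {'type': ['null', 'string']}
    return stream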
def consume_message_format_1(payload, conn_info, streams_lookup, state, time_extracted, lsn):
    for c in payload['change']:
        tap_stream_id = post_db.compute_tap_stream_id(conn_info['dbname'],
                                                      c['schema'], c['table'])
        if streams_lookup.get(tap_stream_id) is None:
            continue

        target_stream = streams_lookup[tap_stream_id]
        stream_version = get_stream_version(target_stream['tap_stream_id'], state)
        stream_md_map = metadata.to_map(target_stream['metadata'])

        desired_columns = [col for col in target_stream['schema']['properties'].keys()
                           if sync_common.should_sync_column(stream_md_map, col)]

        if c['kind'] == 'insert':
            col_names = []
            col_vals = []
            for idx, col in enumerate(c['columnnames']):
                if col in set(desired_columns):
                    col_names.append(col)
                    col_vals.append(c['columnvalues'][idx])

            col_names = col_names + ['_sdc_deleted_at']
            col_vals = col_vals + [None]

            if conn_info.get('debug_lsn'):
                col_names = col_names + ['_sdc_lsn']
                col_vals = col_vals + [str(lsn)]

            record_message = row_to_singer_message(target_stream, col_vals,
                                                   stream_version, col_names,
                                                   time_extracted, stream_md_map,
                                                   conn_info)
        elif c['kind'] == 'update':
            col_names = []
            col_vals = []
            for idx, col in enumerate(c['columnnames']):
                if col in set(desired_columns):
                    col_names.append(col)
                    col_vals.append(c['columnvalues'][idx])

            col_names = col_names + ['_sdc_deleted_at']
            col_vals = col_vals + [None]

            if conn_info.get('debug_lsn'):
                col_names = col_names + ['_sdc_lsn']
                col_vals = col_vals + [str(lsn)]

            record_message = row_to_singer_message(target_stream, col_vals,
                                                   stream_version, col_names,
                                                   time_extracted, stream_md_map,
                                                   conn_info)
        elif c['kind'] == 'delete':
            col_names = []
            col_vals = []
            for idx, col in enumerate(c['oldkeys']['keynames']):
                if col in set(desired_columns):
                    col_names.append(col)
                    col_vals.append(c['oldkeys']['keyvalues'][idx])

            col_names = col_names + ['_sdc_deleted_at']
            col_vals = col_vals + [singer.utils.strftime(time_extracted)]

            if conn_info.get('debug_lsn'):
                col_names = col_names + ['_sdc_lsn']
                col_vals = col_vals + [str(lsn)]

            record_message = row_to_singer_message(target_stream, col_vals,
                                                   stream_version, col_names,
                                                   time_extracted, stream_md_map,
                                                   conn_info)
        else:
            raise Exception("unrecognized replication operation: {}".format(c['kind']))

        yield record_message
        state = singer.write_bookmark(state, target_stream['tap_stream_id'], 'lsn', lsn)
def consume_message(streams, state, msg, time_extracted, conn_info, end_lsn):
    # Strip leading comma generated by write-in-chunks and parse valid JSON
    try:
        payload = json.loads(msg.payload.lstrip(','))
    except Exception:
        return state

    lsn = msg.data_start

    streams_lookup = {}
    for s in streams:
        streams_lookup[s['tap_stream_id']] = s

    tap_stream_id = post_db.compute_tap_stream_id(payload['schema'], payload['table'])
    if streams_lookup.get(tap_stream_id) is None:
        return state

    target_stream = streams_lookup[tap_stream_id]
    stream_version = get_stream_version(target_stream['tap_stream_id'], state)
    stream_md_map = metadata.to_map(target_stream['metadata'])

    desired_columns = [col for col in target_stream['schema']['properties'].keys()
                       if sync_common.should_sync_column(stream_md_map, col)]

    if payload['kind'] == 'insert':
        col_names = []
        col_vals = []
        for idx, col in enumerate(payload['columnnames']):
            if col in set(desired_columns):
                col_names.append(col)
                col_vals.append(payload['columnvalues'][idx])

        col_names = col_names + ['_sdc_deleted_at']
        col_vals = col_vals + [None]

        if conn_info.get('debug_lsn'):
            col_names = col_names + ['_sdc_lsn']
            col_vals = col_vals + [str(lsn)]

        record_message = row_to_singer_message(target_stream, col_vals,
                                               stream_version, col_names,
                                               time_extracted, stream_md_map,
                                               conn_info)
    elif payload['kind'] == 'update':
        col_names = []
        col_vals = []
        for idx, col in enumerate(payload['columnnames']):
            if col in set(desired_columns):
                col_names.append(col)
                col_vals.append(payload['columnvalues'][idx])

        col_names = col_names + ['_sdc_deleted_at']
        col_vals = col_vals + [None]

        if conn_info.get('debug_lsn'):
            col_names = col_names + ['_sdc_lsn']
            col_vals = col_vals + [str(lsn)]

        record_message = row_to_singer_message(target_stream, col_vals,
                                               stream_version, col_names,
                                               time_extracted, stream_md_map,
                                               conn_info)
    elif payload['kind'] == 'delete':
        col_names = []
        col_vals = []
        for idx, col in enumerate(payload['oldkeys']['keynames']):
            if col in set(desired_columns):
                col_names.append(col)
                col_vals.append(payload['oldkeys']['keyvalues'][idx])

        col_names = col_names + ['_sdc_deleted_at']
        col_vals = col_vals + [singer.utils.strftime(time_extracted)]

        if conn_info.get('debug_lsn'):
            col_names = col_names + ['_sdc_lsn']
            col_vals = col_vals + [str(lsn)]

        record_message = row_to_singer_message(target_stream, col_vals,
                                               stream_version, col_names,
                                               time_extracted, stream_md_map,
                                               conn_info)
    else:
        raise Exception("unrecognized replication operation: {}".format(payload['kind']))

    singer.write_message(record_message)
    state = singer.write_bookmark(state, target_stream['tap_stream_id'], 'lsn', lsn)

    # Below is the behaviour of the original tap-postgres: flush the source
    # server WAL to the latest LSN received in the current run. The
    # PipelineWise version flushes only at the start of the next run, to
    # ensure the data has been committed on the destination server.
    #
    # if msg.data_start > end_lsn:
    #     raise Exception("incorrectly attempting to flush an lsn({}) > end_lsn({})".format(
    #         msg.data_start, end_lsn))
    # LOGGER.info("Confirming write up to {}, flush to {}".format(
    #     int_to_lsn(msg.data_start), int_to_lsn(msg.data_start)))
    # msg.cursor.send_feedback(write_lsn=msg.data_start, flush_lsn=msg.data_start, reply=True)

    return state
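# A minimal sketch of how a consume_message variant is typically wired into
# psycopg2's logical replication API. The slot name, plugin options, and the
# surrounding setup are assumptions for illustration (the slot must already
# exist); the tap's real runner differs.
import psycopg2
import psycopg2.extras


def stream_logical_changes(dsn, streams, state, conn_info, end_lsn):
    conn = psycopg2.connect(
        dsn, connection_factory=psycopg2.extras.LogicalReplicationConnection)
    with conn.cursor() as cur:
        # wal2json emits the JSON payloads consumed above; 'write-in-chunks'
        # is what produces the leading commas stripped by consume_message
        cur.start_replication(slot_name='pipelinewise_slot', decode=True,
                              options={'write-in-chunks': 1})

        def consume(msg):
            nonlocal state
            state = consume_message(streams, state, msg,
                                    singer.utils.now(), conn_info, end_lsn)

        # Blocks and invokes consume() for every replication message received
        cur.consume_stream(consume)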
def discover_columns(connection, table_info):
    entries = []
    for schema_name in table_info.keys():
        for table_name in table_info[schema_name].keys():
            mdata = {}
            columns = table_info[schema_name][table_name]["columns"]
            table_pks = [col_name for col_name, col_info in columns.items()
                         if col_info.is_primary_key]

            with connection.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                cur.execute("SELECT current_database()")
                database_name = cur.fetchone()[0]

            metadata.write(mdata, (), "table-key-properties", table_pks)
            metadata.write(mdata, (), "schema-name", schema_name)
            metadata.write(mdata, (), "database-name", database_name)
            metadata.write(mdata, (), "row-count",
                           table_info[schema_name][table_name]["row_count"])
            metadata.write(mdata, (), "is-view",
                           table_info[schema_name][table_name].get("is_view"))

            column_schemas = {col_name: schema_for_column(col_info)
                              for col_name, col_info in columns.items()}

            schema = {"type": "object",
                      "properties": column_schemas,
                      "definitions": {}}

            schema = include_array_schemas(columns, schema)

            for c_name in column_schemas.keys():
                mdata = write_sql_data_type_md(mdata, columns[c_name])

                if column_schemas[c_name].get("type") is None:
                    mdata = metadata.write(mdata, ("properties", c_name),
                                           "inclusion", "unsupported")
                    mdata = metadata.write(mdata, ("properties", c_name),
                                           "selected-by-default", False)
                elif table_info[schema_name][table_name]["columns"][c_name].is_primary_key:
                    mdata = metadata.write(mdata, ("properties", c_name),
                                           "inclusion", "automatic")
                    mdata = metadata.write(mdata, ("properties", c_name),
                                           "selected-by-default", True)
                else:
                    mdata = metadata.write(mdata, ("properties", c_name),
                                           "inclusion", "available")
                    mdata = metadata.write(mdata, ("properties", c_name),
                                           "selected-by-default", True)

            entry = {
                "table_name": table_name,
                "stream": table_name,
                "metadata": metadata.to_list(mdata),
                "tap_stream_id": post_db.compute_tap_stream_id(
                    database_name, schema_name, table_name),
                "schema": schema,
            }

            entries.append(entry)

    return entries
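# For orientation, a single entry produced by discover_columns looks roughly
# like the sketch below. Names, counts, and the exact column schema are
# illustrative; metadata is shown after metadata.to_list() flattens it.
#
# {
#     "table_name": "users",
#     "stream": "users",
#     "tap_stream_id": "mydb-public-users",
#     "schema": {
#         "type": "object",
#         "properties": {"id": {"type": ["null", "integer"]}},
#         "definitions": {}
#     },
#     "metadata": [
#         {"breadcrumb": [],
#          "metadata": {"table-key-properties": ["id"],
#                       "schema-name": "public",
#                       "database-name": "mydb",
#                       "row-count": 42,
#                       "is-view": False}},
#         {"breadcrumb": ["properties", "id"],
#          "metadata": {"inclusion": "automatic",
#                       "selected-by-default": True}}
#     ]
# }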