def send_schema_message(stream, bookmark_properties):
    s_md = metadata.to_map(stream.metadata)
    if s_md.get((), {}).get('is-view'):
        key_properties = s_md.get((), {}).get('view-key-properties')
    else:
        key_properties = s_md.get((), {}).get('table-key-properties')

    schema_message = singer.SchemaMessage(stream=(stream.tap_stream_id or stream.stream),
                                          schema=stream.schema.to_dict(),
                                          key_properties=key_properties,
                                          bookmark_properties=bookmark_properties)
    singer.write_message(schema_message)
def resume_syncing_bulk_query(sf, catalog_entry, job_id, state, counter):
    bulk = Bulk(sf)
    current_bookmark = singer.get_bookmark(state,
                                           catalog_entry['tap_stream_id'],
                                           'JobHighestBookmarkSeen') or sf.get_start_date(state, catalog_entry)
    current_bookmark = singer_utils.strptime_with_tz(current_bookmark)
    batch_ids = singer.get_bookmark(state, catalog_entry['tap_stream_id'], 'BatchIDs')

    start_time = singer_utils.now()
    stream = catalog_entry['stream']
    stream_alias = catalog_entry.get('stream_alias')
    catalog_metadata = metadata.to_map(catalog_entry.get('metadata'))
    replication_key = catalog_metadata.get((), {}).get('replication-key')
    stream_version = get_stream_version(catalog_entry, state)
    schema = catalog_entry['schema']

    if not bulk.job_exists(job_id):
        LOGGER.info("Found stored Job ID that no longer exists, resetting bookmark and removing JobID from state.")
        return counter

    # Iterate over the remaining batches, removing them once they are synced
    for batch_id in batch_ids[:]:
        with Transformer(pre_hook=transform_bulk_data_hook) as transformer:
            for rec in bulk.get_batch_results(job_id, batch_id, catalog_entry):
                counter.increment()
                rec = transformer.transform(rec, schema)
                rec = fix_record_anytype(rec, schema)
                singer.write_message(
                    singer.RecordMessage(stream=(stream_alias or stream),
                                         record=rec,
                                         version=stream_version,
                                         time_extracted=start_time))

                # Update bookmark if necessary
                replication_key_value = replication_key and singer_utils.strptime_with_tz(rec[replication_key])
                if replication_key_value and replication_key_value <= start_time \
                        and replication_key_value > current_bookmark:
                    current_bookmark = singer_utils.strptime_with_tz(rec[replication_key])
                    state = singer.write_bookmark(state,
                                                  catalog_entry['tap_stream_id'],
                                                  'JobHighestBookmarkSeen',
                                                  singer_utils.strftime(current_bookmark))

        batch_ids.remove(batch_id)
        LOGGER.info("Finished syncing batch %s. Removing batch from state.", batch_id)
        LOGGER.info("Batches to go: %d", len(batch_ids))
        singer.write_state(state)

    return counter
def sync(self):
    """
    Perform the sync action.

    These steps are the same for all streams. Differences between streams
    are implemented by overriding the .do_sync() method.
    """
    if not self.KEEP_IDS and not self.include_stream:
        LOGGER.info('Skipping stream %s - excluded in catalog', self.STREAM_NAME)
        return

    # Amazon doesn't guarantee that all orders created after the CreatedAfter
    # date that you specify will be returned.
    new_bookmark_date = self.bookmark_date = self.starting_bookmark_date()

    # Will be set to False if we stop early due to reaching the end of a batch,
    # to tell the runner to continue with the next batch.
    all_done = True

    singer.write_schema(self.STREAM_NAME, self.schema, self.key_properties)

    rows = self.request_list()
    self.ids = []

    with singer.metrics.Counter('record_count', {'endpoint': self.STREAM_NAME}) as counter:
        for row in rows:
            row_as_dict = self.row_to_dict(row)
            if self.KEEP_IDS:
                self.ids.append(row_as_dict[self.ID_FIELD])
            self.remove_excluded_fields(row_as_dict)
            message = singer.RecordMessage(
                stream=self.STREAM_NAME,
                record=row_as_dict,
                time_extracted=singer.utils.now())
            if self.include_stream:
                singer.write_message(message)
            if self.BOOKMARK_FIELD:
                new_bookmark_date = max(new_bookmark_date, row_as_dict[self.BOOKMARK_FIELD])
            counter.increment()

            # Stop if we've done enough for one batch
            if self.BATCH_SIZE and counter.value >= self.BATCH_SIZE:
                # Sync action stopped due to end of batch - so probably more rows.
                # Note that there is a 1/BATCH_SIZE chance that the end of a batch is
                # exactly the end of the whole process. In that case the runner will
                # make one more .sync request, for one more (empty) batch.
                all_done = False
                break

    if self.BOOKMARK_FIELD:
        singer.write_bookmark(self.state, self.STREAM_NAME, self.BOOKMARK_FIELD, new_bookmark_date)

    return all_done
def do_sync_incremental(mssql_conn, config, catalog_entry, state, columns):
    mssql_conn = MSSQLConnection(config)
    md_map = metadata.to_map(catalog_entry.metadata)
    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)
    replication_key = md_map.get((), {}).get("replication-key")

    write_schema_message(catalog_entry=catalog_entry, bookmark_properties=[replication_key])
    LOGGER.info("Schema written")

    incremental.sync_table(mssql_conn, config, catalog_entry, state, columns)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def sync_binlog_stream(mysql_conn: MySQLConnection,
                       config: Dict,
                       binlog_streams_map: Dict[str, Any],
                       state: Dict) -> None:
    """
    Capture the binlog events created between the position in the state and the current
    master position, and create Singer streams to be flushed to stdout.

    Args:
        mysql_conn: mysql connection instance
        config: tap config
        binlog_streams_map: tables to stream using binlog
        state: the current state
    """
    for tap_stream_id in binlog_streams_map:
        common.whitelist_bookmark_keys(BOOKMARK_KEYS, tap_stream_id, state)

    log_file = log_pos = gtid = None

    if config['use_gtid']:
        gtid = calculate_gtid_bookmark(mysql_conn, binlog_streams_map, state, config['engine'])
    else:
        log_file, log_pos = calculate_bookmark(mysql_conn, binlog_streams_map, state)

    reader = None

    try:
        reader = create_binlog_stream_reader(config, log_file, log_pos, gtid)

        end_log_file, end_log_pos = fetch_current_log_file_and_pos(mysql_conn)
        LOGGER.info('Current Master binlog file and pos: %s %s', end_log_file, end_log_pos)

        _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state, config, end_log_file, end_log_pos)

    except pymysql.err.OperationalError as ex:
        if ex.args[0] == 1236:
            LOGGER.error('Cannot resume logical replication from given GTID %s! This GTID might date back to '
                         'before the new primary was set up; connect to the old primary and consume all binlog '
                         'events to get a newer GTID, then switch back.', gtid)
        raise

    finally:
        # BinLogStreamReader doesn't implement the `with` methods,
        # so try/finally is used to close the connection chain from the top.
        if reader:
            reader.close()

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def sync_table_file(config: Dict, s3_path: str, table_spec: Dict, stream: Dict) -> int:
    """
    Sync a given CSV file found in S3.

    :param config: tap configuration
    :param s3_path: file path given by S3
    :param table_spec: table specs
    :param stream: stream data
    :return: number of streamed records
    """
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    sync_one_one = config.get('sync_one_one', "True")
    if sync_one_one in (True, "True", "true"):
        sync_one_one = True
    elif sync_one_one in (False, "False", "false"):
        sync_one_one = False
    else:
        raise Exception("Don't understand sync_one_one param in config, must be boolean")
    table_name = table_spec['table_name']

    s3_file_handle, tags = s3.get_file_handle_custom(config, s3_path)
    # We observed data whose field size exceeded the default maximum of 131072.
    # We believe the primary consequence of the following setting is that a
    # malformed, wide CSV would potentially parse into a single large field
    # rather than raising this error, but we also think the chances of that are
    # very small, and in any case the source data would need to be fixed. The
    # other consequence could be larger memory consumption, but that's
    # acceptable as well.
    csv.field_size_limit(sys.maxsize)
    iterator = get_row_iterator(s3_file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    records_synced = 0

    for row in iterator:
        if not sync_one_one:
            custom_columns = {
                s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
                s3.SDC_SOURCE_FILE_COLUMN: s3_path,
                # index zero, +1 for header row
                s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2
            }
            rec = {**row, **custom_columns}

            with Transformer() as transformer:
                to_write = transformer.transform(rec, stream['schema'], metadata.to_map(stream['metadata']))

            write_record(table_name, to_write)
        if sync_one_one:
            write_message(
                OneOneMessage(table_name, row, TagSet=tags, sync_one_one=sync_one_one, _sdc_source_file=s3_path))

        records_synced += 1

    return records_synced
def sync_stream(config, state, stream):
    table_name = stream['tap_stream_id']

    md_map = metadata.to_map(stream['metadata'])
    replication_method = metadata.get(md_map, (), 'replication-method')
    key_properties = metadata.get(md_map, (), 'table-key-properties')

    # write state message with currently_syncing bookmark
    state = clear_state_on_replication_change(stream, state)
    state = singer.set_currently_syncing(state, table_name)
    singer.write_state(state)

    singer.write_message(singer.SchemaMessage(
        stream=table_name,
        schema=stream['schema'],
        key_properties=key_properties))

    rows_saved = 0
    if replication_method == 'FULL_TABLE':
        LOGGER.info("Syncing full table for stream: %s", table_name)
        rows_saved += sync_full_table(config, state, stream)
    elif replication_method == 'LOG_BASED':
        LOGGER.info("Syncing log based for stream: %s", table_name)

        if has_stream_aged_out(config, state, stream):
            LOGGER.info("Clearing state because stream has aged out")
            state.get('bookmarks', {}).pop(table_name)

        # TODO Check to see if latest stream ARN has changed and wipe state if so

        if not singer.get_bookmark(state, table_name, 'initial_full_table_complete'):
            msg = 'Must complete full table sync before replicating from dynamodb streams for %s'
            LOGGER.info(msg, table_name)

            # Only mark latest sequence numbers in dynamo streams on first sync so
            # the tap has a starting point after the full table sync
            if not singer.get_bookmark(state, table_name, 'version'):
                latest_sequence_numbers = get_latest_seq_numbers(config, stream)
                state = singer.write_bookmark(state, table_name, 'shard_seq_numbers', latest_sequence_numbers)

            rows_saved += sync_full_table(config, state, stream)

        rows_saved += sync_log_based(config, state, stream)
    else:
        LOGGER.info('Unknown replication method: %s for stream: %s', replication_method, table_name)

    return rows_saved
def sync_view(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    # Before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream.tap_stream_id, 'version') is None
    nascent_stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream.tap_stream_id, 'version', nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get('schema-name')

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=stream.stream,
        version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                select_sql = 'SELECT {} FROM {}'.format(
                    ','.join(escaped_columns),
                    post_db.fully_qualified_table_name(schema_name, stream.table))

                LOGGER.info("select %s", select_sql)
                cur.execute(select_sql)

                rows_saved = 0
                rec = cur.fetchone()
                while rec is not None:
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, nascent_stream_version, desired_columns, time_extracted, md_map)
                    singer.write_message(record_message)
                    rows_saved = rows_saved + 1
                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
                    counter.increment()
                    rec = cur.fetchone()

    # Always send the activate version, whether first run or subsequent
    singer.write_message(activate_version_message)

    return state
def sync_traditional_stream(conn_config, stream, state, sync_method, end_lsn):
    LOGGER.info("Beginning sync of stream(%s) with sync method(%s)", stream['tap_stream_id'], sync_method)

    md_map = metadata.to_map(stream['metadata'])
    conn_config['dbname'] = md_map.get(()).get('database-name')
    desired_columns = [c for c in stream['schema']['properties'].keys()
                       if sync_common.should_sync_column(md_map, c)]
    desired_columns.sort()

    if len(desired_columns) == 0:
        LOGGER.warning('There are no columns selected for stream %s, skipping it', stream['tap_stream_id'])
        return state

    register_type_adapters(conn_config)

    if sync_method == 'full':
        state = singer.set_currently_syncing(state, stream['tap_stream_id'])
        state = do_sync_full_table(conn_config, stream, state, desired_columns, md_map)
    elif sync_method == 'incremental':
        state = singer.set_currently_syncing(state, stream['tap_stream_id'])
        state = do_sync_incremental(conn_config, stream, state, desired_columns, md_map)
    elif sync_method == 'logical_initial':
        state = singer.set_currently_syncing(state, stream['tap_stream_id'])
        LOGGER.info("Performing initial full table sync")
        state = singer.write_bookmark(state, stream['tap_stream_id'], 'lsn', end_lsn)

        sync_common.send_schema_message(stream, [])
        state = full_table.sync_table(conn_config, stream, state, desired_columns, md_map)
        state = singer.write_bookmark(state, stream['tap_stream_id'], 'xmin', None)
    elif sync_method == 'logical_initial_interrupted':
        state = singer.set_currently_syncing(state, stream['tap_stream_id'])
        LOGGER.info("Initial stage of full table sync was interrupted. Resuming...")
        sync_common.send_schema_message(stream, [])
        state = full_table.sync_table(conn_config, stream, state, desired_columns, md_map)
    else:
        raise Exception("unknown sync method {} for stream {}".format(sync_method, stream['tap_stream_id']))

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state
def sync_table(mysql_conn, catalog_entry, state, columns):
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, catalog_entry.tap_stream_id, state)

    catalog_metadata = metadata.to_map(catalog_entry.metadata)
    stream_metadata = catalog_metadata.get((), {})

    replication_key_metadata = stream_metadata.get("replication-key")
    replication_key_state = singer.get_bookmark(state, catalog_entry.tap_stream_id, "replication_key")

    replication_key_value = None

    if replication_key_metadata == replication_key_state:
        replication_key_value = singer.get_bookmark(state, catalog_entry.tap_stream_id, "replication_key_value")
    else:
        state = singer.write_bookmark(state, catalog_entry.tap_stream_id, "replication_key",
                                      replication_key_metadata)
        state = singer.clear_bookmark(state, catalog_entry.tap_stream_id, "replication_key_value")

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)
    state = singer.write_bookmark(state, catalog_entry.tap_stream_id, "version", stream_version)

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream,
        version=stream_version)

    singer.write_message(activate_version_message)

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            select_sql = common.generate_select_sql(catalog_entry, columns)
            params = {}

            if replication_key_value is not None:
                if catalog_entry.schema.properties[replication_key_metadata].format == "date-time":
                    replication_key_value = pendulum.parse(replication_key_value)

                select_sql += " WHERE `{}` >= %(replication_key_value)s ORDER BY `{}` ASC".format(
                    replication_key_metadata, replication_key_metadata)

                params["replication_key_value"] = replication_key_value
            elif replication_key_metadata is not None:
                select_sql += " ORDER BY `{}` ASC".format(replication_key_metadata)

            common.sync_query(cur, catalog_entry, state, select_sql, columns, stream_version, params)
def sync_view(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    # Before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None
    nascent_stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version', nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get('schema-name')

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor, name='stitch_cursor') as cur:
                cur.itersize = post_db.CURSOR_ITER_SIZE
                select_sql = f"SELECT {','.join(escaped_columns)} FROM " \
                             f"{post_db.fully_qualified_table_name(schema_name, stream['table_name'])}"

                LOGGER.info("select %s with itersize %s", select_sql, cur.itersize)
                cur.execute(select_sql)

                rows_saved = 0
                for rec in cur:
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, nascent_stream_version, desired_columns, time_extracted, md_map)
                    singer.write_message(record_message)
                    rows_saved += 1
                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
                    counter.increment()

    # Always send the activate version, whether first run or subsequent
    singer.write_message(activate_version_message)

    return state
def write_schema_message(catalog_entry, bookmark_properties=None):
    if bookmark_properties is None:
        bookmark_properties = []
    key_properties = get_key_properties(catalog_entry)

    singer.write_message(singer.SchemaMessage(
        stream=catalog_entry.stream,
        schema=catalog_entry.schema.to_dict(),
        key_properties=key_properties,
        bookmark_properties=bookmark_properties,
    ))
def do_sync(state):
    '''Main function for syncing'''
    LOGGER.info("Starting sync")

    urls = get_starting_urls(state)
    LOGGER.info('I will sync urls in this order: %s', urls)

    for url in urls:
        for msg in sync_endpoint(url, state):
            singer.write_message(msg)
        state[NEXT] = None
        state[LAST_START_DATE] = state[THIS_START_DATE]
        state[THIS_START_DATE] = None
        singer.write_state(state)

    LOGGER.info("Sync completed")
def sync_stream(config, state, stream):
    table_name = stream['tap_stream_id']

    md_map = metadata.to_map(stream['metadata'])
    replication_method = metadata.get(md_map, (), 'replication-method')
    key_properties = metadata.get(md_map, (), 'table-key-properties')

    # write state message with currently_syncing bookmark
    state = clear_state_on_replication_change(stream, state)
    state = singer.set_currently_syncing(state, table_name)
    singer.write_state(state)

    singer.write_message(singer.SchemaMessage(
        stream=table_name,
        schema=stream['schema'],
        key_properties=key_properties))

    rows_saved = 0
    if replication_method == 'FULL_TABLE':
        LOGGER.info("Syncing full table for stream: %s", table_name)
        rows_saved += full_table.sync(config, state, stream)
    elif replication_method == "QUERY":
        LOGGER.info(f"Syncing via a query for stream {table_name}")
        rows_saved += sync_query(config, state, stream)
    elif replication_method == 'LOG_BASED':
        LOGGER.info("Syncing log based for stream: %s", table_name)

        if log_based.has_stream_aged_out(state, table_name):
            LOGGER.info("Clearing state because stream has aged out")
            state.get('bookmarks', {}).pop(table_name)

        if not singer.get_bookmark(state, table_name, 'initial_full_table_complete'):
            msg = 'Must complete full table sync before replicating from dynamodb streams for %s'
            LOGGER.info(msg, table_name)

            state = log_based.get_initial_bookmarks(config, state, table_name)
            singer.write_state(state)

            rows_saved += full_table.sync(config, state, stream)

        rows_saved += log_based.sync(config, state, stream)
    else:
        LOGGER.info('Unknown replication method: %s for stream: %s', replication_method, table_name)

    state = singer.write_bookmark(state, table_name, 'success_timestamp',
                                  singer.utils.strftime(singer.utils.now()))
    singer.write_state(state)

    return rows_saved
def sync_table(mysql_conn, catalog_entry, state, columns, stream_version):
    common.whitelist_bookmark_keys(generate_bookmark_keys(catalog_entry), catalog_entry.tap_stream_id, state)

    bookmark = state.get('bookmarks', {}).get(catalog_entry.tap_stream_id, {})
    version_exists = 'version' in bookmark

    initial_full_table_complete = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                                      'initial_full_table_complete')

    state_version = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'version')

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream,
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if not initial_full_table_complete and not (version_exists and state_version is None):
        singer.write_message(activate_version_message)

    perform_resumable_sync = sync_is_resumable(mysql_conn, catalog_entry)

    pk_clause = ""

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            select_sql = common.generate_select_sql(catalog_entry, columns)

            if perform_resumable_sync:
                LOGGER.info("Full table sync is resumable based on primary key definition, "
                            "will replicate incrementally")

                state = update_incremental_full_table_state(catalog_entry, state, cur)
                pk_clause = generate_pk_clause(catalog_entry, state)

            select_sql += pk_clause
            params = {}

            common.sync_query(cur, catalog_entry, state, select_sql, columns, stream_version, params)

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, 'max_pk_values')
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, 'last_pk_fetched')

    singer.write_message(activate_version_message)
def sync_table(conn_info, stream, state, desired_columns, md_map):
    start_lsn = get_bookmark(state, stream.tap_stream_id, 'lsn')
    end_lsn = fetch_current_lsn(conn_info)
    time_extracted = utils.now()

    with post_db.open_connection(conn_info, True) as conn:
        with conn.cursor() as cur:
            LOGGER.info("Starting Logical Replication: %s(%s) -> %s", start_lsn, start_lsn, end_lsn)
            try:
                cur.start_replication(slot_name='stitch', decode=True, start_lsn=start_lsn)
            except psycopg2.ProgrammingError:
                raise Exception("unable to start replication with logical replication slot 'stitch'")

            cur.send_feedback(flush_lsn=start_lsn)

            keepalive_interval = 10.0
            rows_saved = 0

            while True:
                msg = cur.read_message()
                if msg:
                    skip_first_change = singer.get_bookmark(
                        state, stream.tap_stream_id, 'initial_logical_replication_complete') and rows_saved == 0
                    state = consume_message(stream, state, msg, time_extracted, md_map, conn_info,
                                            skip_first_change)
                    rows_saved = rows_saved + 1
                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
                else:
                    now = datetime.datetime.now()
                    timeout = keepalive_interval - (now - cur.io_timestamp).total_seconds()
                    try:
                        sel = select([cur], [], [], max(0, timeout))
                        if not any(sel):
                            break
                    except InterruptedError:
                        pass  # recalculate timeout and continue

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state
def sync_shard(shard, seq_number_bookmarks, streams_client, stream_arn, projection, deserializer,
               table_name, stream_version, state):
    seq_number = seq_number_bookmarks.get(shard['ShardId'])

    rows_synced = 0

    for record in get_shard_records(streams_client, stream_arn, shard, seq_number):
        if record['eventName'] == 'REMOVE':
            record_message = deserializer.deserialize_item(record['dynamodb']['Keys'])
            record_message[SDC_DELETED_AT] = singer.utils.strftime(
                record['dynamodb']['ApproximateCreationDateTime'])
        else:
            record_message = deserializer.deserialize_item(record['dynamodb'].get('NewImage'))
            if record_message is None:
                LOGGER.fatal('Dynamo stream view type must be either "NEW_IMAGE" or "NEW_AND_OLD_IMAGES"')
                raise RuntimeError('Dynamo stream view type must be either "NEW_IMAGE" or "NEW_AND_OLD_IMAGES"')
            if projection is not None and projection != '':
                try:
                    record_message = deserializer.apply_projection(record_message, projection)
                except:
                    LOGGER.fatal("Projection failed to apply: %s", projection)
                    raise RuntimeError('Projection failed to apply: {}'.format(projection))

        record_message = singer.RecordMessage(stream=table_name,
                                              record=record_message,
                                              version=stream_version)
        singer.write_message(record_message)

        rows_synced += 1

        seq_number_bookmarks[shard['ShardId']] = record['dynamodb']['SequenceNumber']
        state = singer.write_bookmark(state, table_name, 'shard_seq_numbers', seq_number_bookmarks)

        # Every 100 rows write the state
        if rows_synced % 100 == 0:
            singer.write_state(state)

    singer.write_state(state)
    return rows_synced
def handle_line(self, line):
    '''Takes a raw line from stdin and transforms it'''
    try:
        message = singer.parse_message(line)

        if not message:
            raise TransformFieldException('Unknown message type')
    except Exception as exc:
        raise TransformFieldException('Failed to process incoming message: {}\n{}'.format(line, exc))

    LOGGER.debug(message)

    # If we got a Schema, set the schema and key properties for this
    # stream. Flush the batch, if there is one, in case the schema is
    # different.
    if isinstance(message, singer.SchemaMessage):
        self.flush()

        self.stream_meta[message.stream] = StreamMeta(
            message.schema,
            message.key_properties,
            message.bookmark_properties)

        # Write the transformed message
        singer.write_message(message)

    elif isinstance(message, (singer.RecordMessage, singer.ActivateVersionMessage)):
        if self.messages and (message.stream != self.messages[0].stream or
                              message.version != self.messages[0].version):
            self.flush()

        self.messages.append(message)
        self.buffer_size_bytes += len(line)

        num_bytes = self.buffer_size_bytes
        num_messages = len(self.messages)
        num_seconds = time.time() - self.time_last_batch_sent

        enough_bytes = num_bytes >= self.max_batch_bytes
        enough_messages = num_messages >= self.max_batch_records
        enough_time = num_seconds >= self.batch_delay_seconds

        if enough_bytes or enough_messages or enough_time:
            LOGGER.debug('Flushing %d bytes, %d messages, after %.2f seconds',
                         num_bytes, num_messages, num_seconds)
            self.flush()

    elif isinstance(message, singer.StateMessage):
        self.state = message.value
def sync_view(conn_config, stream, state, desired_columns):
    connection = orc_db.open_connection(conn_config)
    connection.outputtypehandler = common.OutputTypeHandler

    cur = connection.cursor()
    cur.execute("ALTER SESSION SET TIME_ZONE = '00:00'")
    cur.execute("""ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS."00+00:00"'""")
    cur.execute("""ALTER SESSION SET NLS_TIMESTAMP_FORMAT='YYYY-MM-DD"T"HH24:MI:SSXFF"+00:00"'""")
    cur.execute("""ALTER SESSION SET NLS_TIMESTAMP_TZ_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS.FFTZH:TZM'""")
    time_extracted = utils.now()

    # Before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream.tap_stream_id, 'version') is None

    # Pick a new table version
    nascent_stream_version = int(time.time() * 1000)
    state = singer.write_bookmark(state, stream.tap_stream_id, 'version', nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    md = metadata.to_map(stream.metadata)
    schema_name = md.get(()).get('schema-name')

    escaped_columns = map(lambda c: common.prepare_columns_sql(stream, c), desired_columns)
    escaped_schema = schema_name
    escaped_table = stream.table
    activate_version_message = singer.ActivateVersionMessage(
        stream=stream.tap_stream_id,
        version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    with metrics.record_counter(None) as counter:
        select_sql = 'SELECT {} FROM {}.{}'.format(','.join(escaped_columns), escaped_schema, escaped_table)

        LOGGER.info("select %s", select_sql)
        for row in cur.execute(select_sql):
            record_message = common.row_to_singer_message(
                stream, row, nascent_stream_version, desired_columns, time_extracted)
            singer.write_message(record_message)
            counter.increment()

    # Always send the activate version, whether first run or subsequent
    singer.write_message(activate_version_message)
    cur.close()
    connection.close()
    return state
def sync_non_binlog_streams(mssql_conn, non_binlog_catalog, config, state):
    mssql_conn = MSSQLConnection(config)

    for catalog_entry in non_binlog_catalog.streams:
        columns = list(catalog_entry.schema.properties.keys())

        if not columns:
            LOGGER.warning("There are no columns selected for stream %s, skipping it.", catalog_entry.stream)
            continue

        state = singer.set_currently_syncing(state, catalog_entry.tap_stream_id)

        # Emit a state message to indicate that we've started this stream
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        md_map = metadata.to_map(catalog_entry.metadata)
        replication_method = md_map.get((), {}).get("replication-method")
        replication_key = md_map.get((), {}).get("replication-key")
        primary_keys = md_map.get((), {}).get("table-key-properties")
        LOGGER.info(f"Table {catalog_entry.table} proposes {replication_method} sync")

        if replication_method == "INCREMENTAL" and not replication_key:
            LOGGER.info(f"No replication key for {catalog_entry.table}, using full table replication")
            replication_method = "FULL_TABLE"
        if replication_method == "INCREMENTAL" and not primary_keys:
            LOGGER.info(f"No primary key for {catalog_entry.table}, using full table replication")
            replication_method = "FULL_TABLE"
        LOGGER.info(f"Table {catalog_entry.table} will use {replication_method} sync")

        database_name = common.get_database_name(catalog_entry)

        with metrics.job_timer("sync_table") as timer:
            timer.tags["database"] = database_name
            timer.tags["table"] = catalog_entry.table

            if replication_method == "INCREMENTAL":
                LOGGER.info(f"syncing {catalog_entry.table} incrementally")
                do_sync_incremental(mssql_conn, config, catalog_entry, state, columns)
            elif replication_method == "FULL_TABLE":
                LOGGER.info(f"syncing {catalog_entry.table} full table")
                do_sync_full_table(mssql_conn, config, catalog_entry, state, columns)
            else:
                raise Exception("only INCREMENTAL and FULL TABLE replication methods are supported")

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def do_sync_incremental(mysql_conn, catalog_entry, state, columns):
    LOGGER.info("Stream %s is using incremental replication", catalog_entry.stream)

    md_map = metadata.to_map(catalog_entry.metadata)
    replication_key = md_map.get((), {}).get('replication-key')

    if not replication_key:
        raise Exception(
            "Cannot use INCREMENTAL replication for table ({}) without a replication key.".format(
                catalog_entry.stream))

    write_schema_message(catalog_entry=catalog_entry, bookmark_properties=[replication_key])

    incremental.sync_table(mysql_conn, catalog_entry, state, columns)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def do_sync(conn_config, catalog, default_replication_method, state):
    streams = list(filter(is_selected_via_metadata, catalog.streams))
    streams.sort(key=lambda s: s.tap_stream_id)

    currently_syncing = singer.get_currently_syncing(state)
    if currently_syncing:
        streams = dropwhile(lambda s: s.tap_stream_id != currently_syncing, streams)

    for stream in streams:
        md_map = metadata.to_map(stream.metadata)
        conn_config['dbname'] = md_map.get(()).get('database-name')
        state = singer.set_currently_syncing(state, stream.tap_stream_id)

        desired_columns = [c for c in stream.schema.properties.keys() if should_sync_column(md_map, c)]
        desired_columns.sort()

        if len(desired_columns) == 0:
            LOGGER.warning('There are no columns selected for stream %s, skipping it', stream.tap_stream_id)
            continue

        replication_method = md_map.get((), {}).get('replication-method', default_replication_method)

        if replication_method == 'LOG_BASED' and md_map.get((), {}).get('is-view'):
            LOGGER.warning('Logical Replication is NOT supported for views. Skipping stream %s',
                           stream.tap_stream_id)
            continue

        if replication_method == 'LOG_BASED':
            state = do_sync_logical_replication(conn_config, stream, state, desired_columns, md_map)
        elif replication_method == 'FULL_TABLE':
            state = do_sync_full_table(conn_config, stream, state, desired_columns, md_map)
        else:
            raise Exception("only LOG_BASED and FULL_TABLE are supported right now :)")

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def sync_non_oplog_streams(client, streams, state):
    for stream in streams:
        md_map = metadata.to_map(stream['metadata'])
        stream_metadata = md_map.get(())
        select_clause = stream_metadata.get('custom-select-clause')

        if not select_clause:
            LOGGER.warning('There are no columns selected for stream %s, skipping it.', stream['tap_stream_id'])
            continue

        columns = [c.strip(' ') for c in select_clause.split(',')]
        columns.append('_id')

        state = singer.set_currently_syncing(state, stream['tap_stream_id'])

        # Emit a state message to indicate that we've started this stream
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        replication_method = stream_metadata.get('replication-method')
        database_name = get_database_name(stream)

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = database_name
            timer.tags['table'] = stream['table_name']

            if replication_method == 'LOG_BASED':
                do_sync_historical_oplog(client, stream, state, columns)
            elif replication_method == 'FULL_TABLE':
                write_schema_message(stream)
                stream_version = common.get_stream_version(stream['tap_stream_id'], state)
                full_table.sync_table(client, stream, state, stream_version, columns)

                state = singer.write_bookmark(state, stream['tap_stream_id'], 'initial_full_table_complete', True)
            else:
                raise Exception(
                    f"only LOG_BASED and FULL TABLE replication methods are supported "
                    f"(you passed {replication_method})")

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def sync_non_binlog_streams(mysql_conn, non_binlog_catalog, config, state, original_state_file=''):
    for catalog_entry in non_binlog_catalog.streams:
        columns = list(catalog_entry.schema.properties.keys())

        if not columns:
            LOGGER.warning('There are no columns selected for stream %s, skipping it.', catalog_entry.stream)
            continue

        state = singer.set_currently_syncing(state, catalog_entry.tap_stream_id)

        # Emit a state message to indicate that we've started this stream
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        md_map = metadata.to_map(catalog_entry.metadata)
        replication_method = md_map.get((), {}).get('replication-method')
        database_name = common.get_database_name(catalog_entry)

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = database_name
            timer.tags['table'] = catalog_entry.table

            log_engine(mysql_conn, catalog_entry)

            if replication_method == 'INCREMENTAL':
                do_sync_incremental(mysql_conn, catalog_entry, state, columns, original_state_file)
            elif replication_method == 'LOG_BASED':
                do_sync_historical_binlog(mysql_conn, config, catalog_entry, state, columns)
            elif replication_method == 'FULL_TABLE':
                do_sync_full_table(mysql_conn, config, catalog_entry, state, columns)
            else:
                raise Exception("only INCREMENTAL, LOG_BASED, and FULL TABLE replication methods are supported")

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def consume_message(streams, state, msg, time_extracted, conn_info):
    payload = json.loads(msg.payload)
    lsn = msg.data_start

    streams_lookup = {}
    for s in streams:
        streams_lookup[s['tap_stream_id']] = s

    for c in payload['change']:
        tap_stream_id = post_db.compute_tap_stream_id(conn_info['dbname'], c['schema'], c['table'])
        if streams_lookup.get(tap_stream_id) is None:
            continue

        target_stream = streams_lookup[tap_stream_id]
        stream_version = get_stream_version(target_stream['tap_stream_id'], state)
        stream_md_map = metadata.to_map(target_stream['metadata'])

        if c['kind'] == 'insert':
            col_vals = c['columnvalues'] + [None]
            col_names = c['columnnames'] + ['_sdc_deleted_at']
            record_message = row_to_singer_message(target_stream, col_vals, stream_version, col_names,
                                                   time_extracted, stream_md_map, conn_info)
        elif c['kind'] == 'update':
            col_vals = c['columnvalues'] + [None]
            col_names = c['columnnames'] + ['_sdc_deleted_at']
            record_message = row_to_singer_message(target_stream, col_vals, stream_version, col_names,
                                                   time_extracted, stream_md_map, conn_info)
        elif c['kind'] == 'delete':
            col_names = c['oldkeys']['keynames'] + ['_sdc_deleted_at']
            col_vals = c['oldkeys']['keyvalues'] + [singer.utils.strftime(time_extracted)]
            record_message = row_to_singer_message(target_stream, col_vals, stream_version, col_names,
                                                   time_extracted, stream_md_map, conn_info)
        else:
            raise Exception("unrecognized replication operation: {}".format(c['kind']))

        sync_common.send_schema_message(target_stream, ['lsn'])
        singer.write_message(record_message)
        state = singer.write_bookmark(state, target_stream['tap_stream_id'], 'lsn', lsn)

    LOGGER.debug("sending feedback to server with NO flush_lsn. just a keep-alive")
    msg.cursor.send_feedback()

    LOGGER.debug("sending feedback to server. flush_lsn = %s", msg.data_start)
    msg.cursor.send_feedback(flush_lsn=msg.data_start)

    return state
def sync_table(snowflake_conn, catalog_entry, state, columns):
    """Sync table incrementally"""
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, catalog_entry.tap_stream_id, state)

    catalog_metadata = metadata.to_map(catalog_entry.metadata)
    stream_metadata = catalog_metadata.get((), {})

    replication_key_metadata = stream_metadata.get('replication-key')
    replication_key_state = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'replication_key')

    replication_key_value = None

    if replication_key_metadata == replication_key_state:
        replication_key_value = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'replication_key_value')
    else:
        state = singer.write_bookmark(state, catalog_entry.tap_stream_id, 'replication_key',
                                      replication_key_metadata)
        state = singer.clear_bookmark(state, catalog_entry.tap_stream_id, 'replication_key_value')

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)
    state = singer.write_bookmark(state, catalog_entry.tap_stream_id, 'version', stream_version)

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream,
        version=stream_version)

    singer.write_message(activate_version_message)

    with snowflake_conn.connect_with_backoff() as open_conn:
        with open_conn.cursor() as cur:
            if replication_key_value is not None:
                if catalog_entry.schema.properties[replication_key_metadata].format == 'date-time':
                    replication_key_value = pendulum.parse(replication_key_value)

            select_sql = common.generate_sql_query(catalog_entry, columns, bookmark_value=replication_key_value)
            params = {}

            common.sync_query(cur, catalog_entry, state, select_sql, columns, stream_version, params)
def do_sync_full_table(mysql_conn, catalog_entry, state, columns):
    LOGGER.info("Stream %s is using full table replication", catalog_entry.stream)

    write_schema_message(catalog_entry)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)

    full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)

    # Prefer initial_full_table_complete going forward
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, "version")

    state = singer.write_bookmark(state, catalog_entry.tap_stream_id, "initial_full_table_complete", True)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def sync_traditional_stream(client: MongoClient, stream: Dict, state: Dict):
    """
    Sync the given stream.

    Args:
        client: MongoDB client instance
        stream: stream to sync
        state: state
    """
    tap_stream_id = stream['tap_stream_id']

    common.COUNTS[tap_stream_id] = 0
    common.TIMES[tap_stream_id] = 0
    common.SCHEMA_COUNT[tap_stream_id] = 0
    common.SCHEMA_TIMES[tap_stream_id] = 0

    md_map = metadata.to_map(stream['metadata'])
    replication_method = metadata.get(md_map, (), 'replication-method')

    if replication_method not in {INCREMENTAL_METHOD, FULL_TABLE_METHOD}:
        raise InvalidReplicationMethodException(
            replication_method, 'replication method needs to be either FULL_TABLE or INCREMENTAL')

    database_name = metadata.get(md_map, (), 'database-name')

    # Emit a state message to indicate that we've started this stream
    state = clear_state_on_replication_change(stream, state)
    state = singer.set_currently_syncing(state, stream['tap_stream_id'])
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    write_schema_message(stream)
    common.SCHEMA_COUNT[tap_stream_id] += 1

    with metrics.job_timer('sync_table') as timer:
        timer.tags['database'] = database_name
        timer.tags['table'] = stream['table_name']

        collection = client[database_name][stream["table_name"]]

        if replication_method == 'FULL_TABLE':
            full_table.sync_collection(collection, stream, state)
        else:
            incremental.sync_collection(collection, stream, state)

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def sync_binlog_stream(mysql_conn, config, binlog_streams, state):
    binlog_streams_map = generate_streams_map(binlog_streams)

    for tap_stream_id, _ in binlog_streams_map.items():
        common.whitelist_bookmark_keys(BOOKMARK_KEYS, tap_stream_id, state)

    log_file, log_pos = calculate_bookmark(mysql_conn, binlog_streams_map, state)

    verify_log_file_exists(mysql_conn, log_file, log_pos)

    if config.get('server_id'):
        server_id = int(config.get('server_id'))
        LOGGER.info("Using provided server_id=%s", server_id)
    else:
        server_id = fetch_server_id(mysql_conn)
        LOGGER.info("No server_id provided, will use global server_id=%s", server_id)

    connection_wrapper = make_connection_wrapper(config)

    reader = None
    try:
        slave_uuid = "bi-reader-%04x" % random.getrandbits(64)
        reader = BinLogStreamReader(
            connection_settings={},
            server_id=server_id,
            slave_uuid=slave_uuid,
            log_file=log_file,
            log_pos=log_pos,
            resume_stream=True,
            only_events=[RotateEvent, WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent],
            pymysql_wrapper=connection_wrapper,
        )
        LOGGER.info("Starting binlog replication with log_file=%s, log_pos=%s", log_file, log_pos)
        _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state, config)
    finally:
        # BinLogStreamReader doesn't implement the `with` methods,
        # so try/finally is used to close the connection chain from the top.
        # Guard against the reader never having been created.
        if reader:
            reader.close()

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def for_each_invoice(invoice, time_extracted, stream_version=None):
    def map_invoice_message(message):
        message['invoice_id'] = invoice['id']
        return message

    def map_invoice_payment(payment):
        payment['invoice_id'] = invoice['id']
        payment['payment_gateway_id'] = payment['payment_gateway']['id']
        payment['payment_gateway_name'] = payment['payment_gateway']['name']
        return payment

    # Sync invoice messages
    sync_endpoint("invoice_messages",
                  endpoint=("invoices/{}/messages".format(invoice['id'])),
                  path="invoice_messages",
                  with_updated_since=False,
                  map_handler=map_invoice_message,
                  stream_version=stream_version)

    # Sync invoice payments
    sync_endpoint("invoice_payments",
                  endpoint=("invoices/{}/payments".format(invoice['id'])),
                  path="invoice_payments",
                  with_updated_since=False,
                  map_handler=map_invoice_payment,
                  date_fields=["send_reminder_on"],
                  stream_version=stream_version)

    # Extract all invoice_line_items
    line_items_schema = load_and_write_schema("invoice_line_items")
    with Transformer() as transformer:
        for line_item in invoice['line_items']:
            line_item['invoice_id'] = invoice['id']
            if line_item['project'] is not None:
                line_item['project_id'] = line_item['project']['id']
            else:
                line_item['project_id'] = None
            line_item = transformer.transform(line_item, line_items_schema)

            new_record = singer.RecordMessage(
                stream="invoice_line_items",
                record=line_item,
                version=stream_version,
                time_extracted=time_extracted)
            singer.write_message(new_record)