def get_binlog_streams(mssql_conn, catalog, config, state):
    """Return the Catalog of selected streams to sync via log-based replication.

    A stream qualifies when its replication-method is LOG_BASED and it already
    has a bookmark in state — i.e. its historical (initial) sync has completed.
    LOG_BASED streams without state are handled by get_non_binlog_streams,
    which performs the historical sync first.

    :param mssql_conn: open MSSQL connection used for catalog discovery
    :param catalog: the input Catalog (with selection metadata)
    :param config: tap configuration dict
    :param state: tap state dict (expects a "bookmarks" mapping)
    :return: resolved Catalog containing only the qualifying streams
    """
    discovered = discover_catalog(mssql_conn, config)
    selected_streams = list(filter(lambda s: common.stream_is_selected(s), catalog.streams))

    binlog_streams = []
    for stream in selected_streams:
        stream_metadata = metadata.to_map(stream.metadata)
        replication_method = stream_metadata.get((), {}).get("replication-method")
        stream_state = state.get("bookmarks", {}).get(stream.tap_stream_id)
        # BUG FIX: the original loop computed replication_method and
        # stream_state but never appended to binlog_streams, so this function
        # always returned an empty stream set. A LOG_BASED stream with
        # existing state has finished its historical sync and belongs here.
        if replication_method == "LOG_BASED" and stream_state:
            binlog_streams.append(stream)

    return resolve_catalog(discovered, binlog_streams)
def get_non_binlog_streams(mssql_conn, catalog, config, state):
    """Return the Catalog of streams to sync via SELECT-based replication.

    Covers INCREMENTAL, FULL_TABLE, and LOG_BASED streams that still require
    a historical (initial) sync. A LOG_BASED stream is considered to need a
    historical sync when it has no entry under state's "bookmarks".

    The input Catalog is compared against a freshly discovered Catalog; the
    result contains every selected stream that currently exists in the
    database, with selected columns plus "automatic" columns (e.g. primary
    keys and replication keys).

    Streams are prioritized in the following order:
      1. currently_syncing, if it is SELECT-based and still valid
      2. streams without state (never synced)
      3. streams with state (synced in the past)

    :param mssql_conn: MSSQL connection (see NOTE below — currently replaced)
    :param catalog: the input Catalog (with selection metadata)
    :param config: tap configuration dict
    :param state: tap state dict (expects a "bookmarks" mapping)
    :return: resolved Catalog of streams to sync, in priority order
    """
    # NOTE(review): the passed-in mssql_conn is immediately shadowed by a new
    # connection built from config, making the parameter effectively unused.
    # Behavior preserved for compatibility — confirm whether callers rely on
    # the connection they pass in.
    mssql_conn = MSSQLConnection(config)
    discovered = discover_catalog(mssql_conn, config)

    # Filter catalog to include only selected streams.
    selected_streams = list(filter(lambda s: common.stream_is_selected(s), catalog.streams))

    # Partition by presence of state: no bookmark means the stream has never
    # been synced (for LOG_BASED, it still needs its historical sync).
    # Cleanup: removed leftover per-stream debug logging and the unused
    # replication-method lookup from the original loop.
    streams_with_state = []
    streams_without_state = []
    for stream in selected_streams:
        stream_state = state.get("bookmarks", {}).get(stream.tap_stream_id)
        if not stream_state:
            streams_without_state.append(stream)
        else:
            streams_with_state.append(stream)

    # If the state says we were in the middle of processing a stream, skip to
    # that stream. Then process streams without prior state, and finally move
    # onto streams with state (i.e. have been synced in the past).
    currently_syncing = singer.get_currently_syncing(state)

    # Prioritize streams that have not been processed yet.
    ordered_streams = streams_without_state + streams_with_state

    if currently_syncing:
        currently_syncing_stream = list(
            filter(
                lambda s: s.tap_stream_id == currently_syncing
                and is_valid_currently_syncing_stream(s, state),
                streams_with_state,
            )
        )
        non_currently_syncing_streams = list(
            filter(lambda s: s.tap_stream_id != currently_syncing, ordered_streams)
        )
        streams_to_sync = currently_syncing_stream + non_currently_syncing_streams
    else:
        streams_to_sync = ordered_streams

    return resolve_catalog(discovered, streams_to_sync)