def log_engine(mysql_conn, catalog_entry):
    """Log the start of a sync for *catalog_entry*.

    For views, just announce the sync. For physical tables, also look up
    the table's storage engine from information_schema and include it in
    the log line.
    """
    database_name = common.get_database_name(catalog_entry)

    # Views have no storage engine to report — log and bail out early.
    if common.get_is_view(catalog_entry):
        LOGGER.info("Beginning sync for view %s.%s",
                    database_name, catalog_entry.table)
        return

    engine_query = """
        SELECT engine
          FROM information_schema.tables
         WHERE table_schema = %s
           AND table_name = %s
    """
    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            cur.execute(engine_query, (database_name, catalog_entry.table))
            row = cur.fetchone()
            # row can be None if the table vanished between discovery and
            # sync; in that case we simply log nothing here.
            if row:
                LOGGER.info("Beginning sync for %s table %s.%s",
                            row[0], database_name, catalog_entry.table)
def sync_non_binlog_streams(mysql_conn, non_binlog_catalog, config, state):
    """Sync every stream in *non_binlog_catalog*, dispatching on each
    stream's 'replication-method' metadata.

    For each stream this:
      1. Skips streams with no selected columns (with a warning).
      2. Marks the stream as currently syncing and emits a STATE message.
      3. Runs the appropriate do_sync_* routine inside a job timer.

    Finally clears the currently-syncing marker and emits a final STATE.

    Raises:
        Exception: if a stream's replication method is not one of
            INCREMENTAL, LOG_BASED, or FULL_TABLE.
    """
    for catalog_entry in non_binlog_catalog.streams:
        columns = list(catalog_entry.schema.properties.keys())

        if not columns:
            LOGGER.warning('There are no columns selected for stream %s, skipping it.',
                           catalog_entry.stream)
            continue

        state = singer.set_currently_syncing(state, catalog_entry.tap_stream_id)

        # Emit a state message to indicate that we've started this stream
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        md_map = metadata.to_map(catalog_entry.metadata)
        replication_method = md_map.get((), {}).get('replication-method')
        database_name = common.get_database_name(catalog_entry)

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = database_name
            timer.tags['table'] = catalog_entry.table

            log_engine(mysql_conn, catalog_entry)

            if replication_method == 'INCREMENTAL':
                # incremental_limit caps how many rows are fetched per run
                optional_limit = config.get('incremental_limit')
                do_sync_incremental(mysql_conn, catalog_entry, state,
                                    columns, optional_limit)
            elif replication_method == 'LOG_BASED':
                do_sync_historical_binlog(mysql_conn, config, catalog_entry,
                                          state, columns)
            elif replication_method == 'FULL_TABLE':
                do_sync_full_table(mysql_conn, config, catalog_entry,
                                   state, columns)
            else:
                # Include the offending method and stream so the failure is
                # actionable (the old message omitted both and misspelled
                # FULL_TABLE).
                raise Exception(
                    "Unsupported replication method {} for stream {}: only "
                    "INCREMENTAL, LOG_BASED, and FULL_TABLE are supported".format(
                        replication_method, catalog_entry.stream))

    # All streams done: clear the currently-syncing marker and emit it.
    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def sync_is_resumable(mysql_conn, catalog_entry):
    """Return True if an interrupted FULL_TABLE sync of this table can be
    resumed.

    A table is resumable only when it has key properties and every primary
    key column has a data type in RESUMABLE_PK_TYPES.

    Raises:
        Exception: if a declared primary key column does not exist in
            information_schema (e.g. the table changed after discovery).
    """
    database_name = common.get_database_name(catalog_entry)
    key_properties = common.get_key_properties(catalog_entry)

    # No primary key at all -> cannot resume.
    if not key_properties:
        return False

    # Use parameterized placeholders; the previous version interpolated
    # the schema/table/column names with str.format, which is unsafe and
    # breaks on names containing quotes.
    sql = """SELECT data_type
               FROM information_schema.columns
              WHERE table_schema = %s
                AND table_name = %s
                AND column_name = %s
    """

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            for pk in key_properties:
                cur.execute(sql, (database_name, catalog_entry.table, pk))
                result = cur.fetchone()

                if not result:
                    raise Exception(
                        "Primary key column {} does not exist.".format(pk))

                if result[0] not in RESUMABLE_PK_TYPES:
                    # LOGGER.warn is deprecated; use warning().
                    LOGGER.warning(
                        "Found primary key column %s with type %s. Will not be able "
                        "to resume interrupted FULL_TABLE sync using this key.",
                        pk, result[0])
                    return False

    return True
def resolve_catalog(discovered_catalog, streams_to_sync):
    """Match each stream selected for sync against the discovered catalog
    and return a new Catalog containing only the selected columns (plus the
    replication key) with the discovered schemas.

    Streams that no longer exist in the discovered catalog are skipped
    with a warning.
    """
    result = Catalog(streams=[])

    # Iterate over the streams in the input catalog and match each one up
    # with the same stream in the discovered catalog.
    for entry in streams_to_sync:
        entry_md = metadata.to_map(entry.metadata)
        replication_key = entry_md.get((), {}).get('replication-key')

        discovered_table = discovered_catalog.get_stream(entry.tap_stream_id)
        database_name = common.get_database_name(entry)

        if not discovered_table:
            LOGGER.warning('Database %s table %s was selected but does not exist',
                           database_name, entry.table)
            continue

        # A column is kept if it is selected, or if it is the replication
        # key (which must always be synced).
        selected = {
            column
            for column in entry.schema.properties
            if common.property_is_selected(entry, column) or column == replication_key
        }

        # These are the columns we need to select
        columns = desired_columns(selected, discovered_table.schema)

        resolved_schema = Schema(
            type='object',
            properties={col: discovered_table.schema.properties[col]
                        for col in columns}
        )

        result.streams.append(CatalogEntry(
            tap_stream_id=entry.tap_stream_id,
            metadata=entry.metadata,
            stream=entry.stream,
            table=entry.table,
            schema=resolved_schema
        ))

    return result