def sync(config, state, catalog):
    for catalog_entry in catalog.streams:
        catalog_metadata = metadata.to_map(catalog_entry.metadata)
        replication_key = catalog_metadata.get((), {}).get('replication-key')

        state = singer.set_currently_syncing(state, catalog_entry.tap_stream_id)
        _emit(singer.StateMessage(value=state))

        if catalog_entry.is_view:
            key_properties = metadata.to_map(catalog_entry.metadata).get((), {}).get('view-key-properties', [])
        else:
            key_properties = metadata.to_map(catalog_entry.metadata).get((), {}).get('table-key-properties', [])

        _emit(singer.SchemaMessage(stream=catalog_entry.stream,
                                   schema=catalog_entry.schema.to_dict(),
                                   key_properties=key_properties,
                                   bookmark_properties=replication_key))

        with metrics.job_timer("sync_table") as timer:
            timer.tags["schema"] = catalog_entry.database
            timer.tags["table"] = catalog_entry.table
            _sync_table(config, state, catalog_entry)

    state = singer.set_currently_syncing(state, None)
    _emit(singer.StateMessage(value=state))

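# A minimal sketch of the pattern shared by the snippets in this file, assuming the
# singer-python metrics module: job_timer() returns a context manager whose `tags`
# dict is attached to the timing metric emitted when the block exits. The tag names
# and the `do_sync` callable below are illustrative, not taken from any specific tap.
from singer import metrics


def timed_table_sync(catalog_entry, do_sync):
    with metrics.job_timer('sync_table') as timer:
        timer.tags['database'] = catalog_entry.database
        timer.tags['table'] = catalog_entry.table
        do_sync(catalog_entry)
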
def wait_for_export(self, stream_type, export_id):
    # Poll the export status until it enters a finalized state or
    # exceeds the job timeout time.
    with metrics.job_timer('Export {} for {}'.format(export_id, stream_type)):
        timeout_time = pendulum.utcnow().add(seconds=self.job_timeout)
        while pendulum.utcnow() < timeout_time:
            status = self.poll_export(stream_type, export_id)
            singer.log_info("export %s status is %s", export_id, status)

            if status == "Created":
                # If the status is created, the export has been made but
                # not started, so enqueue the export.
                self.enqueue_export(stream_type, export_id)
            elif status in ["Cancelled", "Failed"]:
                # Cancelled and failed exports fail the current sync.
                raise ExportFailed(status)
            elif status == "Completed":
                return True

            time.sleep(self.poll_interval)

        raise ExportFailed("Export timed out after {} minutes".format(self.job_timeout / 60))

def sync_binlog_streams(mysql_conn, binlog_catalog, config, state):
    if binlog_catalog.streams:
        for stream in binlog_catalog.streams:
            write_schema_message(stream)

        with metrics.job_timer('sync_binlog') as timer:
            binlog.sync_binlog_stream(mysql_conn, config, binlog_catalog.streams, state)

async def poll_report(client, account_id, report_name, start_date, end_date, request_id):
    download_url = None
    with metrics.job_timer('generate_report'):
        for i in range(1, MAX_NUM_REPORT_POLLS + 1):
            LOGGER.info('Polling report job {}/{} - {} - from {} to {}'.format(
                i, MAX_NUM_REPORT_POLLS, report_name, start_date, end_date))
            response = client.PollGenerateReport(request_id)

            if response.Status == 'Error':
                LOGGER.warning('Error polling {} for account {} with request id {}'.format(
                    report_name, account_id, request_id))
                return False, None

            if response.Status == 'Success':
                if response.ReportDownloadUrl:
                    download_url = response.ReportDownloadUrl
                else:
                    LOGGER.info('No results for report: {} - from {} to {}'.format(
                        report_name, start_date, end_date))
                break

            if i == MAX_NUM_REPORT_POLLS:
                LOGGER.info('Generating report timed out: {} - from {} to {}'.format(
                    report_name, start_date, end_date))
            else:
                await asyncio.sleep(REPORT_POLL_SLEEP)

    return True, download_url

async def poll_report(client, account_id, report_name, start_date, end_date, request_id):
    # Get download_url of generated report
    download_url = None
    with metrics.job_timer('generate_report'):
        for i in range(1, MAX_NUM_REPORT_POLLS + 1):
            LOGGER.info('Polling report job %s/%s - %s - from %s to %s',
                        i, MAX_NUM_REPORT_POLLS, report_name, start_date, end_date)

            # Backoff does not work directly on an async method, so a separate
            # helper handles the retries.
            response = generate_poll_report(client, request_id)

            if response.Status == 'Error':
                LOGGER.warning('Error polling %s for account %s with request id %s',
                               report_name, account_id, request_id)
                return False, None

            if response.Status == 'Success':
                if response.ReportDownloadUrl:
                    download_url = response.ReportDownloadUrl
                else:
                    LOGGER.info('No results for report: %s - from %s to %s',
                                report_name, start_date, end_date)
                break

            if i == MAX_NUM_REPORT_POLLS:
                LOGGER.info('Generating report timed out: %s - from %s to %s',
                            report_name, start_date, end_date)
            else:
                await asyncio.sleep(REPORT_POLL_SLEEP)

    return True, download_url

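# Hedged usage sketch for the async poll_report variants above: the coroutine has to be
# driven by an event loop from the surrounding synchronous sync code. The caller-side
# variables (client, account_id, report_name, start_date, end_date, request_id) are
# assumed to be defined elsewhere in the tap.
import asyncio

success, download_url = asyncio.run(
    poll_report(client, account_id, report_name, start_date, end_date, request_id))
if success and download_url:
    # download the report and emit its rows here
    pass
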
def sync_stream(mssql_conn, catalog, config, state):
    for catalog_entry in catalog.streams:
        columns = list(catalog_entry.schema.properties.keys())

        if not columns:
            LOGGER.warning('There are no columns selected for stream %s, skipping it.',
                           catalog_entry.stream)
            continue

        state = singer.set_currently_syncing(state, catalog_entry.tap_stream_id)
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        md_map = metadata.to_map(catalog_entry.metadata)
        replication_method = md_map.get((), {}).get('replication-method')
        database_name = common.get_database_name(catalog_entry)

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = database_name
            timer.tags['table'] = catalog_entry.table

            if replication_method == 'INCREMENTAL':
                optional_limit = config.get('incremental_limit')
                # do_sync_incremental(mssql_conn, catalog_entry, state, columns, optional_limit)
            elif replication_method == 'LOG_BASED':
                pass
                # do_sync_historical_binlog(mssql_conn, config, catalog_entry, state, columns)
            elif replication_method == 'FULL_TABLE':
                do_sync_full_table(mssql_conn, config, catalog_entry, state, columns)

def generate_messages(conn, db_schema, catalog, state):
    catalog = resolve.resolve_catalog(discover_catalog(conn, db_schema), catalog, state)

    for catalog_entry in catalog.streams:
        state = singer.set_currently_syncing(state, catalog_entry.tap_stream_id)
        catalog_md = metadata.to_map(catalog_entry.metadata)

        if catalog_md.get((), {}).get('is-view'):
            key_properties = catalog_md.get((), {}).get('view-key-properties')
        else:
            key_properties = catalog_md.get((), {}).get('table-key-properties')

        bookmark_properties = catalog_md.get((), {}).get('replication-key')

        # Emit a state message to indicate that we've started this stream
        yield singer.StateMessage(value=copy.deepcopy(state))

        # Emit a SCHEMA message before we sync any records
        yield singer.SchemaMessage(stream=catalog_entry.stream,
                                   schema=catalog_entry.schema.to_dict(),
                                   key_properties=key_properties,
                                   bookmark_properties=bookmark_properties)

        # Emit a RECORD message for each record in the result set
        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = catalog_entry.database
            timer.tags['table'] = catalog_entry.table
            for message in sync_table(conn, catalog_entry, state):
                yield message

    # If we get here, we've finished processing all the streams, so clear
    # currently_syncing from the state and emit a state message.
    state = singer.set_currently_syncing(state, None)
    yield singer.StateMessage(value=copy.deepcopy(state))

def __iter__(self):
    for params in self.job_params():
        with metrics.job_timer('insights'):
            job = self.run_job(params)

        min_date_start_for_job = None
        count = 0
        for obj in job.get_result():
            count += 1
            rec = obj.export_all_data()
            if not min_date_start_for_job or rec['date_stop'] < min_date_start_for_job:
                min_date_start_for_job = rec['date_stop']
            yield {'record': rec}
        LOGGER.info('Got %d results for insights job', count)

        # when min_date_start_for_job stays None, we should
        # still update the bookmark using 'until' in time_ranges
        if min_date_start_for_job is None:
            for time_range in params['time_ranges']:
                if time_range['until']:
                    min_date_start_for_job = time_range['until']
        yield {'state': advance_bookmark(self, self.bookmark_key,
                                         min_date_start_for_job)}  # pylint: disable=no-member

def poll_report(client, report_name, start_date, end_date, request_id):
    download_url = None
    with metrics.job_timer('generate_report'):
        for i in range(1, MAX_NUM_REPORT_POLLS + 1):
            LOGGER.info('Polling report job {}/{} - {} - from {} to {}'.format(
                i, MAX_NUM_REPORT_POLLS, report_name, start_date, end_date))
            response = client.PollGenerateReport(request_id)

            if response.Status == 'Error':
                raise Exception('Error running {} report'.format(report_name))

            if response.Status == 'Success':
                if response.ReportDownloadUrl:
                    download_url = response.ReportDownloadUrl
                else:
                    LOGGER.info('No results for report: {} - from {} to {}'.format(
                        report_name, start_date, end_date))
                break

            if i == MAX_NUM_REPORT_POLLS:
                LOGGER.info('Generating report timed out: {} - from {} to {}'.format(
                    report_name, start_date, end_date))
            else:
                time.sleep(REPORT_POLL_SLEEP)

    return download_url

def sync_oplog_streams(client, streams, state):
    if streams:
        for stream in streams:
            write_schema_message(stream)

        with metrics.job_timer('sync_oplog') as timer:
            oplog.sync_oplog_stream(client, streams, state)

def __sync_endpoint(req_state):
    # Top level variables
    endpoint_total = 0

    with metrics.job_timer('endpoint_duration'):
        LOGGER.info('{}: STARTED Syncing stream'.format(req_state.stream_name))
        singer_ops.update_currently_syncing(req_state.state, req_state.stream_name)

        # Publish schema to singer
        singer_ops.write_schema(req_state.catalog, req_state.stream_name)

        LOGGER.info('{}: Processing date window, {} to {}'.format(
            req_state.stream_name, req_state.last_date, req_state.end_date))

        if req_state.stream_name in ALL_RECORDS_STREAMS:
            endpoint_total = __process_all_records_data_stream(req_state)
        elif req_state.stream_name == 'periodic_data_standardized':
            endpoint_total = __process_standardized_data_stream(req_state)
        elif req_state.stream_name == 'periodic_data_calculated':
            endpoint_total = __process_periodic_data_calcs(req_state)
        else:
            # data_items, investment_transactions
            endpoint_total = __process_incremental_stream(req_state)

        singer_ops.update_currently_syncing(req_state.state, None)

        LOGGER.info('{}: FINISHED Syncing Stream, total_records: {}'.format(
            req_state.stream_name, endpoint_total))

    LOGGER.info('sync.py: sync complete')
    return endpoint_total

def sync_non_binlog_streams(mssql_conn, non_binlog_catalog, config, state):
    mssql_conn = MSSQLConnection(config)

    for catalog_entry in non_binlog_catalog.streams:
        columns = list(catalog_entry.schema.properties.keys())

        if not columns:
            LOGGER.warning("There are no columns selected for stream %s, skipping it.",
                           catalog_entry.stream)
            continue

        state = singer.set_currently_syncing(state, catalog_entry.tap_stream_id)

        # Emit a state message to indicate that we've started this stream
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        md_map = metadata.to_map(catalog_entry.metadata)
        replication_method = md_map.get((), {}).get("replication-method")
        replication_key = md_map.get((), {}).get("replication-key")
        primary_keys = md_map.get((), {}).get("table-key-properties")
        LOGGER.info(f"Table {catalog_entry.table} proposes {replication_method} sync")

        if replication_method == "INCREMENTAL" and not replication_key:
            LOGGER.info(f"No replication key for {catalog_entry.table}, using full table replication")
            replication_method = "FULL_TABLE"
        if replication_method == "INCREMENTAL" and not primary_keys:
            LOGGER.info(f"No primary key for {catalog_entry.table}, using full table replication")
            replication_method = "FULL_TABLE"
        LOGGER.info(f"Table {catalog_entry.table} will use {replication_method} sync")

        database_name = common.get_database_name(catalog_entry)

        with metrics.job_timer("sync_table") as timer:
            timer.tags["database"] = database_name
            timer.tags["table"] = catalog_entry.table

            if replication_method == "INCREMENTAL":
                LOGGER.info(f"syncing {catalog_entry.table} incrementally")
                do_sync_incremental(mssql_conn, config, catalog_entry, state, columns)
            elif replication_method == "FULL_TABLE":
                LOGGER.info(f"syncing {catalog_entry.table} full table")
                do_sync_full_table(mssql_conn, config, catalog_entry, state, columns)
            else:
                raise Exception("only INCREMENTAL and FULL TABLE replication methods are supported")

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

def do_sync_incremental(con, catalog_entry, state, columns):
    LOGGER.info("Stream %s is using incremental replication", catalog_entry.stream)

    key_properties = get_key_properties(catalog_entry)

    if not catalog_entry.replication_key:
        raise Exception(
            "Cannot use INCREMENTAL replication for table ({}) without a replication key.".format(
                catalog_entry.stream))

    singer.write_schema(catalog_entry.stream,
                        catalog_entry.schema.to_dict(),
                        key_properties,
                        [catalog_entry.replication_key])

    with metrics.job_timer('sync_table') as timer:
        timer.tags['table'] = catalog_entry.table
        sync_incremental.sync_table(con, catalog_entry, state, columns)

def generate_messages(con, catalog, state):
    catalog = resolve_catalog(con, catalog, state)

    for catalog_entry in catalog.streams:
        state = singer.set_currently_syncing(state, catalog_entry.tap_stream_id)

        # Emit a state message to indicate that we've started this stream
        yield singer.StateMessage(value=copy.deepcopy(state))

        md_map = metadata.to_map(catalog_entry.metadata)
        replication_method = md_map.get((), {}).get('replication-method')
        replication_key = singer.get_bookmark(state,
                                              catalog_entry.tap_stream_id,
                                              'replication_key')

        if catalog_entry.is_view:
            key_properties = md_map.get((), {}).get('view-key-properties')
        else:
            key_properties = md_map.get((), {}).get('table-key-properties')

        # Emit a SCHEMA message before we sync any records
        yield singer.SchemaMessage(
            stream=catalog_entry.stream,
            schema=catalog_entry.schema.to_dict(),
            key_properties=key_properties,
            bookmark_properties=replication_key
        )

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = catalog_entry.database
            timer.tags['table'] = catalog_entry.table

            log_engine(con, catalog_entry)

            if replication_method == 'INCREMENTAL':
                for message in incremental.sync_table(con, catalog_entry, state):
                    yield message
            elif replication_method == 'FULL_TABLE':
                for message in full_table.sync_table(con, catalog_entry, state):
                    yield message
            else:
                raise Exception("only INCREMENTAL and FULL TABLE replication methods are supported")

    # if we get here, we've finished processing all the streams, so clear
    # currently_syncing from the state and emit a state message.
    state = singer.set_currently_syncing(state, None)
    yield singer.StateMessage(value=copy.deepcopy(state))

def sync_non_oplog_streams(client, streams, state):
    for stream in streams:
        md_map = metadata.to_map(stream['metadata'])
        stream_metadata = md_map.get(())
        select_clause = stream_metadata.get('custom-select-clause')

        if not select_clause:
            LOGGER.warning('There are no columns selected for stream %s, skipping it.',
                           stream['tap_stream_id'])
            continue

        columns = [c.strip(' ') for c in select_clause.split(',')]
        columns.append('_id')

        state = singer.set_currently_syncing(state, stream['tap_stream_id'])

        # Emit a state message to indicate that we've started this stream
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        replication_method = stream_metadata.get('replication-method')
        database_name = get_database_name(stream)

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = database_name
            timer.tags['table'] = stream['table_name']

            if replication_method == 'LOG_BASED':
                do_sync_historical_oplog(client, stream, state, columns)
            elif replication_method == 'FULL_TABLE':
                write_schema_message(stream)
                stream_version = common.get_stream_version(stream['tap_stream_id'], state)
                full_table.sync_table(client, stream, state, stream_version, columns)

                state = singer.write_bookmark(state,
                                              stream['tap_stream_id'],
                                              'initial_full_table_complete',
                                              True)
            else:
                raise Exception(
                    f"only LOG_BASED and FULL TABLE replication methods are supported "
                    f"(you passed {replication_method})")

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

def sync_non_binlog_streams(mysql_conn, non_binlog_catalog, config, state, original_state_file=''):
    for catalog_entry in non_binlog_catalog.streams:
        columns = list(catalog_entry.schema.properties.keys())

        if not columns:
            LOGGER.warning('There are no columns selected for stream %s, skipping it.',
                           catalog_entry.stream)
            continue

        state = singer.set_currently_syncing(state, catalog_entry.tap_stream_id)

        # Emit a state message to indicate that we've started this stream
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        md_map = metadata.to_map(catalog_entry.metadata)
        replication_method = md_map.get((), {}).get('replication-method')
        database_name = common.get_database_name(catalog_entry)

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = database_name
            timer.tags['table'] = catalog_entry.table

            log_engine(mysql_conn, catalog_entry)

            if replication_method == 'INCREMENTAL':
                do_sync_incremental(mysql_conn, catalog_entry, state, columns, original_state_file)
            elif replication_method == 'LOG_BASED':
                do_sync_historical_binlog(mysql_conn, config, catalog_entry, state, columns)
            elif replication_method == 'FULL_TABLE':
                do_sync_full_table(mysql_conn, config, catalog_entry, state, columns)
            else:
                raise Exception(
                    "only INCREMENTAL, LOG_BASED, and FULL TABLE replication methods are supported")

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

def sync_traditional_stream(client: MongoClient, stream: Dict, state: Dict):
    """
    Sync given stream
    Args:
        client: MongoDB client instance
        stream: stream to sync
        state: state
    """
    tap_stream_id = stream['tap_stream_id']

    common.COUNTS[tap_stream_id] = 0
    common.TIMES[tap_stream_id] = 0
    common.SCHEMA_COUNT[tap_stream_id] = 0
    common.SCHEMA_TIMES[tap_stream_id] = 0

    md_map = metadata.to_map(stream['metadata'])
    replication_method = metadata.get(md_map, (), 'replication-method')

    if replication_method not in {INCREMENTAL_METHOD, FULL_TABLE_METHOD}:
        raise InvalidReplicationMethodException(
            replication_method,
            'replication method needs to be either FULL_TABLE or INCREMENTAL')

    database_name = metadata.get(md_map, (), 'database-name')

    # Emit a state message to indicate that we've started this stream
    state = clear_state_on_replication_change(stream, state)
    state = singer.set_currently_syncing(state, stream['tap_stream_id'])
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    write_schema_message(stream)
    common.SCHEMA_COUNT[tap_stream_id] += 1

    with metrics.job_timer('sync_table') as timer:
        timer.tags['database'] = database_name
        timer.tags['table'] = stream['table_name']

        collection = client[database_name][stream["table_name"]]

        if replication_method == 'FULL_TABLE':
            full_table.sync_collection(collection, stream, state)
        else:
            incremental.sync_collection(collection, stream, state)

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

def sync_streams(snowflake_conn, catalog, state):
    for catalog_entry in catalog.streams:
        columns = list(catalog_entry.schema.properties.keys())

        if not columns:
            LOGGER.warning('There are no columns selected for stream %s, skipping it.',
                           catalog_entry.stream)
            continue

        state = singer.set_currently_syncing(state, catalog_entry.tap_stream_id)

        # Emit a state message to indicate that we've started this stream
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        md_map = metadata.to_map(catalog_entry.metadata)
        replication_method = md_map.get((), {}).get('replication-method', "FULL_TABLE")

        database_name = common.get_database_name(catalog_entry)
        schema_name = common.get_schema_name(catalog_entry)

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = database_name
            timer.tags['table'] = catalog_entry.table

            LOGGER.info('Beginning to sync %s.%s.%s',
                        database_name, schema_name, catalog_entry.table)

            if replication_method == 'INCREMENTAL':
                do_sync_incremental(snowflake_conn, catalog_entry, state, columns)
            elif replication_method == 'FULL_TABLE':
                do_sync_full_table(snowflake_conn, catalog_entry, state, columns)
            else:
                raise Exception('Only INCREMENTAL and FULL TABLE replication methods are supported')

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

def sync_log_based_streams(client: MongoClient,
                           log_based_streams: List[Dict],
                           database_name: str,
                           state: Dict,
                           update_buffer_size: Optional[int],
                           await_time_ms: Optional[int]):
    """
    Sync log_based streams all at once by listening on database-level change stream events.
    Args:
        client: MongoDB client instance
        log_based_streams: list of streams to sync
        database_name: name of the database to sync from
        state: state dictionary
        update_buffer_size: the size of the buffer used to hold detected updates
        await_time_ms: the maximum time in milliseconds the log-based sync waits for changes before exiting
    """
    if not log_based_streams:
        return

    streams = streams_list_to_dict(log_based_streams)

    for tap_stream_id, stream in streams.items():
        common.COUNTS[tap_stream_id] = 0
        common.TIMES[tap_stream_id] = 0
        common.SCHEMA_COUNT[tap_stream_id] = 0
        common.SCHEMA_TIMES[tap_stream_id] = 0

        state = clear_state_on_replication_change(stream, state)
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        write_schema_message(stream)
        common.SCHEMA_COUNT[tap_stream_id] += 1

    with metrics.job_timer('sync_table') as timer:
        timer.tags['database'] = database_name

        update_buffer_size = update_buffer_size or change_streams.MIN_UPDATE_BUFFER_LENGTH
        await_time_ms = await_time_ms or change_streams.DEFAULT_AWAIT_TIME_MS

        change_streams.sync_database(client[database_name], streams, state,
                                     update_buffer_size, await_time_ms)

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

def generate_messages(conn, catalog, state):
    for catalog_entry in catalog.streams:
        # Emit a state message to indicate that we've started this stream
        yield singer.StateMessage(value=copy.deepcopy(state))

        # Emit a SCHEMA message before we sync any records
        yield singer.SchemaMessage(
            stream=catalog_entry.stream,
            schema=catalog_entry.schema.to_dict(),
            key_properties=catalog_entry.key_properties
        )

        # Emit a RECORD message for each record in the result set
        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = catalog_entry.database
            timer.tags['table'] = catalog_entry.table
            for message in sync_table(conn, catalog_entry, state):
                yield message

        # Emit a state message
        yield singer.StateMessage(value=copy.deepcopy(state))

def generate_messages(conn, catalog, state):
    for catalog_entry in catalog.streams:
        if not catalog_entry.is_selected():
            continue

        # Emit a SCHEMA message before we sync any records
        yield singer.SchemaMessage(
            stream=catalog_entry.stream,
            schema=catalog_entry.schema.to_dict(),
            key_properties=catalog_entry.key_properties,
            bookmark_properties=[REPLICATION_KEY]
        )

        # Emit a RECORD message for each record in the result set
        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = catalog_entry.database
            timer.tags['table'] = catalog_entry.table
            for message in sync_table(conn, catalog_entry, state):
                yield message

        # Emit a state message
        yield singer.StateMessage(value=copy.deepcopy(state))

def sync_non_oplog_streams(client, streams, state):
    for stream in streams:
        md_map = metadata.to_map(stream['metadata'])
        stream_metadata = md_map.get(())

        blacklisted_fields = stream_metadata.get('blacklisted-fields')
        blacklist = [c.strip(' ') for c in blacklisted_fields.split(',')]

        state = singer.set_currently_syncing(state, stream['tap_stream_id'])

        # Emit a state message to indicate that we've started this stream
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        replication_method = stream_metadata.get('replication-method')
        database_name = get_database_name(stream)

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = database_name
            timer.tags['table'] = stream['table_name']

            if replication_method == 'LOG_BASED':
                do_sync_historical_oplog(client, stream, state, blacklist)
            elif replication_method == 'FULL_TABLE':
                write_schema_message(stream)
                stream_version = common.get_stream_version(stream['tap_stream_id'], state)
                full_table.sync_table(client, stream, state, stream_version, blacklist)
            else:
                raise Exception(
                    f"only LOG_BASED and FULL TABLE replication methods are supported "
                    f"(you passed {replication_method})")

            state = singer.write_bookmark(state,
                                          stream['tap_stream_id'],
                                          'initial_full_table_complete',
                                          True)

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

def generate_messages(con, config, catalog, state):
    catalog = resolve_catalog(con, catalog, state)

    for catalog_entry in catalog.streams:
        columns = list(catalog_entry.schema.properties.keys())

        if not columns:
            LOGGER.warning('There are no columns selected for stream %s, skipping it.',
                           catalog_entry.stream)
            continue

        state = singer.set_currently_syncing(state, catalog_entry.tap_stream_id)

        # Emit a state message to indicate that we've started this stream
        yield singer.StateMessage(value=copy.deepcopy(state))

        md_map = metadata.to_map(catalog_entry.metadata)
        replication_method = md_map.get((), {}).get('replication-method')
        replication_key = md_map.get((), {}).get('replication-key')

        if catalog_entry.is_view:
            key_properties = md_map.get((), {}).get('view-key-properties')
        else:
            key_properties = md_map.get((), {}).get('table-key-properties')

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = catalog_entry.database
            timer.tags['table'] = catalog_entry.table

            log_engine(con, catalog_entry)

            if replication_method == 'INCREMENTAL':
                LOGGER.info("Stream %s is using incremental replication", catalog_entry.stream)

                yield generate_schema_message(catalog_entry, key_properties, [replication_key])

                for message in incremental.sync_table(con, catalog_entry, state, columns):
                    yield message
            elif replication_method == 'LOG_BASED':
                if catalog_entry.is_view:
                    raise Exception(
                        "Unable to replicate stream({}) with binlog because it is a view.".format(
                            catalog_entry.stream))

                LOGGER.info("Stream %s is using binlog replication", catalog_entry.stream)

                log_file = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'log_file')
                log_pos = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'log_pos')

                yield generate_schema_message(catalog_entry, key_properties, [])

                if log_file and log_pos:
                    columns = binlog.add_automatic_properties(catalog_entry, columns)

                    for message in binlog.sync_table(con, config, catalog_entry, state, columns):
                        yield message
                else:
                    LOGGER.info("Performing initial full table sync")

                    log_file, log_pos = binlog.fetch_current_log_file_and_pos(con)

                    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'version',
                                                  stream_version)

                    for message in full_table.sync_table(con, catalog_entry, state,
                                                         columns, stream_version):
                        yield message

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'log_file',
                                                  log_file)
                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'log_pos',
                                                  log_pos)

                    yield singer.StateMessage(value=copy.deepcopy(state))
            elif replication_method == 'FULL_TABLE':
                LOGGER.info("Stream %s is using full table replication", catalog_entry.stream)

                yield generate_schema_message(catalog_entry, key_properties, [])

                stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)

                for message in full_table.sync_table(con, catalog_entry, state,
                                                     columns, stream_version):
                    yield message

                # Prefer initial_full_table_complete going forward
                singer.clear_bookmark(state, catalog_entry.tap_stream_id, 'version')

                state = singer.write_bookmark(state,
                                              catalog_entry.tap_stream_id,
                                              'initial_full_table_complete',
                                              True)

                yield singer.StateMessage(value=copy.deepcopy(state))
            else:
                raise Exception(
                    "only INCREMENTAL, LOG_BASED, and FULL TABLE replication methods are supported")

    # if we get here, we've finished processing all the streams, so clear
    # currently_syncing from the state and emit a state message.
    state = singer.set_currently_syncing(state, None)
    yield singer.StateMessage(value=copy.deepcopy(state))

def sync_bulk_obj(client, catalog, state, start_date, stream_name, bulk_page_size, activity_type=None):
    LOGGER.info('{} - Starting export'.format(stream_name))

    stream = catalog.get_stream(stream_name)

    if activity_type:
        updated_at_field = 'CreatedAt'
    else:
        updated_at_field = 'UpdatedAt'

    last_bookmark = get_bulk_bookmark(state, stream_name)
    last_date_raw = last_bookmark.get('datetime', start_date)
    last_date = pendulum.parse(last_date_raw).to_datetime_string()
    last_sync_id = last_bookmark.get('sync_id')
    last_offset = last_bookmark.get('offset')

    if last_sync_id:
        LOGGER.info('{} - Resuming previous export: {}'.format(stream_name, last_sync_id))
        try:
            last_date = stream_export(client,
                                      state,
                                      catalog,
                                      stream_name,
                                      last_sync_id,
                                      updated_at_field,
                                      bulk_page_size,
                                      last_date,
                                      offset=last_offset)
        except HTTPError as e:
            if e.response.status_code in [404, 410]:
                LOGGER.info('{} - Previous export expired: {}'.format(stream_name, last_sync_id))
            else:
                raise

    fields = {}
    obj_meta = None
    for meta in stream.metadata:
        if not meta['breadcrumb']:
            obj_meta = meta['metadata']
        elif meta['metadata'].get('selected', True) or \
                meta['metadata'].get('inclusion', 'available') == 'automatic':
            field_name = meta['breadcrumb'][1]
            fields[field_name] = meta['metadata']['tap-eloqua.statement']

    num_fields = len(fields.values())
    if num_fields > 250:
        LOGGER.error('{} - Exports can only have 250 fields selected. {} are selected.'.format(
            stream_name, num_fields))
    else:
        LOGGER.info('{} - Syncing {} fields'.format(stream_name, num_fields))

    language_obj = obj_meta['tap-eloqua.query-language-name']

    _filter = "'{{" + language_obj + "." + updated_at_field + "}}' >= '" + last_date + "'"

    if activity_type is not None:
        _filter += " AND '{{Activity.Type}}' = '" + activity_type + "'"

    params = {
        'name': 'Singer Sync - ' + datetime.utcnow().isoformat(),
        'fields': fields,
        'filter': _filter,
        'areSystemTimestampsInUTC': True
    }

    if activity_type:
        url_obj = 'activities'
    elif obj_meta['tap-eloqua.id']:
        url_obj = 'customObjects/' + obj_meta['tap-eloqua.id']
    else:
        url_obj = stream_name

    with metrics.job_timer('bulk_export'):
        data = client.post('/api/bulk/2.0/{}/exports'.format(url_obj),
                           json=params,
                           endpoint='export_create_def')

        data = client.post('/api/bulk/2.0/syncs',
                           json={'syncedInstanceUri': data['uri']},
                           endpoint='export_create_sync')

        sync_id = re.match(r'/syncs/([0-9]+)', data['uri']).groups()[0]

        LOGGER.info('{} - Created export - {}'.format(stream_name, sync_id))

        sleep = 0
        start_time = time.time()
        while True:
            data = client.get('/api/bulk/2.0/syncs/{}'.format(sync_id),
                              endpoint='export_sync_poll')

            status = data['status']

            if status == 'success':
                break
            elif status not in ['pending', 'active']:
                message = '{} - status: {}, exporting failed'.format(stream_name, status)
                LOGGER.error(message)
                raise Exception(message)
            elif (time.time() - start_time) > MAX_RETRY_ELAPSED_TIME:
                message = '{} - export deadline exceeded ({} secs)'.format(
                    stream_name, MAX_RETRY_ELAPSED_TIME)
                LOGGER.error(message)
                raise Exception(message)

            sleep = next_sleep_interval(sleep)
            LOGGER.info('{} - status: {}, sleeping for {} seconds'.format(
                stream_name, status, sleep))
            time.sleep(sleep)

    stream_export(client,
                  state,
                  catalog,
                  stream_name,
                  sync_id,
                  updated_at_field,
                  bulk_page_size,
                  last_date)

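# next_sleep_interval is referenced above but not shown in this excerpt. A plausible
# sketch (an assumption, not the tap's verified implementation) is capped exponential
# backoff with jitter, starting from the initial sleep of 0; the MIN_SLEEP and
# MAX_SLEEP constants here are illustrative.
import random

MIN_SLEEP = 15
MAX_SLEEP = 300


def next_sleep_interval(previous_sleep):
    # On the first call previous_sleep is 0, so fall back to the minimum interval.
    min_interval = previous_sleep or MIN_SLEEP
    max_interval = previous_sleep * 2 or MIN_SLEEP
    return min(MAX_SLEEP, random.randint(min_interval, max_interval))
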
def write_batch_helper(self, connection, root_table_name, schema, key_properties, records, metadata):
    """
    Write all `table_batch`s associated with the given `schema` and `records` to remote.

    :param connection: remote connection, type left to be determined by implementing class
    :param root_table_name: string
    :param schema: SingerStreamSchema
    :param key_properties: [string, ...]
    :param records: [{...}, ...]
    :param metadata: additional metadata needed by implementing class
    :return: {'records_persisted': int, 'rows_persisted': int}
    """
    with self._set_timer_tags(metrics.job_timer(), 'batch', (root_table_name,)):
        with self._set_counter_tags(metrics.record_counter(None),
                                    'batch_rows_persisted',
                                    (root_table_name,)) as batch_counter:
            self.LOGGER.info('Writing batch with {} records for `{}` with `key_properties`: `{}`'.format(
                len(records),
                root_table_name,
                key_properties
            ))

            for table_batch in denest.to_table_batches(schema, key_properties, records):
                table_batch['streamed_schema']['path'] = (root_table_name,) + \
                                                         table_batch['streamed_schema']['path']

                with self._set_timer_tags(metrics.job_timer(),
                                          'table',
                                          table_batch['streamed_schema']['path']) as table_batch_timer:
                    with self._set_counter_tags(metrics.record_counter(None),
                                                'table_rows_persisted',
                                                table_batch['streamed_schema']['path']) as table_batch_counter:
                        self.LOGGER.info('Writing table batch schema for `{}`...'.format(
                            table_batch['streamed_schema']['path']
                        ))

                        remote_schema = self.upsert_table_helper(connection,
                                                                 table_batch['streamed_schema'],
                                                                 metadata)

                        self._set_metrics_tags__table(table_batch_timer, remote_schema['name'])
                        self._set_metrics_tags__table(table_batch_counter, remote_schema['name'])

                        self.LOGGER.info('Writing table batch with {} rows for `{}`...'.format(
                            len(table_batch['records']),
                            table_batch['streamed_schema']['path']
                        ))

                        batch_rows_persisted = self.write_table_batch(
                            connection,
                            {'remote_schema': remote_schema,
                             'records': self._serialize_table_records(remote_schema,
                                                                      table_batch['streamed_schema'],
                                                                      table_batch['records'])},
                            metadata)

                        table_batch_counter.increment(batch_rows_persisted)
                        batch_counter.increment(batch_rows_persisted)

            return {
                'records_persisted': len(records),
                'rows_persisted': batch_counter.value
            }

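# _set_timer_tags and _set_counter_tags are used above but not defined in this excerpt.
# A minimal sketch of what they plausibly do (an assumption, not the target's verified
# implementation): stamp the metric with a job/count type and table path, then hand the
# metric back so it can still be used as a context manager. They are methods on the
# class above; shown here as plain functions for brevity.
def _set_timer_tags(metric, job_type, path):
    metric.tags['job_type'] = job_type
    metric.tags['path'] = path
    return metric


def _set_counter_tags(counter, count_type, path):
    counter.tags['count_type'] = count_type
    counter.tags['path'] = path
    return counter
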
def upsert_table_helper(self, connection, schema, metadata, log_schema_changes=True):
    """
    Upserts the `schema` to remote by:
    - creating table if necessary
    - adding columns
    - adding column mappings
    - migrating data from old columns to new, etc.

    :param connection: remote connection, type left to be determined by implementing class
    :param schema: TABLE_SCHEMA(local)
    :param metadata: additional information necessary for downstream operations
    :param log_schema_changes: defaults to True, set to False to disable logging of table level schema changes
    :return: TABLE_SCHEMA(remote)
    """
    table_path = schema['path']

    with self._set_timer_tags(metrics.job_timer(), 'upsert_table_schema', table_path) as timer:
        _metadata = deepcopy(metadata)
        _metadata['schema_version'] = CURRENT_SCHEMA_VERSION

        table_name = self.add_table_mapping(connection, table_path, _metadata)

        self._set_metrics_tags__table(timer, table_name)

        existing_schema = self._get_table_schema(connection, table_name)

        existing_table = True
        if existing_schema is None:
            self.add_table(connection, table_path, table_name, _metadata)
            existing_schema = self._get_table_schema(connection, table_name)
            existing_table = False

        self.add_key_properties(connection, table_name, schema.get('key_properties', None))

        ## Build up mappings to compare new columns against existing
        mappings = []

        for to, m in existing_schema.get('mappings', {}).items():
            mapping = json_schema.simple_type(m)
            mapping['from'] = tuple(m['from'])
            mapping['to'] = to
            mappings.append(mapping)

        ## Only process columns which have single, nullable, types
        column_paths_seen = set()
        single_type_columns = []

        for column_path, column_schema in schema['schema']['properties'].items():
            column_paths_seen.add(column_path)
            for sub_schema in column_schema['anyOf']:
                single_type_columns.append((column_path, deepcopy(sub_schema)))

        ### Add any columns missing from new schema
        for m in mappings:
            if not m['from'] in column_paths_seen:
                single_type_columns.append((m['from'], json_schema.make_nullable(m)))

        ## Process new columns against existing
        table_empty = self.is_table_empty(connection, table_name)

        for column_path, column_schema in single_type_columns:
            upsert_table_helper__start__column = time.monotonic()

            canonicalized_column_name = self._canonicalize_column_identifier(column_path,
                                                                             column_schema,
                                                                             mappings)
            nullable_column_schema = json_schema.make_nullable(column_schema)

            def log_message(msg):
                if log_schema_changes:
                    self.LOGGER.info(
                        'Table Schema Change [`{}`.`{}`:`{}`] {} (took {} millis)'.format(
                            table_name,
                            column_path,
                            canonicalized_column_name,
                            msg,
                            _duration_millis(upsert_table_helper__start__column)))

            ## NEW COLUMN
            if not column_path in [m['from'] for m in mappings]:
                upsert_table_helper__column = "New column"

                ### NON EMPTY TABLE
                if not table_empty:
                    upsert_table_helper__column += ", non empty table"
                    self.LOGGER.warning(
                        'NOT EMPTY: Forcing new column `{}` in table `{}` to be nullable due to table not empty.'.format(
                            column_path, table_name))
                    column_schema = nullable_column_schema

                self.add_column(connection, table_name, canonicalized_column_name, column_schema)
                self.add_column_mapping(connection, table_name, column_path,
                                        canonicalized_column_name, column_schema)

                mapping = json_schema.simple_type(column_schema)
                mapping['from'] = column_path
                mapping['to'] = canonicalized_column_name
                mappings.append(mapping)

                log_message(upsert_table_helper__column)

                continue

            ## EXISTING COLUMNS
            ### SCHEMAS MATCH
            if [True for m in mappings if m['from'] == column_path
                    and self.json_schema_to_sql_type(m) == self.json_schema_to_sql_type(column_schema)]:
                continue
            ### NULLABLE SCHEMAS MATCH
            ### New column _is not_ nullable, existing column _is_
            if [True for m in mappings if m['from'] == column_path
                    and self.json_schema_to_sql_type(m) == self.json_schema_to_sql_type(nullable_column_schema)]:
                continue

            ### NULL COMPATIBILITY
            ### New column _is_ nullable, existing column is _not_
            non_null_original_column = [m for m in mappings
                                        if m['from'] == column_path
                                        and json_schema.shorthand(m) == json_schema.shorthand(column_schema)]
            if non_null_original_column:
                ## MAKE NULLABLE
                self.make_column_nullable(connection, table_name, canonicalized_column_name)
                self.drop_column_mapping(connection, table_name, canonicalized_column_name)
                self.add_column_mapping(connection, table_name, column_path,
                                        canonicalized_column_name, nullable_column_schema)

                mappings = [m for m in mappings
                            if not (m['from'] == column_path
                                    and json_schema.shorthand(m) == json_schema.shorthand(column_schema))]

                mapping = json_schema.simple_type(nullable_column_schema)
                mapping['from'] = column_path
                mapping['to'] = canonicalized_column_name
                mappings.append(mapping)

                log_message("Made existing column nullable.")

                continue

            ### FIRST MULTI TYPE
            ### New column matches existing column path, but the types are incompatible
            duplicate_paths = [m for m in mappings if m['from'] == column_path]

            if 1 == len(duplicate_paths):
                existing_mapping = duplicate_paths[0]
                existing_column_name = existing_mapping['to']

                if existing_column_name:
                    self.drop_column_mapping(connection, table_name, existing_column_name)

                ## Update existing properties
                mappings = [m for m in mappings if m['from'] != column_path]

                mapping = json_schema.simple_type(nullable_column_schema)
                mapping['from'] = column_path
                mapping['to'] = canonicalized_column_name
                mappings.append(mapping)

                existing_column_new_normalized_name = self._canonicalize_column_identifier(column_path,
                                                                                           existing_mapping,
                                                                                           mappings)

                mapping = json_schema.simple_type(json_schema.make_nullable(existing_mapping))
                mapping['from'] = column_path
                mapping['to'] = existing_column_new_normalized_name
                mappings.append(mapping)

                ## Add new columns
                ### NOTE: all migrated columns will be nullable and remain that way

                #### Table Metadata
                self.add_column_mapping(connection, table_name, column_path,
                                        existing_column_new_normalized_name,
                                        json_schema.make_nullable(existing_mapping))
                self.add_column_mapping(connection, table_name, column_path,
                                        canonicalized_column_name,
                                        nullable_column_schema)

                #### Columns
                self.add_column(connection, table_name,
                                existing_column_new_normalized_name,
                                json_schema.make_nullable(existing_mapping))
                self.add_column(connection, table_name,
                                canonicalized_column_name,
                                nullable_column_schema)

                ## Migrate existing data
                self.migrate_column(connection, table_name,
                                    existing_mapping['to'],
                                    existing_column_new_normalized_name)

                ## Drop existing column
                self.drop_column(connection, table_name, existing_mapping['to'])

                upsert_table_helper__column = "Splitting `{}` into `{}` and `{}`. New column matches existing column path, but the types are incompatible.".format(
                    existing_column_name,
                    existing_column_new_normalized_name,
                    canonicalized_column_name
                )

            ## REST MULTI TYPE
            elif 1 < len(duplicate_paths):
                ## Add new column
                self.add_column_mapping(connection, table_name, column_path,
                                        canonicalized_column_name,
                                        nullable_column_schema)
                self.add_column(connection, table_name,
                                canonicalized_column_name,
                                nullable_column_schema)

                mapping = json_schema.simple_type(nullable_column_schema)
                mapping['from'] = column_path
                mapping['to'] = canonicalized_column_name
                mappings.append(mapping)

                upsert_table_helper__column = "Adding new column to split column `{}`. New column matches existing column's path, but no types were compatible.".format(
                    column_path
                )

            ## UNKNOWN
            else:
                raise Exception(
                    'UNKNOWN: Cannot handle merging column `{}` (canonicalized as: `{}`) in table `{}`.'.format(
                        column_path,
                        canonicalized_column_name,
                        table_name
                    ))

            log_message(upsert_table_helper__column)

        if not existing_table:
            for column_names in self.new_table_indexes(schema):
                self.add_index(connection, table_name, column_names)

        return self._get_table_schema(connection, table_name)

def sync_bulk_obj(client, catalog, state, start_date, stream_name, bulk_page_size,
                  activity_type=None, end_date=None):
    LOGGER.info('{} - Starting export'.format(stream_name))

    stream = catalog.get_stream(stream_name)

    if activity_type:
        updated_at_field_name = 'ActivityDate'
        updated_at_field = 'CreatedAt'
    else:
        updated_at_field = 'UpdatedAt'
        updated_at_field_name = updated_at_field

    last_bookmark = get_bulk_bookmark(state, stream_name)
    last_date_raw = last_bookmark.get('datetime', start_date)
    last_date = pendulum.parse(last_date_raw).to_datetime_string()
    last_sync_id = last_bookmark.get('sync_id')
    last_offset = last_bookmark.get('offset')

    if last_sync_id:
        LOGGER.info('{} - Resuming previous export: {}'.format(stream_name, last_sync_id))
        try:
            last_date = stream_export(client,
                                      state,
                                      catalog,
                                      stream_name,
                                      last_sync_id,
                                      updated_at_field_name,
                                      bulk_page_size,
                                      last_date,
                                      offset=last_offset)
        except HTTPError as e:
            if e.response.status_code in [404, 410]:
                LOGGER.info('{} - Previous export expired: {}'.format(stream_name, last_sync_id))
            else:
                raise

    fields = {}
    obj_meta = None
    for meta in stream.metadata:
        if not meta['breadcrumb']:
            obj_meta = meta['metadata']
        elif meta['metadata'].get('selected', True) or \
                meta['metadata'].get('inclusion', 'available') == 'automatic':
            field_name = meta['breadcrumb'][1]
            fields[field_name] = meta['metadata']['tap-eloqua.statement']

    num_fields = len(fields.values())
    if num_fields > 250:
        raise Exception('{} - Exports can only have 250 fields selected. {} are selected.'.format(
            stream_name, num_fields))
    else:
        LOGGER.info('{} - Syncing {} fields'.format(stream_name, num_fields))

    language_obj = obj_meta['tap-eloqua.query-language-name']

    _filter = "'{{" + language_obj + "." + updated_at_field + "}}' >= '" + last_date + "'"

    if end_date:
        _filter += " AND '{{" + language_obj + "." + updated_at_field + "}}' < '" + end_date.to_datetime_string() + "'"

    if activity_type is not None:
        _filter += " AND '{{Activity.Type}}' = '" + activity_type + "'"

    # NB: We observed shuffled data when Activity.CreatedAt was specified twice in the query.
    # The key 'CreatedAt' is synthetic, so add it in after the export. (PR #19)
    fields.pop('CreatedAt', None)

    params = {
        'name': 'Singer Sync - ' + datetime.utcnow().isoformat(),
        'fields': fields,
        'filter': _filter,
        'areSystemTimestampsInUTC': True,
        'autoDeleteDuration': 'P3D',
        'dataRetentionDuration': 'P3D'  # 3 days in ISO-8601 duration notation; this can be increased up to 14 days
    }

    if activity_type:
        url_obj = 'activities'
    elif obj_meta['tap-eloqua.id']:
        url_obj = 'customObjects/' + obj_meta['tap-eloqua.id']
    else:
        url_obj = stream_name

    with metrics.job_timer('bulk_export'):
        log_string = "{} - Creating bulk export from {}".format(stream_name, last_date)
        if end_date:
            log_string += " to {}".format(end_date.to_datetime_string())
        LOGGER.info(log_string)

        data = client.post('/api/bulk/2.0/{}/exports'.format(url_obj),
                           json=params,
                           endpoint='export_create_def')

        data = client.post('/api/bulk/2.0/syncs',
                           json={'syncedInstanceUri': data['uri']},
                           endpoint='export_create_sync')

        sync_id = re.match(r'/syncs/([0-9]+)', data['uri']).groups()[0]

        LOGGER.info('{} - Created export - {}'.format(stream_name, sync_id))

        write_bulk_bookmark(state, stream_name, sync_id, 0, last_date_raw)

        sleep = 0
        start_time = time.time()
        while True:
            data = client.get('/api/bulk/2.0/syncs/{}'.format(sync_id),
                              endpoint='export_sync_poll')

            status = data['status']

            if status == 'success':
                break
            elif status not in ['pending', 'active']:
                message = '{} - status: {}, exporting failed'.format(stream_name, status)
                LOGGER.error(message)
                raise Exception(message)
            elif (time.time() - start_time) > MAX_RETRY_ELAPSED_TIME:
                message = '{} - export deadline exceeded ({} secs)'.format(
                    stream_name, MAX_RETRY_ELAPSED_TIME)
                LOGGER.error(message)
                raise Exception(message)

            sleep = next_sleep_interval(sleep)
            LOGGER.info('{} - status: {}, sleeping for {} seconds'.format(
                stream_name, status, sleep))
            time.sleep(sleep)

        # Check record count
        data = client.get('/api/bulk/2.0/syncs/{}/logs'.format(sync_id),
                          endpoint='export_sync_poll')

        success_message = "Successfully exported members to csv file."
        export_success_log = next(
            (i for i in data.get("items", []) if i.get("message") == success_message), None)

        if export_success_log:
            record_count = export_success_log["count"]
            LOGGER.info("Sync id {} contains {} records.".format(sync_id, record_count))
            if activity_type and record_count >= 5000000:
                raise ActivityExportTooLarge("Export too large, retrying with smaller window.")

    stream_export(client,
                  state,
                  catalog,
                  stream_name,
                  sync_id,
                  updated_at_field_name,
                  bulk_page_size,
                  last_date,
                  activity_type=activity_type)

def sync_stream(client, stream, state):
    tap_stream_id = stream['tap_stream_id']

    common.COUNTS[tap_stream_id] = 0
    common.TIMES[tap_stream_id] = 0
    common.SCHEMA_COUNT[tap_stream_id] = 0
    common.SCHEMA_TIMES[tap_stream_id] = 0

    md_map = metadata.to_map(stream['metadata'])
    replication_method = metadata.get(md_map, (), 'replication-method')
    database_name = metadata.get(md_map, (), 'database-name')

    stream_projection = load_stream_projection(stream)

    # Emit a state message to indicate that we've started this stream
    state = clear_state_on_replication_change(stream, state)
    state = singer.set_currently_syncing(state, stream['tap_stream_id'])
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    write_schema_message(stream)
    common.SCHEMA_COUNT[tap_stream_id] += 1

    with metrics.job_timer('sync_table') as timer:
        timer.tags['database'] = database_name
        timer.tags['table'] = stream['table_name']

        if replication_method == 'LOG_BASED':
            if oplog.oplog_has_aged_out(client, state, tap_stream_id):
                # remove all state for stream
                # then it will do a full sync and start oplog again.
                LOGGER.info("Clearing state because Oplog has aged out")
                state.get('bookmarks', {}).pop(tap_stream_id)

            # make sure initial full table sync has been completed
            if not singer.get_bookmark(state, tap_stream_id, 'initial_full_table_complete'):
                msg = 'Must complete full table sync before starting oplog replication for %s'
                LOGGER.info(msg, tap_stream_id)

                # only mark current ts in oplog on first sync so tap has a
                # starting point after the full table sync
                if singer.get_bookmark(state, tap_stream_id, 'version') is None:
                    collection_oplog_ts = oplog.get_latest_ts(client)
                    oplog.update_bookmarks(state, tap_stream_id, collection_oplog_ts)

                full_table.sync_collection(client, stream, state, stream_projection)

            oplog.sync_collection(client, stream, state, stream_projection)
        elif replication_method == 'FULL_TABLE':
            full_table.sync_collection(client, stream, state, stream_projection)
        elif replication_method == 'INCREMENTAL':
            incremental.sync_collection(client, stream, state, stream_projection)
        else:
            raise Exception(
                "only FULL_TABLE, LOG_BASED, and INCREMENTAL replication methods are supported "
                "(you passed {})".format(replication_method))

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))