def load_table_cache(config):
    table_cache = []
    if not ("disable_table_cache" in config and config["disable_table_cache"]):
        LOGGER.info("Getting catalog objects from table cache...")

        db = DbSync(config)
        table_cache = db.get_table_columns(
            filter_schemas=get_schema_names_from_config(config))

    return table_cache
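# ---------------------------------------------------------------------------
# Hedged example (not part of the original module): a config dict covering the
# keys that load_table_cache and persist_lines actually read. The values are
# illustrative only, and the connection settings required by DbSync are
# omitted on purpose.
# ---------------------------------------------------------------------------
EXAMPLE_CONFIG = {
    "disable_table_cache": False,   # skip the catalog pre-fetch when True
    "batch_size_rows": 100000,      # per-stream flush threshold in persist_lines
    "validate_records": False,      # JSON Schema validation of every RECORD
    "primary_key_required": True,   # reject streams without key_properties
    "add_metadata_columns": False,  # add metadata values to loaded records
    "hard_delete": False,           # also forces metadata columns
    "flush_all_streams": False,     # flush every stream when any batch fills
}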
def persist_lines(config, lines) -> None:
    state = None
    flushed_state = None
    schemas = {}
    key_properties = {}
    validators = {}
    records_to_load = {}
    csv_files_to_load = {}
    row_count = {}
    stream_to_sync = {}
    total_row_count = {}
    table_columns_cache = None
    batch_size_rows = config.get('batch_size_rows', DEFAULT_BATCH_SIZE_ROWS)

    # Cache the available schemas, tables and columns from Redshift if not disabled in config.
    # The cache is used later to avoid lots of small queries hitting Redshift.
    if not ('disable_table_cache' in config and config['disable_table_cache'] == True):
        logger.info("Caching available catalog objects in redshift...")
        filter_schemas = get_schema_names_from_config(config)
        table_columns_cache = DbSync(config).get_table_columns(filter_schemas=filter_schemas)

    # Loop over lines from stdin
    for line in lines:
        try:
            o = json.loads(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if 'type' not in o:
            raise Exception("Line is missing required key 'type': {}".format(line))

        t = o['type']

        if t == 'RECORD':
            if 'stream' not in o:
                raise Exception("Line is missing required key 'stream': {}".format(line))
            if o['stream'] not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema".format(o['stream']))

            # Get schema for this record's stream
            stream = o['stream']

            # Validate record
            try:
                validators[stream].validate(float_to_decimal(o['record']))
            except Exception as ex:
                if type(ex).__name__ == "InvalidOperation":
                    logger.error(
                        "Data validation failed and cannot load to destination. RECORD: {}\n'multipleOf' validations "
                        "that allow long precisions are not supported (i.e. with 15 digits or more). Try removing "
                        "'multipleOf' methods from JSON schema.".format(o['record']))
                raise ex

            primary_key_string = stream_to_sync[stream].record_primary_key_string(o['record'])
            if not primary_key_string:
                primary_key_string = 'RID-{}'.format(total_row_count[stream])

            if stream not in records_to_load:
                records_to_load[stream] = {}

            # increment row count only when a new PK is encountered in the current batch
            if primary_key_string not in records_to_load[stream]:
                row_count[stream] += 1
                total_row_count[stream] += 1

            # append record
            if config.get('add_metadata_columns') or config.get('hard_delete'):
                records_to_load[stream][primary_key_string] = add_metadata_values_to_record(o, stream_to_sync[stream])
            else:
                records_to_load[stream][primary_key_string] = o['record']

            if row_count[stream] >= batch_size_rows:
                # flush all streams, delete records if needed, reset counts and then emit current state
                if config.get('flush_all_streams'):
                    filter_streams = None
                else:
                    filter_streams = [stream]

                # Flush and return a new state dict with new positions only for the flushed streams
                flushed_state = flush_streams(records_to_load,
                                              row_count,
                                              stream_to_sync,
                                              config,
                                              state,
                                              flushed_state,
                                              filter_streams=filter_streams)

                # emit last encountered state
                emit_state(copy.deepcopy(flushed_state))

        elif t == 'SCHEMA':
            if 'stream' not in o:
                raise Exception("Line is missing required key 'stream': {}".format(line))

            stream = o['stream']

            schemas[stream] = o
            schema = float_to_decimal(o['schema'])
            validators[stream] = Draft4Validator(schema, format_checker=FormatChecker())

            # flush records from previous stream SCHEMA:
            # if the same stream has been encountered again, the schema might have been altered,
            # so previous records need to be flushed
            if row_count.get(stream, 0) > 0:
                flushed_state = flush_streams(records_to_load, row_count, stream_to_sync, config, state,
                                              flushed_state)

                # emit latest encountered state
                emit_state(flushed_state)

            # key_properties key must be available in the SCHEMA message.
            if 'key_properties' not in o:
                raise Exception("key_properties field is required")

            # Log based and Incremental replications on tables with no Primary Key
            # cause duplicates when merging UPDATE events.
            # Stop loading data by default if there is no Primary Key.
            #
            # If you want to load tables with no Primary Key:
            # 1) Set ` 'primary_key_required': false ` in the target-redshift config.json
            # or
            # 2) Use fastsync [postgres-to-redshift, mysql-to-redshift, etc.]
            if config.get('primary_key_required', True) and len(o['key_properties']) == 0:
                logger.critical("Primary key is set to mandatory but not defined in the [{}] stream".format(stream))
                raise Exception("key_properties field is required")

            key_properties[stream] = o['key_properties']

            if config.get('add_metadata_columns') or config.get('hard_delete'):
                stream_to_sync[stream] = DbSync(config, add_metadata_columns_to_schema(o))
            else:
                stream_to_sync[stream] = DbSync(config, o)

            stream_to_sync[stream].create_schema_if_not_exists(table_columns_cache)
            stream_to_sync[stream].sync_table(table_columns_cache)

            row_count[stream] = 0
            total_row_count[stream] = 0
            csv_files_to_load[stream] = NamedTemporaryFile(mode='w+b')

        elif t == 'ACTIVATE_VERSION':
            logger.debug('ACTIVATE_VERSION message')

        elif t == 'STATE':
            logger.debug('Setting state to {}'.format(o['value']))
            state = o['value']

            # Initially set flushed state
            if not flushed_state:
                flushed_state = copy.deepcopy(state)

        else:
            raise Exception("Unknown message type {} in message {}".format(o['type'], o))

    # if some bucket has records that need to be flushed but haven't reached batch size,
    # then flush all buckets.
    if sum(row_count.values()) > 0:
        # flush all streams one last time, delete records if needed, reset counts and then emit current state
        flushed_state = flush_streams(records_to_load, row_count, stream_to_sync, config, state, flushed_state)

    # emit latest state
    emit_state(copy.deepcopy(flushed_state))
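# ---------------------------------------------------------------------------
# Hedged illustration (not from the original code): the shape of the Singer
# messages, one JSON document per line, that persist_lines consumes. The
# stream and column names are invented for the example; the keys ("type",
# "stream", "schema", "key_properties", "record", "value") are the ones the
# loop above dispatches on.
# ---------------------------------------------------------------------------
EXAMPLE_SINGER_LINES = [
    '{"type": "SCHEMA", "stream": "users",'
    ' "schema": {"properties": {"id": {"type": "integer"}, "name": {"type": ["null", "string"]}}},'
    ' "key_properties": ["id"]}',
    '{"type": "RECORD", "stream": "users", "record": {"id": 1, "name": "Jane"}}',
    '{"type": "STATE", "value": {"bookmarks": {"users": {"log_file": "mysql-bin.000001", "log_pos": 4}}}}',
]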
def persist_lines(config, lines, table_cache=None) -> None:
    state = None
    flushed_state = None
    schemas = {}
    key_properties = {}
    validators = {}
    records_to_load = {}
    row_count = {}
    stream_to_sync = {}
    total_row_count = {}
    batch_size_rows = config.get("batch_size_rows", DEFAULT_BATCH_SIZE_ROWS)
    last_log_file = ''
    log_files = 0

    # Loop over lines from stdin
    for line in lines:
        try:
            o = json.loads(line)
        except json.decoder.JSONDecodeError:
            LOGGER.error("Unable to parse:\n{}".format(line))
            raise

        if "type" not in o:
            raise Exception("Line is missing required key 'type': {}".format(line))

        t = o["type"]

        if t == "RECORD":
            if "stream" not in o:
                raise Exception("Line is missing required key 'stream': {}".format(line))
            if o["stream"] not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema".format(o["stream"]))

            # Get schema for this record's stream
            stream = o["stream"]

            # Validate record
            if config.get("validate_records"):
                try:
                    validators[stream].validate(float_to_decimal(o["record"]))
                except Exception as ex:
                    if type(ex).__name__ == "InvalidOperation":
                        raise InvalidValidationOperationException(
                            f"Data validation failed and cannot load to destination. RECORD: {o['record']}\n"
                            "multipleOf validations that allow long precisions are not supported (i.e. with 15 digits "
                            "or more). Try removing 'multipleOf' methods from JSON schema.")
                    raise RecordValidationException(
                        f"Record does not pass schema validation. RECORD: {o['record']}")

            primary_key_string = stream_to_sync[stream].record_primary_key_string(o["record"])
            if not primary_key_string:
                primary_key_string = "RID-{}".format(total_row_count[stream])

            if stream not in records_to_load:
                records_to_load[stream] = {}

            # increment row count only when a new PK is encountered in the current batch
            if primary_key_string not in records_to_load[stream]:
                row_count[stream] += 1
                total_row_count[stream] += 1

            # append record
            if config.get("add_metadata_columns") or config.get("hard_delete"):
                records_to_load[stream][primary_key_string] = add_metadata_values_to_record(
                    o, stream_to_sync[stream])
            else:
                records_to_load[stream][primary_key_string] = o["record"]

            # Flush when the batch is full or when 30 binlog files have rotated since the last flush
            if row_count[stream] >= batch_size_rows or log_files >= 30:
                # flush the stream, delete records if needed, reset counts and then emit current state
                filter_streams = [stream]

                # Flush and return a new state dict with new positions only for the flushed streams
                LOGGER.info("FLUSHING ONE STREAM {}".format(stream))
                flushed_state = flush_streams(
                    records_to_load,
                    row_count,
                    stream_to_sync,
                    config,
                    state,
                    flushed_state,
                    filter_streams=filter_streams,
                )
                log_files = 0

                # emit last encountered state
                emit_state(copy.deepcopy(flushed_state))

        elif t == "SCHEMA":
            if "stream" not in o:
                raise Exception("Line is missing required key 'stream': {}".format(line))

            stream = o["stream"]

            schemas[stream] = float_to_decimal(o["schema"])
            validators[stream] = Draft7Validator(schemas[stream], format_checker=FormatChecker())

            # flush records from previous stream SCHEMA:
            # if the same stream has been encountered again, the schema might have been altered,
            # so previous records need to be flushed
            if row_count.get(stream, 0) > 0:
                LOGGER.info("FLUSHING SCHEMA CHANGE")
                flushed_state = flush_streams(
                    records_to_load,
                    row_count,
                    stream_to_sync,
                    config,
                    state,
                    flushed_state,
                )

                # emit latest encountered state
                emit_state(flushed_state)

            # key_properties key must be available in the SCHEMA message.
            if "key_properties" not in o:
                raise Exception("key_properties field is required")

            # Log based and Incremental replications on tables with no Primary Key
            # cause duplicates when merging UPDATE events.
            # Stop loading data by default if there is no Primary Key.
            #
            # If you want to load tables with no Primary Key:
            # 1) Set ` 'primary_key_required': false ` in the target-redshift config.json
            # or
            # 2) Use fastsync [postgres-to-redshift, mysql-to-redshift, etc.]
            if config.get("primary_key_required", True) and len(o["key_properties"]) == 0:
                LOGGER.critical("Primary key is set to mandatory but not defined in the [{}] stream".format(stream))
                raise Exception("key_properties field is required")

            key_properties[stream] = o["key_properties"]

            if config.get("add_metadata_columns") or config.get("hard_delete"):
                stream_to_sync[stream] = DbSync(config, add_metadata_columns_to_schema(o), table_cache)
            else:
                stream_to_sync[stream] = DbSync(config, o, table_cache)

            stream_to_sync[stream].create_schema_if_not_exists()
            stream_to_sync[stream].sync_table()

            row_count[stream] = 0
            total_row_count[stream] = 0

        elif t == "ACTIVATE_VERSION":
            LOGGER.debug("ACTIVATE_VERSION message")

        elif t == "STATE":
            LOGGER.debug("Setting state to {}".format(o["value"]))
            state = o["value"]

            # Track binlog file rotations so state is still emitted when nothing is buffered
            if 'bookmarks' in state:
                log_file = list(state['bookmarks'].values())[0].get('log_file')

                if log_file != last_log_file:
                    log_files += 1

                    if sum(row_count.values()) == 0 and log_files >= 30:
                        log_files = 0
                        emit_state(state)

                    LOGGER.info("LOG Rotated to {}".format(log_file))

                last_log_file = log_file

            # Initially set flushed state
            if not flushed_state:
                flushed_state = copy.deepcopy(state)

        else:
            raise Exception("Unknown message type {} in message {}".format(o["type"], o))

    # if some bucket has records that need to be flushed but haven't reached batch size,
    # then flush all buckets.
    if sum(row_count.values()) > 0:
        # flush all streams one last time, delete records if needed, reset counts and then emit current state
        LOGGER.info("REACHED END OF BINLOG UPDATES")
        flushed_state = flush_streams(records_to_load, row_count, stream_to_sync, config, state, flushed_state)
    else:
        LOGGER.info("NO RECORDS TO PERSIST")
        flushed_state = state

    # emit latest state
    emit_state(copy.deepcopy(flushed_state))