Example #1
def load_table_cache(config):
    table_cache = []
    if not ("disable_table_cache" in config and config["disable_table_cache"]):
        LOGGER.info("Getting catalog objects from table cache...")
        db = DbSync(config)
        table_cache = db.get_table_columns(
            filter_schemas=get_schema_names_from_config(config))

    return table_cache
Example #2
def persist_lines(config, lines) -> None:
    state = None
    flushed_state = None
    schemas = {}
    key_properties = {}
    validators = {}
    records_to_load = {}
    csv_files_to_load = {}
    row_count = {}
    stream_to_sync = {}
    total_row_count = {}
    table_columns_cache = None
    batch_size_rows = config.get('batch_size_rows', DEFAULT_BATCH_SIZE_ROWS)

    # Cache the available schemas, tables and columns from redshift if not disabled in config
    # The cache is used later to avoid lots of small metadata queries hitting redshift
    if not ('disable_table_cache' in config
            and config['disable_table_cache']):
        logger.info("Caching available catalog objects in redshift...")
        filter_schemas = get_schema_names_from_config(config)
        table_columns_cache = DbSync(config).get_table_columns(
            filter_schemas=filter_schemas)

    # Loop over lines from stdin
    for line in lines:
        try:
            o = json.loads(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if 'type' not in o:
            raise Exception(
                "Line is missing required key 'type': {}".format(line))

        t = o['type']

        if t == 'RECORD':
            if 'stream' not in o:
                raise Exception(
                    "Line is missing required key 'stream': {}".format(line))
            if o['stream'] not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema"
                    .format(o['stream']))

            # Get schema for this record's stream
            stream = o['stream']

            # Validate record
            try:
                validators[stream].validate(float_to_decimal(o['record']))
            except Exception as ex:
                if type(ex).__name__ == "InvalidOperation":
                    logger.error(
                        "Data validation failed and cannot load to destination. RECORD: {}\n'multipleOf' validations "
                        "that allow long precisions are not supported (i.e. with 15 digits or more). Try removing "
                        "'multipleOf' methods from JSON schema. ".format(
                            o['record']))
                    raise ex
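                # NOTE: exceptions other than InvalidOperation are not re-raised here,
                # so records that fail schema validation are still loaded
                # (Example #3 below re-raises them as RecordValidationException).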

            primary_key_string = stream_to_sync[
                stream].record_primary_key_string(o['record'])
            if not primary_key_string:
                primary_key_string = 'RID-{}'.format(total_row_count[stream])

            if stream not in records_to_load:
                records_to_load[stream] = {}

            # increment row count only when a new PK is encountered in the current batch
            if primary_key_string not in records_to_load[stream]:
                row_count[stream] += 1
                total_row_count[stream] += 1

            # append record
            if config.get('add_metadata_columns') or config.get('hard_delete'):
                records_to_load[stream][
                    primary_key_string] = add_metadata_values_to_record(
                        o, stream_to_sync[stream])
            else:
                records_to_load[stream][primary_key_string] = o['record']

            if row_count[stream] >= batch_size_rows:
                # flush all streams, delete records if needed, reset counts and then emit current state
                if config.get('flush_all_streams'):
                    filter_streams = None
                else:
                    filter_streams = [stream]

                # Flush and return a new state dict with new positions only for the flushed streams
                flushed_state = flush_streams(records_to_load,
                                              row_count,
                                              stream_to_sync,
                                              config,
                                              state,
                                              flushed_state,
                                              filter_streams=filter_streams)

                # emit last encountered state
                emit_state(copy.deepcopy(flushed_state))

        elif t == 'SCHEMA':
            if 'stream' not in o:
                raise Exception(
                    "Line is missing required key 'stream': {}".format(line))

            stream = o['stream']

            schemas[stream] = o
            schema = float_to_decimal(o['schema'])
            validators[stream] = Draft4Validator(
                schema, format_checker=FormatChecker())

            # flush records from previous stream SCHEMA
            # if same stream has been encountered again, it means the schema might have been altered
            # so previous records need to be flushed
            if row_count.get(stream, 0) > 0:
                flushed_state = flush_streams(records_to_load, row_count,
                                              stream_to_sync, config, state,
                                              flushed_state)

                # emit latest encountered state
                emit_state(flushed_state)

            # key_properties key must be available in the SCHEMA message.
            if 'key_properties' not in o:
                raise Exception("key_properties field is required")

            # Log based and Incremental replications on tables with no Primary Key
            # cause duplicates when merging UPDATE events.
            # Stop loading data by default if no Primary Key.
            #
            # If you want to load tables with no Primary Key:
            #  1) Set ` 'primary_key_required': false ` in the target-redshift config.json
            #  or
            #  2) Use fastsync [postgres-to-redshift, mysql-to-redshift, etc.]
            if config.get('primary_key_required', True) and len(
                    o['key_properties']) == 0:
                logger.critical(
                    "Primary key is set to mandatory but not defined in the [{}] stream"
                    .format(stream))
                raise Exception("key_properties field is required")

            key_properties[stream] = o['key_properties']

            if config.get('add_metadata_columns') or config.get('hard_delete'):
                stream_to_sync[stream] = DbSync(
                    config, add_metadata_columns_to_schema(o))
            else:
                stream_to_sync[stream] = DbSync(config, o)

            stream_to_sync[stream].create_schema_if_not_exists(
                table_columns_cache)
            stream_to_sync[stream].sync_table(table_columns_cache)

            row_count[stream] = 0
            total_row_count[stream] = 0
            csv_files_to_load[stream] = NamedTemporaryFile(mode='w+b')

        elif t == 'ACTIVATE_VERSION':
            logger.debug('ACTIVATE_VERSION message')

        elif t == 'STATE':
            logger.debug('Setting state to {}'.format(o['value']))
            state = o['value']

            # Initially set flushed state
            if not flushed_state:
                flushed_state = copy.deepcopy(state)

        else:
            raise Exception("Unknown message type {} in message {}".format(
                o['type'], o))

    # If any stream still has buffered records that never reached the batch size,
    # flush all streams.
    if sum(row_count.values()) > 0:
        # flush all streams one last time, delete records if needed, reset counts and then emit current state
        flushed_state = flush_streams(records_to_load, row_count,
                                      stream_to_sync, config, state,
                                      flushed_state)

    # emit latest state
    emit_state(copy.deepcopy(flushed_state))
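
A minimal sketch of how a Singer target typically drives the persist_lines variant above. The main() wrapper, the -c/--config flag, and the UTF-8 stdin wrapping are illustrative assumptions, and persist_lines is assumed to be defined in the same module:

import argparse
import io
import json
import sys


def main():
    # Hypothetical entry point: read the target config path from -c/--config.
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', required=True, help='Path to the target config JSON')
    args = parser.parse_args()

    with open(args.config, encoding='utf-8') as config_file:
        config = json.load(config_file)

    # Wrap stdin so Singer messages are decoded as UTF-8 text, then load them in batches.
    singer_messages = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
    persist_lines(config, singer_messages)


if __name__ == '__main__':
    main()
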
Example #3
def persist_lines(config, lines, table_cache=None) -> None:
    state = None
    flushed_state = None
    schemas = {}
    key_properties = {}
    validators = {}
    records_to_load = {}
    row_count = {}
    stream_to_sync = {}
    total_row_count = {}
    batch_size_rows = config.get("batch_size_rows", DEFAULT_BATCH_SIZE_ROWS)

    last_log_file = ''
    log_files = 0

    # Loop over lines from stdin

    for line in lines:
        try:
            o = json.loads(line)
        except json.decoder.JSONDecodeError:
            LOGGER.error("Unable to parse:\n{}".format(line))
            raise

        if "type" not in o:
            raise Exception(
                "Line is missing required key 'type': {}".format(line))

        t = o["type"]

        if t == "RECORD":
            if "stream" not in o:
                raise Exception(
                    "Line is missing required key 'stream': {}".format(line))
            if o["stream"] not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema"
                    .format(o["stream"]))

            # Get schema for this record's stream
            stream = o["stream"]

            # Validate record
            if config.get("validate_records"):
                try:
                    validators[stream].validate(float_to_decimal(o["record"]))
                except Exception as ex:
                    if type(ex).__name__ == "InvalidOperation":
                        raise InvalidValidationOperationException(
                            f"Data validation failed and cannot load to destination. RECORD: {o['record']}\n"
                            "multipleOf validations that allow long precisions are not supported (i.e. with 15 digits "
                            "or more). Try removing 'multipleOf' methods from JSON schema."
                        )
                    raise RecordValidationException(
                        f"Record does not pass schema validation. RECORD: {o['record']}"
                    )

            primary_key_string = stream_to_sync[
                stream].record_primary_key_string(o["record"])
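            # No primary key value for this record: fall back to a synthetic row id so
            # each record still gets a distinct slot in the per-stream batch dictionary.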
            if not primary_key_string:
                primary_key_string = "RID-{}".format(total_row_count[stream])

            if stream not in records_to_load:
                records_to_load[stream] = {}

            # increment row count only when a new PK is encountered in the current batch
            if primary_key_string not in records_to_load[stream]:
                row_count[stream] += 1
                total_row_count[stream] += 1

            # append record
            if config.get("add_metadata_columns") or config.get("hard_delete"):
                records_to_load[stream][
                    primary_key_string] = add_metadata_values_to_record(
                        o, stream_to_sync[stream])
            else:
                records_to_load[stream][primary_key_string] = o["record"]

            # Flush either when the current batch is full or after 30 log file rotations
            if row_count[stream] >= batch_size_rows or log_files >= 30:
                # flush all streams, delete records if needed, reset counts and then emit current state
                filter_streams = [stream]

                # Flush and return a new state dict with new positions only for the flushed streams
                LOGGER.info("FLUSHING ONE STREAM {}".format(stream))

                flushed_state = flush_streams(
                    records_to_load,
                    row_count,
                    stream_to_sync,
                    config,
                    state,
                    flushed_state,
                    filter_streams=filter_streams,
                )

                log_files = 0
                # emit last encountered state
                emit_state(copy.deepcopy(flushed_state))

        elif t == "SCHEMA":
            if "stream" not in o:
                raise Exception(
                    "Line is missing required key 'stream': {}".format(line))

            stream = o["stream"]

            schemas[stream] = float_to_decimal(o["schema"])
            validators[stream] = Draft7Validator(
                schemas[stream], format_checker=FormatChecker())

            # flush records from previous stream SCHEMA
            # if same stream has been encountered again, it means the schema might have been altered
            # so previous records need to be flushed
            if row_count.get(stream, 0) > 0:
                LOGGER.info("FLUSHING SCHEMA CHANGE")
                flushed_state = flush_streams(
                    records_to_load,
                    row_count,
                    stream_to_sync,
                    config,
                    state,
                    flushed_state,
                )

                # emit latest encountered state
                emit_state(flushed_state)

            # key_properties key must be available in the SCHEMA message.
            if "key_properties" not in o:
                raise Exception("key_properties field is required")

            # Log based and Incremental replications on tables with no Primary Key
            # cause duplicates when merging UPDATE events.
            # Stop loading data by default if no Primary Key.
            #
            # If you want to load tables with no Primary Key:
            #  1) Set ` 'primary_key_required': false ` in the target-redshift config.json
            #  or
            #  2) Use fastsync [postgres-to-redshift, mysql-to-redshift, etc.]
            if (config.get("primary_key_required", True)
                    and len(o["key_properties"]) == 0):
                LOGGER.critical(
                    "Primary key is set to mandatory but not defined in the [{}] stream"
                    .format(stream))
                raise Exception("key_properties field is required")

            key_properties[stream] = o["key_properties"]

            if config.get("add_metadata_columns") or config.get("hard_delete"):
                stream_to_sync[stream] = DbSync(
                    config, add_metadata_columns_to_schema(o), table_cache)
            else:
                stream_to_sync[stream] = DbSync(config, o, table_cache)

            stream_to_sync[stream].create_schema_if_not_exists()
            stream_to_sync[stream].sync_table()

            row_count[stream] = 0
            total_row_count[stream] = 0

        elif t == "ACTIVATE_VERSION":
            LOGGER.debug("ACTIVATE_VERSION message")

        elif t == "STATE":
            LOGGER.debug("Setting state to {}".format(o["value"]))

            state = o["value"]

            if 'bookmarks' in state:
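                # Track log file rotations (e.g. MySQL binlog) reported in the state
                # bookmarks; if nothing is buffered after 30 rotations, emit the state
                # anyway so the bookmark still advances while no records arrive.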

                log_file = list(state['bookmarks'].values())[0].get('log_file')

                if log_file != last_log_file:
                    log_files += 1
                    if sum(row_count.values()) == 0 and log_files >= 30:
                        log_files = 0
                        emit_state(state)
                    LOGGER.info("LOG Rotated to {}".format(log_file))

                last_log_file = log_file

            # Initially set flushed state
            if not flushed_state:
                flushed_state = copy.deepcopy(state)

        else:
            raise Exception("Unknown message type {} in message {}".format(
                o["type"], o))

    # If any stream still has buffered records that never reached the batch size,
    # flush all streams.
    if sum(row_count.values()) > 0:
        # flush all streams one last time, delete records if needed, reset counts and then emit current state
        LOGGER.info("REACHED END OF BINLOG UPDATES")
        flushed_state = flush_streams(records_to_load, row_count,
                                      stream_to_sync, config, state,
                                      flushed_state)
    else:
        LOGGER.info("NO RECORDS TO PERSIST")
        flushed_state = state

    # emit latest state
    emit_state(copy.deepcopy(flushed_state))
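
A brief sketch of how this table_cache-aware variant pairs with load_table_cache from Example #1. run_target is a hypothetical wrapper name, and both functions are assumed to be defined in the same module:

import io
import sys


def run_target(config: dict) -> None:
    # Hypothetical wrapper: build the catalog/column cache once (see load_table_cache
    # in Example #1) so SCHEMA handling avoids repeated metadata queries, then stream
    # Singer messages from stdin into the target in batches.
    table_cache = load_table_cache(config)
    singer_messages = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
    persist_lines(config, singer_messages, table_cache=table_cache)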