예제 #1
0
    def run(self) -> bool:
        # retrieve the highest current value for the modification comparison (e.g.: the highest timestamp)
        # We intentionally use the command line here (rather than sqlalchemy) to avoid forcing people python drivers,
        # which can be hard for example in the case of SQL Server
        logger.log(f'get highest modification comparison value',
                   format=logger.Format.ITALICS)
        max_value_query = f'SELECT max({self.modification_comparison}) AS maxval FROM {self.source_table}'
        logger.log(max_value_query, format=logger.Format.VERBATIM)
        result = shell.run_shell_command(
            f'echo {shlex.quote(max_value_query)} \\\n  | ' +
            mara_db.shell.copy_to_stdout_command(self.source_db_alias))

        if not result:
            return False

        # be flexible with different output formats: remove the column header & remove whitespace & quotes
        max_modification_value = ''.join(result).replace('maxval',
                                                         '').strip().strip('"')
        logger.log(repr(max_modification_value), format=logger.Format.VERBATIM)

        # check whether target table is empty
        target_table_is_empty = True

        target_table_empty_query = f'SELECT TRUE FROM {self.target_table} LIMIT 1'
        logger.log(f'check if target table is empty',
                   format=logger.Format.ITALICS)
        logger.log(target_table_empty_query, format=logger.Format.VERBATIM)
        with mara_db.postgresql.postgres_cursor_context(
                self.target_db_alias) as cursor:
            cursor.execute(f'SELECT TRUE FROM {self.target_table} LIMIT 1')
            target_table_is_empty = not cursor.fetchone()
            logger.log(
                f"target table{'' if target_table_is_empty else ' not'} empty",
                format=logger.Format.ITALICS)

        # get last comparison value
        logger.log('get last comparison value', format=logger.Format.ITALICS)
        last_comparison_value = incremental_copy_status.get_last_comparison_value(
            self.node_path(), self.source_db_alias, self.source_table)
        logger.log(repr(last_comparison_value), format=logger.Format.VERBATIM)

        if target_table_is_empty or not last_comparison_value:
            # full load
            logger.log('full (non incremental) copy', logger.Format.ITALICS)
            if not target_table_is_empty:
                truncate_query = f'TRUNCATE TABLE {self.target_table}'
                logger.log(truncate_query, format=logger.Format.VERBATIM)
                with mara_db.postgresql.postgres_cursor_context(
                        self.target_db_alias) as cursor:
                    cursor.execute(truncate_query)

            # overwrite the comparison criteria to get everything
            replace = {self.comparison_value_placeholder: '(1=1)'}
            complete_copy_command = self._copy_command(self.target_table,
                                                       replace)
            if not shell.run_shell_command(complete_copy_command):
                return False

        else:
            # incremental load. First create the table which will contain the delta
            logger.log('incremental copy, create upsert table',
                       logger.Format.ITALICS)
            create_upsert_table_query = (
                f'DROP TABLE IF EXISTS {self.target_table}_upsert;\n' +
                f'CREATE TABLE {self.target_table}_upsert AS SELECT * from {self.target_table} WHERE FALSE'
            )

            if not shell.run_shell_command(
                    f'echo {shlex.quote(create_upsert_table_query)} \\\n  | ' +
                    mara_db.shell.query_command(self.target_db_alias,
                                                echo_queries=True)):
                return False

            # perform the actual copy replacing the placeholder
            # with the comparison value from the latest successful execution
            replace = {
                self.comparison_value_placeholder:
                f'({self.modification_comparison} >= \'{last_comparison_value}\')'
            }
            if not shell.run_shell_command(
                    self._copy_command(self.target_table + '_upsert',
                                       replace)):
                return False

            # now the upsert table has to be merged with the target one

            # retrieve the target table columns to build the SET clause of the upsert query
            with mara_db.postgresql.postgres_cursor_context(
                    self.target_db_alias) as cursor:
                retrieve_column_query = f"SELECT attname FROM pg_attribute WHERE attrelid = '{self.target_table}'::REGCLASS AND attnum > 0;"
                logger.log(retrieve_column_query,
                           format=logger.Format.VERBATIM)
                cursor.execute(retrieve_column_query)
                set_clause = ', '.join([
                    f'"{col[0]}" = EXCLUDED."{col[0]}"'
                    for col in cursor.fetchall()
                ])

            upsery_query = f"""
INSERT INTO {self.target_table}
SELECT {self.target_table}_upsert.*
FROM {self.target_table}_upsert
ON CONFLICT ({', '.join(['"' + primary_key +'"' for primary_key in self.primary_keys])})
DO UPDATE SET {set_clause}"""
            if not shell.run_shell_command(
                    f'echo {shlex.quote(upsery_query)} \\\n  | ' +
                    mara_db.shell.query_command(self.target_db_alias,
                                                echo_queries=True)):
                return False

        # update data_integration_incremental_copy_status
        incremental_copy_status.update(self.node_path(), self.source_db_alias,
                                       self.source_table,
                                       max_modification_value)
        return True
예제 #2
0
파일: sql.py 프로젝트: mara/mara-pipelines
    def run(self) -> bool:
        # retrieve the highest current value for the modification comparison (e.g.: the highest timestamp)
        # We intentionally use the command line here (rather than sqlalchemy) to avoid forcing people python drivers,
        # which can be hard for example in the case of SQL Server
        logger.log(f'Get new max modification comparison value...', format=logger.Format.ITALICS)
        max_value_query = f'SELECT max({self.modification_comparison}) AS maxval FROM {self.source_table}'
        logger.log(max_value_query, format=logger.Format.VERBATIM)
        result = shell.run_shell_command(f'echo {shlex.quote(max_value_query)} \\\n  | '
                                         + mara_db.shell.copy_to_stdout_command(self.source_db_alias))

        if not result:
            return False

        if isinstance(result, bool):
            # This happens if the query above ran, but returned no data and therefore the load
            # query below would also return no data
            # We assume that this happens e.g. when there is no data *yet* and let the load succeed
            # without actually doing anything
            logger.log("Found no data, not starting Copy.", format=logger.Format.VERBATIM)
            return True
        # be flexible with different output formats: remove the column header & remove whitespace & quotes
        max_modification_value = ''.join(result).replace('maxval', '').strip().strip('"')
        logger.log(f"New max modification comparison value: {max_modification_value!r}", format=logger.Format.VERBATIM)

        # check whether target table is empty
        target_table_is_empty = True

        target_table_empty_query = f'SELECT TRUE FROM {self.target_table} LIMIT 1'
        logger.log(f'Check if target table is empty', format=logger.Format.ITALICS)
        logger.log(target_table_empty_query, format=logger.Format.VERBATIM)
        with mara_db.postgresql.postgres_cursor_context(self.target_db_alias) as cursor:
            cursor.execute(f'SELECT TRUE FROM {self.target_table} LIMIT 1')
            target_table_is_empty = not cursor.fetchone()
            logger.log(f"target table{'' if target_table_is_empty else ' not'} empty", format=logger.Format.ITALICS)

        # get last comparison value
        logger.log('Get last comparison value...', format=logger.Format.ITALICS)
        last_comparison_value = incremental_copy_status.get_last_comparison_value(
            self.node_path(), self.source_db_alias, self.source_table)
        logger.log(f"Last max modification comparison value: {last_comparison_value!r}", format=logger.Format.VERBATIM)

        if target_table_is_empty or not last_comparison_value:
            # full load
            logger.log('Using full (non incremental) Copy', logger.Format.ITALICS)
            if not target_table_is_empty:
                truncate_query = f'TRUNCATE TABLE {self.target_table}'
                logger.log(truncate_query, format=logger.Format.VERBATIM)
                with mara_db.postgresql.postgres_cursor_context(self.target_db_alias) as cursor:
                    cursor.execute(truncate_query)
            elif last_comparison_value:
                # table is empty but we have a last comparison value from earlier runs
                # If we would crash during load (with some data already in the table), the next run would
                # not trigger a full load and we would miss data. To prevent that, delete the old
                # comparison value (we will then set it only on success)
                logger.log('Deleting old comparison value', logger.Format.ITALICS)
                incremental_copy_status.delete(self.node_path(), self.source_db_alias, self.source_table)

            # overwrite the comparison criteria to get everything
            replace = {self.comparison_value_placeholder: '(1=1)'}
            complete_copy_command = self._copy_command(self.target_table, replace)
            if not shell.run_shell_command(complete_copy_command):
                return False

        else:
            # incremental load. First create the table which will contain the delta
            logger.log('Using incremental Copy, create upsert table', logger.Format.ITALICS)
            create_upsert_table_query = (f'DROP TABLE IF EXISTS {self.target_table}_upsert;\n'
                                         + f'CREATE TABLE {self.target_table}_upsert AS SELECT * from {self.target_table} WHERE FALSE')

            if not shell.run_shell_command(f'echo {shlex.quote(create_upsert_table_query)} \\\n  | '
                                           + mara_db.shell.query_command(self.target_db_alias)):
                return False

            # perform the actual copy replacing the placeholder
            # with the comparison value from the latest successful execution
            modification_comparison_type = self.modification_comparison_type or ''
            replace = {self.comparison_value_placeholder:
                           f'({self.modification_comparison} >= {modification_comparison_type} \'{last_comparison_value}\')'}
            if not shell.run_shell_command(self._copy_command(self.target_table + '_upsert', replace)):
                return False

            # now the upsert table has to be merged with the target one

            # retrieve the target table columns to build the SET clause of the upsert query
            with mara_db.postgresql.postgres_cursor_context(self.target_db_alias) as cursor:
                retrieve_column_query = f"SELECT attname FROM pg_attribute WHERE attrelid = '{self.target_table}'::REGCLASS AND attnum > 0;"
                logger.log(retrieve_column_query, format=logger.Format.VERBATIM)
                cursor.execute(retrieve_column_query)
                if self.use_explicit_upsert:
                    set_clause = ', '.join([f'"{col[0]}" = src."{col[0]}"' for col in cursor.fetchall()])
                    key_definition = ' AND '.join([f'dst."{k}" = src."{k}"' for k in self.primary_keys])
                else:
                    set_clause = ', '.join([f'"{col[0]}" = EXCLUDED."{col[0]}"' for col in cursor.fetchall()])
                    key_definition = ', '.join(['"' + primary_key + '"' for primary_key in self.primary_keys])

            if self.use_explicit_upsert:
                update_query = f"""
UPDATE {self.target_table} dst
SET {set_clause}
FROM {self.target_table}_upsert src
WHERE {key_definition}"""

                insert_query = f"""
INSERT INTO {self.target_table}
SELECT src.*
FROM {self.target_table}_upsert src
WHERE NOT EXISTS (SELECT 1 FROM {self.target_table} dst WHERE {key_definition})"""
                if not shell.run_shell_command(f'echo {shlex.quote(update_query)} \\\n  | '
                                               + mara_db.shell.query_command(self.target_db_alias)):
                    return False
                elif not shell.run_shell_command(f'echo {shlex.quote(insert_query)} \\\n  | '
                                                 + mara_db.shell.query_command(self.target_db_alias)):
                    return False
            else:
                upsery_query = f"""
INSERT INTO {self.target_table}
SELECT {self.target_table}_upsert.*
FROM {self.target_table}_upsert
ON CONFLICT ({key_definition})
DO UPDATE SET {set_clause}"""
                if not shell.run_shell_command(f'echo {shlex.quote(upsery_query)} \\\n  | '
                                               + mara_db.shell.query_command(self.target_db_alias)):
                    return False

        # update data_integration_incremental_copy_status
        incremental_copy_status.update(self.node_path(), self.source_db_alias,
                                       self.source_table, max_modification_value)
        return True