def run(self) -> bool: # retrieve the highest current value for the modification comparison (e.g.: the highest timestamp) # We intentionally use the command line here (rather than sqlalchemy) to avoid forcing people python drivers, # which can be hard for example in the case of SQL Server logger.log(f'get highest modification comparison value', format=logger.Format.ITALICS) max_value_query = f'SELECT max({self.modification_comparison}) AS maxval FROM {self.source_table}' logger.log(max_value_query, format=logger.Format.VERBATIM) result = shell.run_shell_command( f'echo {shlex.quote(max_value_query)} \\\n | ' + mara_db.shell.copy_to_stdout_command(self.source_db_alias)) if not result: return False # be flexible with different output formats: remove the column header & remove whitespace & quotes max_modification_value = ''.join(result).replace('maxval', '').strip().strip('"') logger.log(repr(max_modification_value), format=logger.Format.VERBATIM) # check whether target table is empty target_table_is_empty = True target_table_empty_query = f'SELECT TRUE FROM {self.target_table} LIMIT 1' logger.log(f'check if target table is empty', format=logger.Format.ITALICS) logger.log(target_table_empty_query, format=logger.Format.VERBATIM) with mara_db.postgresql.postgres_cursor_context( self.target_db_alias) as cursor: cursor.execute(f'SELECT TRUE FROM {self.target_table} LIMIT 1') target_table_is_empty = not cursor.fetchone() logger.log( f"target table{'' if target_table_is_empty else ' not'} empty", format=logger.Format.ITALICS) # get last comparison value logger.log('get last comparison value', format=logger.Format.ITALICS) last_comparison_value = incremental_copy_status.get_last_comparison_value( self.node_path(), self.source_db_alias, self.source_table) logger.log(repr(last_comparison_value), format=logger.Format.VERBATIM) if target_table_is_empty or not last_comparison_value: # full load logger.log('full (non incremental) copy', logger.Format.ITALICS) if not target_table_is_empty: truncate_query = f'TRUNCATE TABLE {self.target_table}' logger.log(truncate_query, format=logger.Format.VERBATIM) with mara_db.postgresql.postgres_cursor_context( self.target_db_alias) as cursor: cursor.execute(truncate_query) # overwrite the comparison criteria to get everything replace = {self.comparison_value_placeholder: '(1=1)'} complete_copy_command = self._copy_command(self.target_table, replace) if not shell.run_shell_command(complete_copy_command): return False else: # incremental load. First create the table which will contain the delta logger.log('incremental copy, create upsert table', logger.Format.ITALICS) create_upsert_table_query = ( f'DROP TABLE IF EXISTS {self.target_table}_upsert;\n' + f'CREATE TABLE {self.target_table}_upsert AS SELECT * from {self.target_table} WHERE FALSE' ) if not shell.run_shell_command( f'echo {shlex.quote(create_upsert_table_query)} \\\n | ' + mara_db.shell.query_command(self.target_db_alias, echo_queries=True)): return False # perform the actual copy replacing the placeholder # with the comparison value from the latest successful execution replace = { self.comparison_value_placeholder: f'({self.modification_comparison} >= \'{last_comparison_value}\')' } if not shell.run_shell_command( self._copy_command(self.target_table + '_upsert', replace)): return False # now the upsert table has to be merged with the target one # retrieve the target table columns to build the SET clause of the upsert query with mara_db.postgresql.postgres_cursor_context( self.target_db_alias) as cursor: retrieve_column_query = f"SELECT attname FROM pg_attribute WHERE attrelid = '{self.target_table}'::REGCLASS AND attnum > 0;" logger.log(retrieve_column_query, format=logger.Format.VERBATIM) cursor.execute(retrieve_column_query) set_clause = ', '.join([ f'"{col[0]}" = EXCLUDED."{col[0]}"' for col in cursor.fetchall() ]) upsery_query = f""" INSERT INTO {self.target_table} SELECT {self.target_table}_upsert.* FROM {self.target_table}_upsert ON CONFLICT ({', '.join(['"' + primary_key +'"' for primary_key in self.primary_keys])}) DO UPDATE SET {set_clause}""" if not shell.run_shell_command( f'echo {shlex.quote(upsery_query)} \\\n | ' + mara_db.shell.query_command(self.target_db_alias, echo_queries=True)): return False # update data_integration_incremental_copy_status incremental_copy_status.update(self.node_path(), self.source_db_alias, self.source_table, max_modification_value) return True
def run(self) -> bool: # retrieve the highest current value for the modification comparison (e.g.: the highest timestamp) # We intentionally use the command line here (rather than sqlalchemy) to avoid forcing people python drivers, # which can be hard for example in the case of SQL Server logger.log(f'Get new max modification comparison value...', format=logger.Format.ITALICS) max_value_query = f'SELECT max({self.modification_comparison}) AS maxval FROM {self.source_table}' logger.log(max_value_query, format=logger.Format.VERBATIM) result = shell.run_shell_command(f'echo {shlex.quote(max_value_query)} \\\n | ' + mara_db.shell.copy_to_stdout_command(self.source_db_alias)) if not result: return False if isinstance(result, bool): # This happens if the query above ran, but returned no data and therefore the load # query below would also return no data # We assume that this happens e.g. when there is no data *yet* and let the load succeed # without actually doing anything logger.log("Found no data, not starting Copy.", format=logger.Format.VERBATIM) return True # be flexible with different output formats: remove the column header & remove whitespace & quotes max_modification_value = ''.join(result).replace('maxval', '').strip().strip('"') logger.log(f"New max modification comparison value: {max_modification_value!r}", format=logger.Format.VERBATIM) # check whether target table is empty target_table_is_empty = True target_table_empty_query = f'SELECT TRUE FROM {self.target_table} LIMIT 1' logger.log(f'Check if target table is empty', format=logger.Format.ITALICS) logger.log(target_table_empty_query, format=logger.Format.VERBATIM) with mara_db.postgresql.postgres_cursor_context(self.target_db_alias) as cursor: cursor.execute(f'SELECT TRUE FROM {self.target_table} LIMIT 1') target_table_is_empty = not cursor.fetchone() logger.log(f"target table{'' if target_table_is_empty else ' not'} empty", format=logger.Format.ITALICS) # get last comparison value logger.log('Get last comparison value...', format=logger.Format.ITALICS) last_comparison_value = incremental_copy_status.get_last_comparison_value( self.node_path(), self.source_db_alias, self.source_table) logger.log(f"Last max modification comparison value: {last_comparison_value!r}", format=logger.Format.VERBATIM) if target_table_is_empty or not last_comparison_value: # full load logger.log('Using full (non incremental) Copy', logger.Format.ITALICS) if not target_table_is_empty: truncate_query = f'TRUNCATE TABLE {self.target_table}' logger.log(truncate_query, format=logger.Format.VERBATIM) with mara_db.postgresql.postgres_cursor_context(self.target_db_alias) as cursor: cursor.execute(truncate_query) elif last_comparison_value: # table is empty but we have a last comparison value from earlier runs # If we would crash during load (with some data already in the table), the next run would # not trigger a full load and we would miss data. To prevent that, delete the old # comparison value (we will then set it only on success) logger.log('Deleting old comparison value', logger.Format.ITALICS) incremental_copy_status.delete(self.node_path(), self.source_db_alias, self.source_table) # overwrite the comparison criteria to get everything replace = {self.comparison_value_placeholder: '(1=1)'} complete_copy_command = self._copy_command(self.target_table, replace) if not shell.run_shell_command(complete_copy_command): return False else: # incremental load. First create the table which will contain the delta logger.log('Using incremental Copy, create upsert table', logger.Format.ITALICS) create_upsert_table_query = (f'DROP TABLE IF EXISTS {self.target_table}_upsert;\n' + f'CREATE TABLE {self.target_table}_upsert AS SELECT * from {self.target_table} WHERE FALSE') if not shell.run_shell_command(f'echo {shlex.quote(create_upsert_table_query)} \\\n | ' + mara_db.shell.query_command(self.target_db_alias)): return False # perform the actual copy replacing the placeholder # with the comparison value from the latest successful execution modification_comparison_type = self.modification_comparison_type or '' replace = {self.comparison_value_placeholder: f'({self.modification_comparison} >= {modification_comparison_type} \'{last_comparison_value}\')'} if not shell.run_shell_command(self._copy_command(self.target_table + '_upsert', replace)): return False # now the upsert table has to be merged with the target one # retrieve the target table columns to build the SET clause of the upsert query with mara_db.postgresql.postgres_cursor_context(self.target_db_alias) as cursor: retrieve_column_query = f"SELECT attname FROM pg_attribute WHERE attrelid = '{self.target_table}'::REGCLASS AND attnum > 0;" logger.log(retrieve_column_query, format=logger.Format.VERBATIM) cursor.execute(retrieve_column_query) if self.use_explicit_upsert: set_clause = ', '.join([f'"{col[0]}" = src."{col[0]}"' for col in cursor.fetchall()]) key_definition = ' AND '.join([f'dst."{k}" = src."{k}"' for k in self.primary_keys]) else: set_clause = ', '.join([f'"{col[0]}" = EXCLUDED."{col[0]}"' for col in cursor.fetchall()]) key_definition = ', '.join(['"' + primary_key + '"' for primary_key in self.primary_keys]) if self.use_explicit_upsert: update_query = f""" UPDATE {self.target_table} dst SET {set_clause} FROM {self.target_table}_upsert src WHERE {key_definition}""" insert_query = f""" INSERT INTO {self.target_table} SELECT src.* FROM {self.target_table}_upsert src WHERE NOT EXISTS (SELECT 1 FROM {self.target_table} dst WHERE {key_definition})""" if not shell.run_shell_command(f'echo {shlex.quote(update_query)} \\\n | ' + mara_db.shell.query_command(self.target_db_alias)): return False elif not shell.run_shell_command(f'echo {shlex.quote(insert_query)} \\\n | ' + mara_db.shell.query_command(self.target_db_alias)): return False else: upsery_query = f""" INSERT INTO {self.target_table} SELECT {self.target_table}_upsert.* FROM {self.target_table}_upsert ON CONFLICT ({key_definition}) DO UPDATE SET {set_clause}""" if not shell.run_shell_command(f'echo {shlex.quote(upsery_query)} \\\n | ' + mara_db.shell.query_command(self.target_db_alias)): return False # update data_integration_incremental_copy_status incremental_copy_status.update(self.node_path(), self.source_db_alias, self.source_table, max_modification_value) return True