def _commit(self, transaction_manager):
    """Commit ongoing transaction, start a new one.

    Returns a cursor on the new transaction's connection.  Cursors from
    before the commit must not be reused.
    """
    transaction_manager.commit()
    transaction_manager.begin()
    cur = cursor()

    # Disable slow sequential scans.  The database server is reluctant to
    # use indexes on tables that undergo large changes, such as the
    # deletion of large numbers of rows in this case.  Usually it's right,
    # but in this case it seems to slow things down dramatically and
    # unnecessarily.  We disable sequential scans on every commit, since
    # initZopeless by default resets our database connection with every
    # new transaction.
    # MultiTableCopy disables sequential scans for the first batch; this
    # just renews our setting after the connection is reset.
    postgresql.allow_sequential_scans(cur, False)

    return cur
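
# For context, a helper like postgresql.allow_sequential_scans plausibly
# wraps PostgreSQL's enable_seqscan planner parameter.  This is a hedged
# sketch, not the actual implementation of the postgresql module:
#
#     def allow_sequential_scans(cur, permission):
#         # enable_seqscan is a standard PostgreSQL planner setting.
#         # Turning it off makes sequential scans prohibitively expensive
#         # to the planner, without forbidding them outright.
#         cur.execute(
#             "SET enable_seqscan TO %s" % (permission and 'ON' or 'OFF'))
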
def pour(self, transaction_manager):
    """Pour data from holding tables back into source tables.

    Rows in the holding table that have their new_id set to null are
    skipped.

    The transaction manager is committed and re-opened after every batch
    run.

    Batch sizes are dynamically adjusted to meet the stated time goal.
    """
    if self.last_extracted_table is None:
        if not self.needsRecovery():
            raise AssertionError("Can't pour: no tables extracted")
    elif self.last_extracted_table != len(self.tables) - 1:
        raise AssertionError(
            "Not safe to pour: last table '%s' was not extracted"
            % self.tables[-1])

    cur = self._commit(transaction_manager)

    # Don't let postgres revert to slow sequential scans while we pour.
    # That might otherwise happen to the holding table as its vital "id"
    # index degrades with the removal of rows.
    postgresql.allow_sequential_scans(cur, False)

    # Main loop: for each of the source tables being copied, see if
    # there's a matching holding table.  If so, prepare it, pour it back
    # into the source table, and drop.
    for table in self.tables:
        holding_table_unquoted = self.getRawHoldingTableName(table)

        if not postgresql.have_table(cur, holding_table_unquoted):
            # We know we're in a suitable state for pouring.  If this
            # table does not exist, it must be because it's been poured
            # out completely and dropped in an earlier instance of this
            # loop, before the failure we're apparently recovering from.
            continue

        holding_table = self.getHoldingTableName(table)
        self.logger.info(
            "Pouring %s back into %s..." % (holding_table, table))

        tablestarttime = time.time()

        has_new_id = postgresql.table_has_column(
            cur, holding_table_unquoted, 'new_id')

        self._pourTable(
            holding_table, table, has_new_id, transaction_manager)

        # Drop holding table.  It may still contain rows with id set to
        # null.  Those must not be poured.
        postgresql.drop_tables(cursor(), holding_table)

        self.logger.debug(
            "Pouring %s took %.3f seconds."
            % (holding_table, time.time() - tablestarttime))

        cur = self._commit(transaction_manager)

    # In future, let the database perform sequential scans again if it
    # decides that's best.
    postgresql.allow_sequential_scans(cur, True)
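
# Minimal usage sketch of the extract/pour cycle.  The copier name, table
# list, and extract() call shape are assumptions based on how MultiTableCopy
# is described in the comments above, not verbatim API:
#
#     copier = MultiTableCopy('example', ['person', 'emailaddress'])
#     for table in copier.tables:
#         copier.extract(table)
#     # Holding tables now exist; pour them back in extraction order.
#     copier.pour(transaction_manager)
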