Example #1
def downgrade(migrate_engine):
    meta.bind = migrate_engine

    keys = Enum(name='key', metadata=meta, *ZONE_ATTRIBUTE_KEYS)
    types = Enum(name='types', metadata=meta, *ZONE_TYPES)

    domains_attributes_table = Table('domain_attributes', meta, autoload=True)
    domains_table = Table('domains', meta, autoload=True)

    domains = select(columns=[domains_table.c.id, domains_table.c.type])\
        .where(domains_table.c.type == 'SECONDARY')\
        .execute().fetchall()

    for dom in domains:
        delete = domains_table.delete()\
            .where(domains_table.c.id == dom.id)
        delete.execute()

    domains_table.c.type.drop()
    domains_table.c.transferred_at.drop()

    domains_attributes_table.drop()
    keys.drop()
    types.drop()

    dialect = migrate_engine.url.get_dialect().name
    if dialect.startswith('sqlite'):
        constraint = UniqueConstraint(
            'name', 'deleted', name='unique_domain_name', table=domains_table)

        # Add missing unique index
        constraint.create()
Example #2
def downgrade(migrate_engine):
    meta.bind = migrate_engine

    # Load the pool_attributes and pool_ns_records table schema
    pool_attributes_table = Table('pool_attributes', meta, autoload=True)
    pool_ns_records_table = Table('pool_ns_records', meta, autoload=True)

    # Find the nameservers for the default_pool_id
    pool_ns_records = select(
        columns=[
            pool_ns_records_table.c.id,
            pool_ns_records_table.c.created_at,
            pool_ns_records_table.c.updated_at,
            pool_ns_records_table.c.version,

            pool_ns_records_table.c.hostname,
        ]
    ).where(pool_ns_records_table.c.pool_id == default_pool_id)\
     .execute().fetchall()

    # Create matching entries in the new table.
    for pool_ns_record in pool_ns_records:
        pool_attributes_table.insert().execute(
            id=pool_ns_record.id,
            created_at=pool_ns_record.created_at,
            updated_at=pool_ns_record.updated_at,
            version=pool_ns_record.version,

            key='name_server',
            value=pool_ns_record.hostname,
        )

    # Delete the pool_ns_records table from the DB
    pool_ns_records_table.drop()
Example #3
def downgrade(migrate_engine):
    meta.bind = migrate_engine

    # Load the pool_attributes and pool_ns_records table schema
    pool_attributes_table = Table('pool_attributes', meta, autoload=True)
    pool_ns_records_table = Table('pool_ns_records', meta, autoload=True)

    # Find the nameservers for the default_pool_id
    pool_ns_records = select(
        columns=[
            pool_ns_records_table.c.id,
            pool_ns_records_table.c.created_at,
            pool_ns_records_table.c.updated_at,
            pool_ns_records_table.c.version,

            pool_ns_records_table.c.hostname,
        ]
    ).where(pool_ns_records_table.c.pool_id == default_pool_id)\
     .execute().fetchall()

    # Create matching entries in the new table.
    for pool_ns_record in pool_ns_records:
        pool_attributes_table.insert().execute(
            id=pool_ns_record.id,
            created_at=pool_ns_record.created_at,
            updated_at=pool_ns_record.updated_at,
            version=pool_ns_record.version,
            key='name_server',
            value=pool_ns_record.hostname,
        )

    # Delete the pool_ns_records table from the DB
    pool_ns_records_table.drop()
Example #4
def test_insert_table(engine_testaccount):
    metadata = MetaData()
    users = Table('users', metadata,
                  Column('id', Integer, Sequence('user_id_seq'),
                         primary_key=True),
                  Column('name', String),
                  Column('fullname', String),
                  )
    metadata.create_all(engine_testaccount)

    data = [{
        'id': 1,
        'name': 'testname1',
        'fullname': 'fulltestname1',
    }, {
        'id': 2,
        'name': 'testname2',
        'fullname': 'fulltestname2',
    }]
    conn = engine_testaccount.connect()
    try:
        # using multivalue insert
        conn.execute(users.insert(data))
        results = conn.execute(select([users]).order_by('id'))
        row = results.fetchone()
        assert row['name'] == 'testname1'

    finally:
        conn.close()
        users.drop(engine_testaccount)
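
The test above relies on legacy SQLAlchemy idioms such as Table.insert(values) with a list of dicts and select([users]). For comparison, a rough equivalent under SQLAlchemy 1.4+ could look like the sketch below; the in-memory SQLite engine and column lengths are illustrative assumptions, not part of the original test fixture.

# Sketch only: the same insert/select flow with SQLAlchemy 1.4+ style APIs.
from sqlalchemy import Column, Integer, MetaData, String, Table, create_engine, select

engine = create_engine("sqlite:///:memory:")  # assumption: any engine would do
metadata = MetaData()
users = Table(
    "users", metadata,
    Column("id", Integer, primary_key=True),
    Column("name", String(50)),
    Column("fullname", String(50)),
)
metadata.create_all(engine)

data = [
    {"id": 1, "name": "testname1", "fullname": "fulltestname1"},
    {"id": 2, "name": "testname2", "fullname": "fulltestname2"},
]
with engine.begin() as conn:
    conn.execute(users.insert(), data)  # executemany-style multirow insert
    rows = conn.execute(select(users).order_by(users.c.id)).fetchall()
    assert rows[0].name == "testname1"

users.drop(engine)
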
Example #5
def test_insert_table(engine_testaccount):
    metadata = MetaData()
    users = Table(
        'users',
        metadata,
        Column('id', Integer, Sequence('user_id_seq'), primary_key=True),
        Column('name', String),
        Column('fullname', String),
    )
    metadata.create_all(engine_testaccount)

    data = [{
        'id': 1,
        'name': 'testname1',
        'fullname': 'fulltestname1',
    }, {
        'id': 2,
        'name': 'testname2',
        'fullname': 'fulltestname2',
    }]
    conn = engine_testaccount.connect()
    try:
        # using multivalue insert
        conn.execute(users.insert(data))
        results = conn.execute(select([users]).order_by('id'))
        row = results.fetchone()
        assert row['name'] == 'testname1'

    finally:
        conn.close()
        users.drop(engine_testaccount)
Example #6
    def drop_table(self, schema_name, table_name):
        metadata = MetaData()
        self.logger.debug(f"Dropping table {schema_name}.{table_name}")

        table = Table(table_name, metadata, schema=schema_name)
        table.drop(self.target_db, checkfirst=True)

        self.logger.debug(f"Dropped table {schema_name}.{table_name}")
Example #7
def downgrade(migrate_engine):
    meta = MetaData()
    meta.bind = migrate_engine

    share_snapshots = Table('share_snapshots', meta, autoload=True)
    try:
        share_snapshots.drop()
    except Exception:
        LOG.error(_("share_snapshots table not dropped"))
        raise
Example #8
    def test_insert_values(self, engine, connection):
        table = Table('insert_test', MetaData(bind=engine),
                      Column('a', sqlalchemy.types.Integer))
        table.drop(checkfirst=True)
        table.create()
        connection.execute(table.insert([{'a': 1}, {'a': 2}]))

        result = table.select().execute().fetchall()
        expected = [(1, ), (2, )]
        self.assertEqual(result, expected)
Example #9
    def test_insert_select(self, engine, connection):
        one_row = Table('one_row', MetaData(bind=engine), autoload=True)
        table = Table('insert_test', MetaData(bind=engine),
                      Column('a', sqlalchemy.types.Integer))
        table.drop(checkfirst=True)
        table.create()
        connection.execute(table.insert().from_select(['a'], one_row.select()))

        result = table.select().execute().fetchall()
        expected = [(1, )]
        self.assertEqual(result, expected)
Example #10
    def test_insert_values(self, engine, connection):
        table = Table('insert_test', MetaData(bind=engine),
                      Column('a', sqlalchemy.types.Integer),
                      schema='pyhive_test_database')
        table.drop(checkfirst=True)
        table.create()
        connection.execute(table.insert([{'a': 1}, {'a': 2}]))

        result = table.select().execute().fetchall()
        expected = [(1,), (2,)]
        self.assertEqual(result, expected)
Example #11
def drop_table(engine, table_name):
    # Accept Connection objects here
    if hasattr(engine, 'engine'):
        engine = engine.engine
    if table_name in TABLES[engine]:
        table = TABLES[engine][table_name]
    elif engine.has_table(table_name):
        table = Table(table_name, engine._metadata)
    else:
        return
    table.drop(engine)
    TABLES[engine].pop(table_name, None)
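
The helper above depends on a module-level TABLES registry and the engine's private _metadata attribute. The same "drop only if the table exists" behaviour can be sketched with public APIs only (SQLAlchemy 1.4+ inspector assumed; the function name below is illustrative, not from the original module):

from sqlalchemy import MetaData, Table, inspect

def drop_table_if_exists(engine, table_name):
    # Accept Connection objects, as in the helper above
    if hasattr(engine, 'engine'):
        engine = engine.engine
    # Use the runtime inspector instead of a private table registry
    if not inspect(engine).has_table(table_name):
        return
    Table(table_name, MetaData()).drop(engine)
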
Example #12
    def test_insert_select(self, engine, connection):
        one_row = Table('one_row', MetaData(bind=engine), autoload=True)
        table = Table('insert_test', MetaData(bind=engine),
                      Column('a', sqlalchemy.types.Integer),
                      schema='pyhive_test_database')
        table.drop(checkfirst=True)
        table.create()
        connection.execute('SET mapred.job.tracker=local')
        # NOTE(jing) I'm stuck on a version of Hive without INSERT ... VALUES
        connection.execute(table.insert().from_select(['a'], one_row.select()))

        result = table.select().execute().fetchall()
        expected = [(1,)]
        self.assertEqual(result, expected)
Example #13
def drop_table(engine, table_name):
    # Accept Connection objects here
    if hasattr(engine, 'engine'):
        engine = engine.engine

    with lock:
        if table_name in engine._tables:
            table = engine._tables[table_name]
        elif engine.has_table(table_name):
            table = Table(table_name, engine._metadata)
        else:
            return
        table.drop(engine)
        engine._tables.pop(table_name, None)
Example #14
    def test_insert_select(self, engine, connection):
        one_row = Table('one_row', MetaData(bind=engine), autoload=True)
        table = Table('insert_test', MetaData(bind=engine),
                      Column('a', sqlalchemy.types.Integer),
                      schema='pyhive_test_database')
        table.drop(checkfirst=True)
        table.create()
        connection.execute('SET mapred.job.tracker=local')
        # NOTE(jing) I'm stuck on a version of Hive without INSERT ... VALUES
        connection.execute(table.insert().from_select(['a'], one_row.select()))

        result = table.select().execute().fetchall()
        expected = [(1,)]
        self.assertEqual(result, expected)
Example #15
def drop_table(engine, table_name):
    # Accept Connection objects here
    if hasattr(engine, "engine"):
        engine = engine.engine

    with lock:
        if table_name in engine._tables:
            table = engine._tables[table_name]
        elif engine.has_table(table_name):
            table = Table(table_name, engine._metadata)
        else:
            return
        table.drop(engine)
        engine._tables.pop(table_name, None)
Example #16
class TableHandler(object):

    """ Used by automatically generated objects such as datasets
    and dimensions to generate, write and clear the table under
    its management. """

    def _init_table(self, meta, namespace, name, id_type=Integer):
        """ Create the given table if it does not exist, otherwise
        reflect the current table schema from the database.
        """
        name = namespace + '__' + name
        self.table = Table(name, meta)
        if id_type is not None:
            col = Column('id', id_type, primary_key=True)
            self.table.append_column(col)

    def _generate_table(self):
        """ Create the given table if it does not exist. """
        # TODO: make this support some kind of migration?
        if not db.engine.has_table(self.table.name):
            self.table.create(db.engine)

    def _upsert(self, bind, data, unique_columns):
        """ Upsert a set of values into the table. This will
        query for the set of unique columns and either update an
        existing row or create a new one. In both cases, the ID
        of the changed row will be returned. """
        key = and_(*[self.table.c[c] == data.get(c)
                     for c in unique_columns])
        q = self.table.update(key, data)
        if bind.execute(q).rowcount == 0:
            q = self.table.insert(data)
            rs = bind.execute(q)
            return rs.inserted_primary_key[0]
        else:
            q = self.table.select(key)
            row = bind.execute(q).fetchone()
            return row['id']

    def _flush(self, bind):
        """ Delete all rows in the table. """
        q = self.table.delete()
        bind.execute(q)

    def _drop(self, bind):
        """ Drop the table and the local reference to it. """
        if db.engine.has_table(self.table.name):
            self.table.drop()
        del self.table
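
The _upsert method above captures a plain "update, else insert" pattern keyed on a set of unique columns. Stripped of the class, the same idea can be sketched as a standalone function; bind is assumed to be a SQLAlchemy Connection and the table is assumed to have an id primary key, mirroring the method above.

from sqlalchemy import and_

def upsert(bind, table, data, unique_columns):
    # Match on the unique columns; UPDATE if a row exists, otherwise INSERT.
    key = and_(*[table.c[c] == data.get(c) for c in unique_columns])
    result = bind.execute(table.update().where(key).values(**data))
    if result.rowcount == 0:
        return bind.execute(table.insert().values(**data)).inserted_primary_key[0]
    row = bind.execute(table.select().where(key)).fetchone()
    return row.id
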
Example #17
class TableHandler(object):
    """ Used by automatically generated objects such as datasets
    and dimensions to generate, write and clear the table under
    its management. """
    def _init_table(self, meta, namespace, name, id_type=Integer):
        """ Create the given table if it does not exist, otherwise
        reflect the current table schema from the database.
        """
        name = namespace + '__' + name
        self.table = Table(name, meta)
        if id_type is not None:
            col = Column('id', id_type, primary_key=True)
            self.table.append_column(col)

    def _generate_table(self):
        """ Create the given table if it does not exist. """
        # TODO: make this support some kind of migration?
        if not db.engine.has_table(self.table.name):
            self.table.create(db.engine)

    def _upsert(self, bind, data, unique_columns):
        """ Upsert a set of values into the table. This will
        query for the set of unique columns and either update an
        existing row or create a new one. In both cases, the ID
        of the changed row will be returned. """
        key = and_(*[self.table.c[c] == data.get(c) for c in unique_columns])
        q = self.table.update(key, data)
        if bind.execute(q).rowcount == 0:
            q = self.table.insert(data)
            rs = bind.execute(q)
            return rs.inserted_primary_key[0]
        else:
            q = self.table.select(key)
            row = bind.execute(q).fetchone()
            return row['id']

    def _flush(self, bind):
        """ Delete all rows in the table. """
        q = self.table.delete()
        bind.execute(q)

    def _drop(self, bind):
        """ Drop the table and the local reference to it. """
        if db.engine.has_table(self.table.name):
            self.table.drop()
        del self.table
Example #18
 def test_lots_of_types(self, engine, connection):
     # take type list from sqlalchemy.types
     types = [
         'INT', 'CHAR', 'VARCHAR', 'NCHAR', 'TEXT', 'Text', 'FLOAT',
         'NUMERIC', 'DECIMAL', 'TIMESTAMP', 'DATETIME', 'CLOB', 'BLOB',
         'BOOLEAN', 'SMALLINT', 'DATE', 'TIME',
         'String', 'Integer', 'SmallInteger',
         'Numeric', 'Float', 'DateTime', 'Date', 'Time', 'LargeBinary',
         'Boolean', 'Unicode', 'UnicodeText',
     ]
     cols = []
     for i, t in enumerate(types):
         cols.append(Column(str(i), getattr(sqlalchemy.types, t)))
     table = Table('test_table', MetaData(bind=engine), *cols)
     table.drop(checkfirst=True)
     table.create()
     table.drop()
Example #19
 def test_lots_of_types(self, engine, connection):
     # Presto doesn't have raw CREATE TABLE support, so we only test hive
     # take type list from sqlalchemy.types
     types = [
         'INT', 'CHAR', 'VARCHAR', 'NCHAR', 'TEXT', 'Text', 'FLOAT',
         'NUMERIC', 'DECIMAL', 'TIMESTAMP', 'DATETIME', 'CLOB', 'BLOB',
         'BOOLEAN', 'SMALLINT', 'DATE', 'TIME',
         'String', 'Integer', 'SmallInteger',
         'Numeric', 'Float', 'DateTime', 'Date', 'Time', 'Binary',
         'Boolean', 'Unicode', 'UnicodeText',
     ]
     cols = []
     for i, t in enumerate(types):
         cols.append(Column(str(i), getattr(sqlalchemy.types, t)))
     cols.append(Column('hive_date', HiveDate))
     cols.append(Column('hive_decimal', HiveDecimal))
     cols.append(Column('hive_timestamp', HiveTimestamp))
     table = Table('test_table', MetaData(bind=engine), *cols, schema='pyhive_test_database')
     table.drop(checkfirst=True)
     table.create()
     connection.execute('SET mapred.job.tracker=local')
     connection.execute('USE pyhive_test_database')
     big_number = 10 ** 10 - 1
     connection.execute("""
     INSERT OVERWRITE TABLE test_table
     SELECT
         1, "a", "a", "a", "a", "a", 0.1,
         0.1, 0.1, 0, 0, "a", "a",
         false, 1, 0, 0,
         "a", 1, 1,
         0.1, 0.1, 0, 0, 0, "a",
         false, "a", "a",
         0, %d, 123 + 2000
     FROM default.one_row
     """, big_number)
     row = connection.execute(table.select()).fetchone()
     self.assertEqual(row.hive_date, datetime.date(1970, 1, 1))
     self.assertEqual(row.hive_decimal, decimal.Decimal(big_number))
     self.assertEqual(row.hive_timestamp, datetime.datetime(1970, 1, 1, 0, 0, 2, 123))
     table.drop()
Example #20
 def test_lots_of_types(self, engine, connection):
     # Presto doesn't have raw CREATE TABLE support, so we only test hive
     # take type list from sqlalchemy.types
     types = [
         'INT', 'CHAR', 'VARCHAR', 'NCHAR', 'TEXT', 'Text', 'FLOAT',
         'NUMERIC', 'DECIMAL', 'TIMESTAMP', 'DATETIME', 'CLOB', 'BLOB',
         'BOOLEAN', 'SMALLINT', 'DATE', 'TIME',
         'String', 'Integer', 'SmallInteger',
         'Numeric', 'Float', 'DateTime', 'Date', 'Time', 'Binary',
         'Boolean', 'Unicode', 'UnicodeText',
     ]
     cols = []
     for i, t in enumerate(types):
         cols.append(Column(str(i), getattr(sqlalchemy.types, t)))
     cols.append(Column('hive_date', HiveDate))
     cols.append(Column('hive_decimal', HiveDecimal))
     cols.append(Column('hive_timestamp', HiveTimestamp))
     table = Table('test_table', MetaData(bind=engine), *cols, schema='pyhive_test_database')
     table.drop(checkfirst=True)
     table.create()
     connection.execute('SET mapred.job.tracker=local')
     connection.execute('USE pyhive_test_database')
     big_number = 10 ** 10 - 1
     connection.execute("""
     INSERT OVERWRITE TABLE test_table
     SELECT
         1, "a", "a", "a", "a", "a", 0.1,
         0.1, 0.1, 0, 0, "a", "a",
         false, 1, 0, 0,
         "a", 1, 1,
         0.1, 0.1, 0, 0, 0, "a",
         false, "a", "a",
         0, %d, 123 + 2000
     FROM default.one_row
     """, big_number)
     row = connection.execute(table.select()).fetchone()
     self.assertEqual(row.hive_date, datetime.date(1970, 1, 1))
     self.assertEqual(row.hive_decimal, decimal.Decimal(big_number))
     self.assertEqual(row.hive_timestamp, datetime.datetime(1970, 1, 1, 0, 0, 2, 123))
     table.drop()
Example #21
    def create_table(self, schema_name, table_name, columns_configuration,
                     drop_first):
        metadata = MetaData()

        table = Table(table_name, metadata, schema=schema_name)

        for column_configuration in columns_configuration:
            table.append_column(
                self.create_column(column_configuration["destination"]))

        table.append_column(
            Column(
                Providers.AuditColumnsNames.TIMESTAMP,
                DateTime(timezone=True),
                server_default=func.now(),
            ))

        table.append_column(
            Column(
                Providers.AuditColumnsNames.IS_DELETED,
                Boolean,
                server_default="f",
                default=False,
            ))

        table.append_column(
            Column(Providers.AuditColumnsNames.CHANGE_VERSION, BigInteger))

        if drop_first:
            self.logger.debug(f"Dropping table {schema_name}.{table_name}")
            table.drop(self.target_db, checkfirst=True)
            self.logger.debug(f"Dropped table {schema_name}.{table_name}")

        self.logger.debug(f"Creating table {schema_name}.{table_name}")
        table.create(self.target_db, checkfirst=False)
        self.logger.debug(f"Created table {schema_name}.{table_name}")

        return
Example #22
def upgrade(migrate_engine):
    meta.bind = migrate_engine

    # Load the database tables
    servers_table = Table('servers', meta, autoload=True)
    servers_table.drop()
Example #23
def downgrade(migrate_engine):
    meta.bind = migrate_engine

    # Find the table and drop it
    zone_tasks_table = Table("zone_tasks", meta, autoload=True)
    zone_tasks_table.drop()
Example #24
def downgrade(migrate_engine):
    meta.bind = migrate_engine

    # Find the table and drop it
    zone_tasks_table = Table('zone_tasks', meta, autoload=True)
    zone_tasks_table.drop()
Example #25
from sqlalchemy import create_engine
from sqlalchemy.schema import MetaData, Table, Column
from sqlalchemy.types import Unicode

engine = create_engine('postgresql://python@localhost/')

loader = PostgresLoader(engine)

testmodule = loader.load_module('testmodule')

metadata = MetaData(bind=engine)

table = Table('testtable', metadata,
        Column('test', Unicode),
        Column('test2', Unicode))

table.drop(checkfirst=True)
table.create(checkfirst=True)

for i in range(20):
    table.insert({'test': 'test%d' % i, 'test2': 'test%d' % i}).execute()

print(engine.execute(testmodule.pyconcat(table.c.test, table.c.test2)).fetchall())

statement = """
CREATE TRIGGER mytrigger
BEFORE INSERT
ON %s
FOR EACH ROW EXECUTE PROCEDURE %s();
"""

engine.execute(statement % (table.name, testmodule.nullifying_trigger.__name__))
Example #26
def wipe_opt_out_patients(report_every: int = 1000,
                          chunksize: int = 10000) -> None:
    """
    Delete any data from patients that have opted out (after their data was
    processed on a previous occasion).
    (Slightly complicated by the fact that the destination database can't
    necessarily 'see' the mapping database, so we need to cache the RID keys in
    the destination database temporarily.)
    """
    start = "wipe_opt_out_patients"
    log.info(start)

    adminsession = config.admindb.session
    metadata = MetaData()  # operate in isolation!
    destengine = config.destdb.engine
    destsession = config.destdb.session
    ridfield = config.research_id_fieldname

    # Drop/create temporary table
    pkfield = 'rid'
    temptable = Table(
        config.temporary_tablename, metadata,
        Column(pkfield, config.SqlTypeEncryptedPid, primary_key=True),
        **TABLE_KWARGS)
    log.debug(start + ": 1. dropping temporary table")
    temptable.drop(destengine, checkfirst=True)  # use engine, not session
    log.debug(start + ": 2. making temporary table")
    temptable.create(destengine, checkfirst=True)  # use engine, not session

    log.debug(start + ": 3. populating temporary table with RIDs")

    def insert(records_):
        # records_: a list of dictionaries
        # http://docs.sqlalchemy.org/en/latest/core/tutorial.html
        log.debug(start + "... inserting {} records".format(len(records_)))
        destsession.execute(temptable.insert(), records_)

    i = 0
    records = []  # type: List[Dict[str, Any]]
    for rid in gen_optout_rids():
        i += 1
        if report_every and i % report_every == 0:
            log.debug(start + "... src row# {}".format(i))
        records.append({pkfield: rid})  # a row is a dict of values
        if i % chunksize == 0:
            insert(records)
            records = []  # type: List[Dict[str, Any]]
    if records:  # remainder
        insert(records)
    commit_destdb()

    log.debug(start + ": 4. creating index on temporary table")
    index = Index('_temptable_idx', temptable.columns[pkfield])
    index.create(destengine)  # use engine, not session

    # 5. For each patient destination table,
    #    DELETE FROM desttable WHERE rid IN (SELECT rid FROM temptable)
    log.debug(start + ": 5. deleting from destination table by opt-out RID")
    for dest_table_name in config.dd.get_dest_tables_with_patient_info():
        log.debug(start + ": ... {}".format(dest_table_name))
        dest_table = config.dd.get_dest_sqla_table(dest_table_name)
        query = dest_table.delete().where(
            column(ridfield).in_(select([temptable.columns[pkfield]])))
        destengine.execute(query)
        commit_destdb()

    log.debug(start + ": 6. dropping temporary table")
    temptable.drop(destengine, checkfirst=True)  # use engine, not session
    commit_destdb()

    log.debug(start + ": 7. deleting opt-out patients from mapping table")
    adminsession.query(PatientInfo).filter(
        or_(PatientInfo.pid.in_(adminsession.query(OptOutPid.pid)),
            PatientInfo.mpid.in_(adminsession.query(
                OptOutMpid.mpid)))).delete(synchronize_session=False)
    commit_admindb()
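
The core of the routine above is a two-step pattern: cache the opt-out RIDs in a temporary table on the destination engine, then issue DELETE ... WHERE rid IN (SELECT rid FROM temptable) against each destination table. Reduced to a self-contained sketch (engine URL, table and column names are illustrative assumptions, not the original project's schema):

from sqlalchemy import Column, MetaData, String, Table, create_engine, select

engine = create_engine("sqlite:///:memory:")  # stand-in for config.destdb.engine
metadata = MetaData()
dest_notes = Table("dest_notes", metadata,
                   Column("rid", String(64)), Column("note", String(200)))
temptable = Table("_opt_out_rids", metadata,
                  Column("rid", String(64), primary_key=True))
metadata.create_all(engine)

opt_out_rids = ["RID-1", "RID-2"]  # stand-in for gen_optout_rids()
with engine.begin() as conn:
    conn.execute(temptable.insert(), [{"rid": r} for r in opt_out_rids])
    # DELETE FROM dest_notes WHERE rid IN (SELECT rid FROM _opt_out_rids)
    conn.execute(dest_notes.delete().where(
        dest_notes.c.rid.in_(select(temptable.c.rid))))

temptable.drop(engine)
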
Example #27
def delete_dest_rows_with_no_src_row(
        srcdbname: str,
        src_table: str,
        report_every: int = DEFAULT_REPORT_EVERY,
        chunksize: int = DEFAULT_CHUNKSIZE) -> None:
    """
    For a given source database/table, delete any rows in the corresponding
    destination table where there is no corresponding source row.

    - Can't do this in a single SQL command, since the engine can't
      necessarily see both databases.
    - Can't do this in a multiprocess way, because we're trying to do a
      DELETE WHERE NOT IN.
    - However, we can get stupidly long query lists if we try to SELECT all
      the values and use a DELETE FROM x WHERE y NOT IN (v1, v2, v3, ...)
      query. This crashes the MySQL connection, etc.
    - Therefore, we need a temporary table in the destination.
    """
    if not config.dd.has_active_destination(srcdbname, src_table):
        return
    dest_table_name = config.dd.get_dest_table_for_src_db_table(
        srcdbname, src_table)
    start = "delete_dest_rows_with_no_src_row: {}.{} -> {}.{}: ".format(
        srcdbname, src_table, config.destdb.name, dest_table_name)
    log.info(start + "[WARNING: MAY BE SLOW]")

    metadata = MetaData()  # operate in isolation!
    destengine = config.destdb.engine
    destsession = config.destdb.session
    dest_table = config.dd.get_dest_sqla_table(dest_table_name)
    pkddr = config.dd.get_pk_ddr(srcdbname, src_table)

    # If there's no source PK, we just delete everything
    if not pkddr:
        log.info("... No source PK; deleting everything")
        destsession.execute(dest_table.delete())
        commit_destdb()
        return

    if pkddr.addition_only:
        log.info("... Table marked as addition-only; not deleting anything")
        return

    # Drop/create temporary table
    pkfield = 'srcpk'
    temptable = Table(
        config.temporary_tablename, metadata,
        Column(pkfield, pkddr.get_dest_sqla_coltype(), primary_key=True),
        **TABLE_KWARGS)
    # THIS (ABOVE) IS WHAT CONSTRAINS A USER-DEFINED PK TO BE UNIQUE WITHIN ITS
    # TABLE.
    log.debug("... dropping temporary table")
    temptable.drop(destengine, checkfirst=True)
    log.debug("... making temporary table")
    temptable.create(destengine, checkfirst=True)

    # Populate temporary table, +/- PK translation
    n = count_star(config.sources[srcdbname].session, src_table)
    log.debug("... populating temporary table: {} records to go".format(n))

    def insert(records_):
        log.debug(start + "... inserting {} records".format(len(records_)))
        destsession.execute(temptable.insert(), records_)

    i = 0
    records = []  # type: List[Dict[str, Any]]
    for pk in gen_pks(srcdbname, src_table, pkddr.src_field):
        i += 1
        if report_every and i % report_every == 0:
            log.debug(start + "... src row# {} / {}".format(i, n))
        if pkddr.primary_pid:
            pk = config.encrypt_primary_pid(pk)
        elif pkddr.master_pid:
            pk = config.encrypt_master_pid(pk)
        records.append({pkfield: pk})
        if i % chunksize == 0:
            insert(records)
            records = []  # type: List[Dict[str, Any]]
    if records:  # remainder
        insert(records)
    commit_destdb()

    # 4. Index -- no, hang on, it's a primary key already
    #
    # log.debug("... creating index on temporary table")
    # index = Index('_temptable_idx', temptable.columns[pkfield])
    # index.create(destengine)

    # 5. DELETE FROM desttable
    #    WHERE destpk NOT IN (SELECT srcpk FROM temptable)
    log.debug("... deleting from destination where appropriate")
    query = dest_table.delete().where(
        ~column(pkddr.dest_field).in_(select([temptable.columns[pkfield]])))
    destengine.execute(query)
    commit_destdb()

    # 6. Drop temporary table
    log.debug("... dropping temporary table")
    temptable.drop(destengine, checkfirst=True)

    # 7. Commit
    commit_destdb()
Example #28
def upgrade(migrate_engine):
    meta.bind = migrate_engine

    # Load the database tables
    servers_table = Table('servers', meta, autoload=True)
    servers_table.drop()
Example #29
class Table(object):
    def __init__(self, db, schema, table, columns=None):
        self.db = db
        self.schema = schema
        self.name = table
        self.engine = create_engine(db.url)
        self.metadata = MetaData(schema=schema)
        self.metadata.bind = self.engine
        # http://docs.sqlalchemy.org/en/rel_1_0/core/metadata.html
        # if provided columns (SQLAlchemy columns), create the table
        if table:
            if columns:
                self.table = SQLATable(
                    table, self.metadata, schema=self.schema, *columns
                )
                self.table.create()
            # otherwise just load from db
            else:
                self.table = SQLATable(
                    table, self.metadata, schema=self.schema, autoload=True
                )
            self.indexes = dict((i.name, i) for i in self.table.indexes)
            self._is_dropped = False
        else:
            self._is_dropped = True
            self.table = None

    @property
    def _normalized_columns(self):
        return list(map(normalize_column_name, self.columns))

    @property
    def columns(self):
        """Return list of all columns in table
        """
        return list(self.table.columns.keys())

    @property
    def sqla_columns(self):
        """Return all columns in table as sqlalchemy column types
        """
        return self.table.columns

    @property
    def column_types(self):
        """Return a dict mapping column name to type for all columns in table
        """
        column_types = {}
        for c in self.sqla_columns:
            column_types[c.name] = c.type
        return column_types

    @property
    def primary_key(self):
        """Return a list of columns making up the primary key constraint
        """
        return [c.name for c in self.table.primary_key]

    @property
    def op(self):
        ctx = MigrationContext.configure(self.engine.connect())
        return Operations(ctx)

    def _valid_table_name(self, table_name):
        """Check if the table name is obviously invalid.
        """
        if table_name is None or not len(table_name.strip()):
            raise ValueError("Invalid table name: %r" % table_name)
        return table_name.strip()

    def _update_table(self, table_name):
        self.metadata = MetaData(schema=self.schema)
        self.metadata.bind = self.engine
        return SQLATable(table_name, self.metadata, schema=self.schema)

    def add_primary_key(self, column="id"):
        """Add primary key constraint to specified column
        """
        if not self.primary_key:
            sql = """ALTER TABLE {s}.{t}
                     ADD PRIMARY KEY ({c})
                  """.format(
                s=self.schema, t=self.name, c=column
            )
            self.db.execute(sql)

    def drop(self):
        """Drop the table from the database
        """
        if self._is_dropped is False:
            self.table.drop(self.engine)
        self._is_dropped = True

    def _check_dropped(self):
        if self._is_dropped:
            raise DatasetException(
                "the table has been dropped. this object should not be used again."
            )

    def _args_to_clause(self, args):
        clauses = []
        for k, v in args.items():
            if isinstance(v, (list, tuple)):
                clauses.append(self.table.c[k].in_(v))
            else:
                clauses.append(self.table.c[k] == v)
        return and_(*clauses)

    def create_column(self, name, type):
        """
        Explicitly create a new column ``name`` of a specified type.
        ``type`` must be a `SQLAlchemy column type <http://docs.sqlalchemy.org/en/rel_0_8/core/types.html>`_.
        ::

            table.create_column('created_at', sqlalchemy.DateTime)
        """
        self._check_dropped()
        if normalize_column_name(name) not in self._normalized_columns:
            self.op.add_column(self.table.name, Column(name, type), self.table.schema)
            self.table = self._update_table(self.table.name)

    def drop_column(self, name):
        """
        Drop the column ``name``
        ::

            table.drop_column('created_at')
        """
        self._check_dropped()
        if name in list(self.table.columns.keys()):
            self.op.drop_column(self.table.name, name, schema=self.schema)
            self.table = self._update_table(self.table.name)

    def create_index(self, columns, name=None, index_type="btree"):
        """
        Create an index to speed up queries on a table.
        If no ``name`` is given a random name is created.
        ::
            table.create_index(['name', 'country'])
        """
        self._check_dropped()
        if not name:
            sig = "||".join(columns + [index_type])
            # This is a work-around for a bug in <=0.6.1 which would create
            # indexes based on hash() rather than a proper hash.
            key = abs(hash(sig))
            name = "ix_%s_%s" % (self.table.name, key)
            if name in self.indexes:
                return self.indexes[name]
            key = sha1(sig.encode("utf-8")).hexdigest()[:16]
            name = "ix_%s_%s" % (self.table.name, key)
        if name in self.indexes:
            return self.indexes[name]
        # self.db._acquire()
        columns = [self.table.c[col] for col in columns]
        idx = Index(name, *columns, postgresql_using=index_type)
        idx.create(self.engine)
        # finally:
        #    self.db._release()
        self.indexes[name] = idx
        return idx

    def create_index_geom(self, column="geom"):
        """Shortcut to create index on geometry
        """
        self.create_index([column], index_type="gist")

    def distinct(self, *columns, **_filter):
        """
        Returns all rows of a table, but removes rows with duplicate values in ``columns``.
        Internally this creates a `DISTINCT statement <http://www.w3schools.com/sql/sql_distinct.asp>`_.
        ::

            # returns only one row per year, ignoring the rest
            table.distinct('year')
            # works with multiple columns, too
            table.distinct('year', 'country')
            # you can also combine this with a filter
            table.distinct('year', country='China')
        """
        self._check_dropped()
        qargs = []
        try:
            columns = [self.table.c[c] for c in columns]
            for col, val in _filter.items():
                qargs.append(self.table.c[col] == val)
        except KeyError:
            return []

        q = expression.select(
            columns,
            distinct=True,
            whereclause=and_(*qargs),
            order_by=[c.asc() for c in columns],
        )
        # if just looking at one column, return a simple list
        if len(columns) == 1:
            return itertools.chain.from_iterable(self.engine.execute(q))
        # otherwise return specified row_type
        else:
            return ResultIter(self.engine.execute(q), row_type=self.db.row_type)

    def insert(self, row):
        """
        Add a row (type: dict) by inserting it into the table.
        Columns must exist.
        ::
            data = dict(title='I am a banana!')
            table.insert(data)
        Returns the inserted row's primary key.
        """
        self._check_dropped()
        res = self.engine.execute(self.table.insert(row))
        if len(res.inserted_primary_key) > 0:
            return res.inserted_primary_key[0]

    def insert_many(self, rows, chunk_size=1000):
        """
        Add many rows at a time, which is significantly faster than adding
        them one by one. By default the rows are processed in chunks of
        1000 per commit, unless you specify a different ``chunk_size``.
        See :py:meth:`insert() <dataset.Table.insert>` for details on
        the other parameters.
        ::
            rows = [dict(name='Dolly')] * 10000
            table.insert_many(rows)
        """

        def _process_chunk(chunk):
            self.table.insert().execute(chunk)

        self._check_dropped()

        chunk = []
        for i, row in enumerate(rows, start=1):
            chunk.append(row)
            if i % chunk_size == 0:
                _process_chunk(chunk)
                chunk = []
        if chunk:
            _process_chunk(chunk)

    def rename(self, name):
        """Rename the table
        """
        sql = """ALTER TABLE {s}.{t} RENAME TO {name}
              """.format(
            s=self.schema, t=self.name, name=name
        )
        self.engine.execute(sql)
        self.table = SQLATable(name, self.metadata, schema=self.schema, autoload=True)

    def find_one(self, **kwargs):
        """
        Works just like :py:meth:`find() <dataset.Table.find>` but returns one result, or None.
        ::
            row = table.find_one(country='United States')
        """
        kwargs["_limit"] = 1
        iterator = self.find(**kwargs)
        try:
            return next(iterator)
        except StopIteration:
            return None

    def _args_to_order_by(self, order_by):
        if order_by[0] == "-":
            return self.table.c[order_by[1:]].desc()
        else:
            return self.table.c[order_by].asc()

    def find(
        self,
        _limit=None,
        _offset=0,
        _step=5000,
        order_by="id",
        return_count=False,
        **_filter
    ):
        """
        Performs a simple search on the table. Simply pass keyword arguments as ``filter``.
        ::
            results = table.find(country='France')
            results = table.find(country='France', year=1980)
        Using ``_limit``::
            # just return the first 10 rows
            results = table.find(country='France', _limit=10)
        You can sort the results by single or multiple columns. Append a minus sign
        to the column name for descending order::
            # sort results by a column 'year'
            results = table.find(country='France', order_by='year')
            # return all rows sorted by multiple columns (by year in descending order)
            results = table.find(order_by=['country', '-year'])
        By default :py:meth:`find() <dataset.Table.find>` will break the
        query into chunks of ``_step`` rows to prevent huge tables
        from being loaded into memory at once.
        For more complex queries, please use :py:meth:`db.query()`
        instead."""
        self._check_dropped()
        if not isinstance(order_by, (list, tuple)):
            order_by = [order_by]
        order_by = [
            o
            for o in order_by
            if (o.startswith("-") and o[1:] or o) in self.table.columns
        ]
        order_by = [self._args_to_order_by(o) for o in order_by]

        args = self._args_to_clause(_filter)

        # query total number of rows first
        count_query = alias(
            self.table.select(whereclause=args, limit=_limit, offset=_offset),
            name="count_query_alias",
        ).count()
        rp = self.engine.execute(count_query)
        total_row_count = rp.fetchone()[0]
        if return_count:
            return total_row_count

        if _limit is None:
            _limit = total_row_count

        if _step is None or _step is False or _step == 0:
            _step = total_row_count

        if total_row_count > _step and not order_by:
            _step = total_row_count
            log.warn(
                "query cannot be broken into smaller sections because it is unordered"
            )

        queries = []

        for i in count():
            qoffset = _offset + (_step * i)
            qlimit = min(_limit - (_step * i), _step)
            if qlimit <= 0:
                break
            queries.append(
                self.table.select(
                    whereclause=args, limit=qlimit, offset=qoffset, order_by=order_by
                )
            )
        return ResultIter(
            (self.engine.execute(q) for q in queries), row_type=self.db.row_type
        )

    def count(self, **_filter):
        """
        Return the count of results for the given filter set
        (same filter options as with ``find()``).
        """
        return self.find(return_count=True, **_filter)

    def __getitem__(self, item):
        """
        This is an alias for distinct which allows the table to be queried using
        square bracket syntax.
        ::
            # Same as distinct:
            print list(table['year'])
        """
        if not isinstance(item, tuple):
            item = (item,)
        return self.distinct(*item)

    def all(self):
        """
        Returns all rows of the table as simple dictionaries. This is simply a shortcut
        to *find()* called with no arguments.
        ::
            rows = table.all()"""
        return self.find()

    def __iter__(self):
        """
        Allows for iterating over all rows in the table without explicitly
        calling :py:meth:`all() <dataset.Table.all>`.
        ::
            for row in table:
                print(row)
        """
        return self.all()

    def __repr__(self):
        return "<Table(%s)>" % self.table.name
Example #30
def drop_table(eng, tbl_name):
    table = Table(tbl_name, MetaData(), autoload_with=eng)
    if table.exists(eng):
        table.drop(eng)
        return True
    return False
Example #31
 def delete_table(self, table_name):
     table = Table(table_name, self.meta)
     table.drop()
Example #32
class Table(object):
    def __init__(self, db, schema, table, columns=None):
        self.db = db
        self.schema = schema
        self.name = table
        self.engine = create_engine(db.url)
        self.metadata = MetaData(schema=schema)
        self.metadata.bind = self.engine
        # http://docs.sqlalchemy.org/en/rel_1_0/core/metadata.html
        # if provided columns (SQLAlchemy columns), create the table
        if table:
            if columns:
                self.table = SQLATable(table,
                                       self.metadata,
                                       schema=self.schema,
                                       *columns)
                self.table.create()
            # otherwise just load from db
            else:
                self.table = SQLATable(table,
                                       self.metadata,
                                       schema=self.schema,
                                       autoload=True)
            self.indexes = dict((i.name, i) for i in self.table.indexes)
            self._is_dropped = False
        else:
            self._is_dropped = True
            self.table = None

    @property
    def _normalized_columns(self):
        return list(map(normalize_column_name, self.columns))

    @property
    def columns(self):
        """Return list of all columns in table
        """
        return list(self.table.columns.keys())

    @property
    def sqla_columns(self):
        """Return all columns in table as sqlalchemy column types
        """
        return self.table.columns

    @property
    def column_types(self):
        """Return a dict mapping column name to type for all columns in table
        """
        column_types = {}
        for c in self.sqla_columns:
            column_types[c.name] = c.type
        return column_types

    @property
    def primary_key(self):
        """Return a list of columns making up the primary key constraint
        """
        return [c.name for c in self.table.primary_key]

    @property
    def op(self):
        ctx = MigrationContext.configure(self.engine.connect())
        return Operations(ctx)

    def _valid_table_name(self, table_name):
        """Check if the table name is obviously invalid.
        """
        if table_name is None or not len(table_name.strip()):
            raise ValueError("Invalid table name: %r" % table_name)
        return table_name.strip()

    def _update_table(self, table_name):
        self.metadata = MetaData(schema=self.schema)
        self.metadata.bind = self.engine
        return SQLATable(table_name, self.metadata, schema=self.schema)

    def add_primary_key(self, column="id"):
        """Add primary key constraint to specified column
        """
        if not self.primary_key:
            sql = """ALTER TABLE {s}.{t}
                     ADD PRIMARY KEY ({c})
                  """.format(s=self.schema, t=self.name, c=column)
            self.db.execute(sql)

    def drop(self):
        """Drop the table from the database
        """
        if self._is_dropped is False:
            self.table.drop(self.engine)
        self._is_dropped = True

    def _check_dropped(self):
        if self._is_dropped:
            raise DatasetException(
                "the table has been dropped. this object should not be used again."
            )

    def _args_to_clause(self, args):
        clauses = []
        for k, v in args.items():
            if isinstance(v, (list, tuple)):
                clauses.append(self.table.c[k].in_(v))
            else:
                clauses.append(self.table.c[k] == v)
        return and_(*clauses)

    def create_column(self, name, type):
        """
        Explicitly create a new column ``name`` of a specified type.
        ``type`` must be a `SQLAlchemy column type <http://docs.sqlalchemy.org/en/rel_0_8/core/types.html>`_.
        ::

            table.create_column('created_at', sqlalchemy.DateTime)
        """
        self._check_dropped()
        if normalize_column_name(name) not in self._normalized_columns:
            self.op.add_column(self.table.name, Column(name, type),
                               self.table.schema)
            self.table = self._update_table(self.table.name)

    def drop_column(self, name):
        """
        Drop the column ``name``
        ::

            table.drop_column('created_at')
        """
        self._check_dropped()
        if name in list(self.table.columns.keys()):
            self.op.drop_column(self.table.name, name, schema=self.schema)
            self.table = self._update_table(self.table.name)

    def create_index(self, columns, name=None, index_type="btree"):
        """
        Create an index to speed up queries on a table.
        If no ``name`` is given a random name is created.
        ::
            table.create_index(['name', 'country'])
        """
        self._check_dropped()
        if not name:
            sig = "||".join(columns + [index_type])
            # This is a work-around for a bug in <=0.6.1 which would create
            # indexes based on hash() rather than a proper hash.
            key = abs(hash(sig))
            name = "ix_%s_%s" % (self.table.name, key)
            if name in self.indexes:
                return self.indexes[name]
            key = sha1(sig.encode("utf-8")).hexdigest()[:16]
            name = "ix_%s_%s" % (self.table.name, key)
        if name in self.indexes:
            return self.indexes[name]
        # self.db._acquire()
        columns = [self.table.c[col] for col in columns]
        idx = Index(name, *columns, postgresql_using=index_type)
        idx.create(self.engine)
        # finally:
        #    self.db._release()
        self.indexes[name] = idx
        return idx

    def create_index_geom(self, column="geom"):
        """Shortcut to create index on geometry
        """
        self.create_index([column], index_type="gist")

    def distinct(self, *columns, **_filter):
        """
        Returns all rows of a table, but removes rows with duplicate values in ``columns``.
        Internally this creates a `DISTINCT statement <http://www.w3schools.com/sql/sql_distinct.asp>`_.
        ::

            # returns only one row per year, ignoring the rest
            table.distinct('year')
            # works with multiple columns, too
            table.distinct('year', 'country')
            # you can also combine this with a filter
            table.distinct('year', country='China')
        """
        self._check_dropped()
        qargs = []
        try:
            columns = [self.table.c[c] for c in columns]
            for col, val in _filter.items():
                qargs.append(self.table.c[col] == val)
        except KeyError:
            return []

        q = expression.select(
            columns,
            distinct=True,
            whereclause=and_(*qargs),
            order_by=[c.asc() for c in columns],
        )
        # if just looking at one column, return a simple list
        if len(columns) == 1:
            return itertools.chain.from_iterable(self.engine.execute(q))
        # otherwise return specified row_type
        else:
            return ResultIter(self.engine.execute(q),
                              row_type=self.db.row_type)

    def insert(self, row):
        """
        Add a row (type: dict) by inserting it into the table.
        Columns must exist.
        ::
            data = dict(title='I am a banana!')
            table.insert(data)
        Returns the inserted row's primary key.
        """
        self._check_dropped()
        res = self.engine.execute(self.table.insert(row))
        if len(res.inserted_primary_key) > 0:
            return res.inserted_primary_key[0]

    def insert_many(self, rows, chunk_size=1000):
        """
        Add many rows at a time, which is significantly faster than adding
        them one by one. By default the rows are processed in chunks of
        1000 per commit, unless you specify a different ``chunk_size``.
        See :py:meth:`insert() <dataset.Table.insert>` for details on
        the other parameters.
        ::
            rows = [dict(name='Dolly')] * 10000
            table.insert_many(rows)
        """
        def _process_chunk(chunk):
            self.table.insert().execute(chunk)

        self._check_dropped()

        chunk = []
        for i, row in enumerate(rows, start=1):
            chunk.append(row)
            if i % chunk_size == 0:
                _process_chunk(chunk)
                chunk = []
        if chunk:
            _process_chunk(chunk)

    def rename(self, name):
        """Rename the table
        """
        sql = """ALTER TABLE {s}.{t} RENAME TO {name}
              """.format(s=self.schema, t=self.name, name=name)
        self.engine.execute(sql)
        self.table = SQLATable(name,
                               self.metadata,
                               schema=self.schema,
                               autoload=True)

    def find_one(self, **kwargs):
        """
        Works just like :py:meth:`find() <dataset.Table.find>` but returns one result, or None.
        ::
            row = table.find_one(country='United States')
        """
        kwargs["_limit"] = 1
        iterator = self.find(**kwargs)
        try:
            return next(iterator)
        except StopIteration:
            return None

    def _args_to_order_by(self, order_by):
        if order_by[0] == "-":
            return self.table.c[order_by[1:]].desc()
        else:
            return self.table.c[order_by].asc()

    def find(self,
             _limit=None,
             _offset=0,
             _step=5000,
             order_by="id",
             return_count=False,
             **_filter):
        """
        Performs a simple search on the table. Simply pass keyword arguments as ``filter``.
        ::
            results = table.find(country='France')
            results = table.find(country='France', year=1980)
        Using ``_limit``::
            # just return the first 10 rows
            results = table.find(country='France', _limit=10)
        You can sort the results by single or multiple columns. Append a minus sign
        to the column name for descending order::
            # sort results by a column 'year'
            results = table.find(country='France', order_by='year')
            # return all rows sorted by multiple columns (by year in descending order)
            results = table.find(order_by=['country', '-year'])
        By default :py:meth:`find() <dataset.Table.find>` will break the
        query into chunks of ``_step`` rows to prevent huge tables
        from being loaded into memory at once.
        For more complex queries, please use :py:meth:`db.query()`
        instead."""
        self._check_dropped()
        if not isinstance(order_by, (list, tuple)):
            order_by = [order_by]
        order_by = [
            o for o in order_by
            if (o.startswith("-") and o[1:] or o) in self.table.columns
        ]
        order_by = [self._args_to_order_by(o) for o in order_by]

        args = self._args_to_clause(_filter)

        # query total number of rows first
        count_query = alias(
            self.table.select(whereclause=args, limit=_limit, offset=_offset),
            name="count_query_alias",
        ).count()
        rp = self.engine.execute(count_query)
        total_row_count = rp.fetchone()[0]
        if return_count:
            return total_row_count

        if _limit is None:
            _limit = total_row_count

        if _step is None or _step is False or _step == 0:
            _step = total_row_count

        if total_row_count > _step and not order_by:
            _step = total_row_count
            log.warn(
                "query cannot be broken into smaller sections because it is unordered"
            )

        queries = []

        for i in count():
            qoffset = _offset + (_step * i)
            qlimit = min(_limit - (_step * i), _step)
            if qlimit <= 0:
                break
            queries.append(
                self.table.select(whereclause=args,
                                  limit=qlimit,
                                  offset=qoffset,
                                  order_by=order_by))
        return ResultIter((self.engine.execute(q) for q in queries),
                          row_type=self.db.row_type)

    def count(self, **_filter):
        """
        Return the count of results for the given filter set
        (same filter options as with ``find()``).
        """
        return self.find(return_count=True, **_filter)

    def __getitem__(self, item):
        """
        This is an alias for distinct which allows the table to be queried using
        square bracket syntax.
        ::
            # Same as distinct:
            print list(table['year'])
        """
        if not isinstance(item, tuple):
            item = (item, )
        return self.distinct(*item)

    def all(self):
        """
        Returns all rows of the table as simple dictionaries. This is simply a shortcut
        to *find()* called with no arguments.
        ::
            rows = table.all()"""
        return self.find()

    def __iter__(self):
        """
        Allows for iterating over all rows in the table without explicitly
        calling :py:meth:`all() <dataset.Table.all>`.
        ::
            for row in table:
                print(row)
        """
        return self.all()

    def __repr__(self):
        return "<Table(%s)>" % self.table.name
Example #33
def delete_where_no_source(nlpdef: NlpDefinition,
                           ifconfig: InputFieldConfig,
                           report_every: int = DEFAULT_REPORT_EVERY,
                           chunksize: int = DEFAULT_CHUNKSIZE) -> None:
    """
    Delete destination records where source records no longer exist.

    - Can't do this in a single SQL command, since the engine can't necessarily
      see both databases.
    - Can't use a single temporary table, since the progress database isn't
      necessarily the same as any of the destination database(s).
    - Can't do this in a multiprocess way, because we're trying to do a
      DELETE WHERE NOT IN.
    - So we fetch all source PKs (which, by definition, do exist), stash them
      in memory, and do a DELETE WHERE NOT IN based on those
      specified values (or, if there are no PKs in the source, delete
      everything from the destination).

    Problems:
    - This is IMPERFECT if we have string source PKs and there are hash
      collisions (e.g. PKs for records X and Y both hash to the same thing;
      record X is deleted; then its processed version might not be).
    - With massive tables, we might run out of memory or (much more likely)
      SQL parameter slots. -- This is now happening; error looks like:
      pyodbc.ProgrammingError: ('The SQL contains 30807 parameter markers, but
      2717783 parameters were supplied', 'HY000')

    A better way might be:
    - for each table, make a temporary table in the same database
    - populate that table with (source PK integer/hash, source PK string) pairs
    - delete where pairs don't match -- is that portable SQL?
      http://stackoverflow.com/questions/7356108/sql-query-for-deleting-rows-with-not-in-using-2-columns  # noqa
    - More efficient would be to make one table per destination database.

    On the "delete where multiple fields don't match":
    - Single field syntax is
        DELETE FROM a WHERE a1 NOT IN (SELECT b1 FROM b)
    - Multiple field syntax is
        DELETE FROM a WHERE NOT EXISTS (
            SELECT 1 FROM b
            WHERE a.a1 = b.b1
            AND a.a2 = b.b2
        )
    - In SQLAlchemy, exists():
        http://stackoverflow.com/questions/14600619
        http://docs.sqlalchemy.org/en/latest/core/selectable.html
    - Furthermore, in SQL NULL = NULL is false, and NULL <> NULL is also false,
      so we have to do an explicit null check.
      You do that with "field == None" (disable
      See http://stackoverflow.com/questions/21668606
      We're aiming, therefore, for:
        DELETE FROM a WHERE NOT EXISTS (
            SELECT 1 FROM b
            WHERE a.a1 = b.b1
            AND (
                a.a2 = b.b2
                OR (a.a2 IS NULL AND b.b2 IS NULL)
            )
        )
    """

    # -------------------------------------------------------------------------
    # Sub-functions
    # -------------------------------------------------------------------------

    def insert(records_):
        n_rows = len(records_)
        log.debug("... inserting {} records".format(n_rows))
        for db in databases:
            session_ = db['session']
            temptable_ = db['temptable']  # type: Table
            session_.execute(temptable_.insert(), records_)
            nlpdef.notify_transaction(session_,
                                      n_rows=n_rows,
                                      n_bytes=sys.getsizeof(records_))

    def commit():
        for db in databases:
            nlpdef.commit(db['session'])

    # -------------------------------------------------------------------------
    # Main code
    # -------------------------------------------------------------------------
    # Use info log level, otherwise it looks like our code hangs with very
    # large databases.

    log.info("delete_where_no_source: examining source table {}.{}; "
             "MAY BE SLOW".format(ifconfig.get_srcdb(),
                                  ifconfig.get_srctable()))

    # Start our list with the progress database
    databases = [{
        'session': nlpdef.get_progdb_session(),
        'engine': nlpdef.get_progdb_engine(),
        'metadata': nlpdef.get_progdb_metadata(),
        'temptable': None,  # type: Table
    }]

    # Add the processors' destination databases
    for processor in nlpdef.get_processors():  # of type BaseNlpParser
        session = processor.get_session()
        if any(x['session'] == session for x in databases):
            continue  # already exists
        databases.append({
            'session': session,
            'engine': processor.get_engine(),
            'metadata': processor.get_metadata(),
        })

    # Make a temporary table in each database (note: the Table objects become
    # affiliated to their engine, I think, so make separate ones for each).
    log.info("... using {n} destination database(s)".format(n=len(databases)))
    log.info("... dropping (if exists) and creating temporary table(s)")
    for database in databases:
        engine = database['engine']
        temptable = Table(
            nlpdef.get_temporary_tablename(),
            database['metadata'],
            Column(FN_SRCPKVAL, BigInteger),  # not PK, as may be a hash
            Column(FN_SRCPKSTR, String(MAX_STRING_PK_LENGTH)),
            **TABLE_KWARGS)
        temptable.drop(engine, checkfirst=True)
        temptable.create(engine, checkfirst=True)
        database['temptable'] = temptable

    # Insert PKs into temporary tables

    n = count_star(ifconfig.get_source_session(), ifconfig.get_srctable())
    log.info("... populating temporary table(s): {} records to go; working in "
             "chunks of {}".format(n, chunksize))
    i = 0
    records = []  # type: List[Dict[str, Any]]
    for pkval, pkstr in ifconfig.gen_src_pks():
        i += 1
        if report_every and i % report_every == 0:
            log.info("... src row# {} / {}".format(i, n))
        records.append({FN_SRCPKVAL: pkval, FN_SRCPKSTR: pkstr})
        if i % chunksize == 0:
            insert(records)
            records = []  # type: List[Dict[str, Any]]
    if records:  # remainder
        insert(records)

    # Commit
    commit()

    # Index, for speed
    log.info("... creating index(es) on temporary table(s)")
    for database in databases:
        temptable = database['temptable']  # type: Table
        index = Index('_temptable_idx', temptable.columns[FN_SRCPKVAL])
        index.create(database['engine'])

    # DELETE FROM desttable WHERE destpk NOT IN (SELECT srcpk FROM temptable)
    log.info("... deleting from progress/destination DBs where appropriate")

    # Delete from progress database
    prog_db = databases[0]
    prog_temptable = prog_db['temptable']
    ifconfig.delete_progress_records_where_srcpk_not(prog_temptable)

    # Delete from others
    for processor in nlpdef.get_processors():
        database = [
            x for x in databases if x['session'] == processor.get_session()
        ][0]
        temptable = database['temptable']
        processor.delete_where_srcpk_not(ifconfig, temptable)

    # Drop temporary tables
    log.info("... dropping temporary table(s)")
    for database in databases:
        database['temptable'].drop(database['engine'], checkfirst=True)

    # Commit
    commit()
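
The docstring of delete_where_no_source above spells out the target SQL for the "delete where multiple fields don't match" case, including the NULL-safe comparison. Built in SQLAlchemy Core against the placeholder tables a(a1, a2) and b(b1, b2) from that docstring, the statement might look like the sketch below (SQLAlchemy 1.4+ assumed; the real code above works against per-database temporary tables instead):

from sqlalchemy import (Column, Integer, MetaData, String, Table,
                        and_, literal, or_, select)

metadata = MetaData()
a = Table("a", metadata, Column("a1", Integer), Column("a2", String(64)))
b = Table("b", metadata, Column("b1", Integer), Column("b2", String(64)))

matching_source_row = (
    select(literal(1))
    .select_from(b)
    .where(and_(
        a.c.a1 == b.c.b1,
        or_(a.c.a2 == b.c.b2,
            and_(a.c.a2.is_(None), b.c.b2.is_(None))),
    ))
    .exists()
)
# DELETE FROM a WHERE NOT EXISTS (SELECT 1 FROM b WHERE ...)
delete_stmt = a.delete().where(~matching_source_row)
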