def snapshot_converter(db: BaseDb, snapshot_d: Dict[str, Any]) -> Snapshot:
    """Build a :class:`Snapshot` model object from its flat db representation.

    Branches whose target or target_type is NULL are mapped to ``None``
    (dangling branches).
    """
    columns = ["name", "target", "target_type"]
    query = """
    select %s
    from snapshot_branches sbs
    inner join snapshot_branch sb on sb.object_id=sbs.branch_id
    where sbs.snapshot_id=%%s
    """ % ", ".join(columns)

    branches = {}
    with db.cursor() as cur:
        cur.execute(query, (snapshot_d["object_id"],))
        for name, target, target_type in cur:
            branch: Optional[SnapshotBranch] = None
            if target is not None and target_type is not None:
                branch = SnapshotBranch(
                    target=target,
                    target_type=TargetType(target_type),
                )
            branches[name] = branch

    return Snapshot(
        id=snapshot_d["id"],
        branches=branches,
    )
def test_cli_swh_db_create_and_init_db_new_api(
    cli_runner, postgresql, mock_import_swhmodule, mocker, tmp_path
):
    """Create a db then initializing it should be ok for a "new style" datastore"""
    module_name = "test.cli_new"
    conninfo = craft_conninfo(postgresql)

    # point the module at the test database through a config file
    config = {module_name: {"cls": "postgresql", "db": conninfo}}
    cfgfile = tmp_path / "config.yml"
    cfgfile.write_text(yaml.dump(config))

    # This initializes the schema and data
    result = cli_runner.invoke(swhdb, ["init-admin", module_name, "--dbname", conninfo])
    assert result.exit_code == 0, f"Unexpected output: {result.output}"
    result = cli_runner.invoke(swhdb, ["-C", cfgfile, "init", module_name])
    assert (
        result.exit_code == 0
    ), f"Unexpected output: {traceback.print_tb(result.exc_info[2])}"

    # the origin value in the scripts uses a hash function (which implementation wise
    # uses a function from the pgcrypt extension, installed during db creation step)
    with BaseDb.connect(conninfo).cursor() as cur:
        cur.execute("select * from origin")
        assert len(cur.fetchall()) == 1
def iter_revision_rows(storage_dbconn: str, first_id: Sha1Git):
    """Yield revision rows (dicts keyed by REVISION_COLS) from the storage
    database, in increasing id order, starting at ``first_id`` (inclusive).

    Only revisions with a non-NULL ``metadata`` column and a type other than
    'git' are returned. Transient connection errors are retried up to 60
    times, one minute apart, reconnecting and resuming after the last row
    yielded.
    """
    after_id = first_id
    # The first query must be inclusive so that first_id itself can be
    # returned; afterwards a strict comparison is required, otherwise the row
    # yielded at the previous batch boundary is fetched (and yielded) again —
    # which also made the loop never terminate, since the final batch always
    # returned at least that one row.
    inclusive = True
    failures = 0
    while True:
        try:
            storage_db = BaseDb.connect(storage_dbconn)
            with storage_db.cursor() as cur:
                while True:
                    op = ">=" if inclusive else ">"
                    # NOTE: the original query was missing a space before
                    # ORDER BY ("... != 'git'ORDER BY id"), a SQL syntax
                    # error; fixed here.
                    cur.execute(
                        f"SELECT {', '.join(REVISION_COLS)} FROM revision "
                        f"WHERE id {op} %s AND metadata IS NOT NULL "
                        f"AND type != 'git' "
                        f"ORDER BY id LIMIT 1000",
                        (after_id,),
                    )
                    new_rows = 0
                    for row in cur:
                        new_rows += 1
                        row_d = dict(zip(REVISION_COLS, row))
                        yield row_d
                        after_id = row_d["id"]
                        # once anything was yielded, never re-fetch it
                        inclusive = False
                    if new_rows == 0:
                        return
        except psycopg2.OperationalError as e:
            print(e)
            # most likely a temporary error, try again
            if failures >= 60:
                raise
            else:
                time.sleep(60)
                failures += 1
def test_smoke_test_fun_db_is_still_up_and_got_reset(postgres_fun):
    """This ensures that within another tests, the 'fun' db is still up, created
    (and not configured again). This time, most of the data has been reset:
    - except for tables 'dbversion' and 'people' which were left as is
    - the other tables from the schema (here only "fun") got truncated
    - the sequences got truncated as well
    """
    checks = [
        # db version is excluded from the truncate
        ("select count(*) from dbversion", 5),
        # people is also allowed not to be truncated
        ("select count(*) from people", 2),
        # table got reset
        ("select count(*) from fun", 0),
    ]
    with BaseDb.connect(postgres_fun.dsn).cursor() as cur:
        for query, expected in checks:
            cur.execute(query)
            assert cur.fetchone()[0] == expected
        # the sequence got reset as well
        cur.execute("select nextval('serial')")
        assert cur.fetchone()[0] == 1
def test_cli_swh_db_initialization_idempotent(swh_db_cli, mock_package_sql, test_db):
    """Multiple runs of the init commands are idempotent"""
    module_name = "anything"  # mocked
    cli_runner, db_params = swh_db_cli

    # run the init-admin/init sequence twice: the second pass must be a no-op
    for _ in range(2):
        for subcommand in ("init-admin", "init"):
            result = cli_runner.invoke(
                swhdb, [subcommand, module_name, "--db-name", db_params["dbname"]]
            )
            assert result.exit_code == 0, f"Unexpected output: {result.output}"

    # the origin values in the scripts uses a hash function (which implementation wise
    # uses a function from the pgcrypt extension, init-admin calls installs it)
    with BaseDb.connect(test_db.dsn).cursor() as cur:
        cur.execute("select * from origin")
        assert len(cur.fetchall()) == 1
def run(self, object_type, start_object, end_object, dry_run=False):
    """Read objects of ``object_type`` within [start_object, end_object] from
    the storage database and send them to the journal's topic.

    With ``dry_run``, objects are fetched and iterated (surfacing any
    decoding/encoding errors) but nothing is written to the journal.
    """
    start_object, end_object = self.parse_arguments(
        object_type, start_object, end_object
    )

    db = BaseDb.connect(self.config["storage"]["db"])
    writer = JournalWriter({"cls": "kafka", **self.config["journal_writer"]})
    assert writer.journal is not None

    for range_start, range_end in RANGE_GENERATORS[object_type](
        start_object, end_object
    ):
        logger.info(
            "Processing %s range %s to %s",
            object_type,
            _format_range_bound(range_start),
            _format_range_bound(range_end),
        )
        objects = fetch(db, object_type, start=range_start, end=range_end)
        if dry_run:
            # drain the iterator to check for any potential
            # decoding/encoding errors without writing anything
            for _ in objects:
                pass
        else:
            writer.write_additions(object_type, objects)
def test_smoke_test_db_no_init(postgres_no_init):
    """We can connect to the db nonetheless"""
    with BaseDb.connect(postgres_no_init.dsn).cursor() as cur:
        # any trivial query proves the connection is usable
        cur.execute("select now()")
        row = cur.fetchone()
        assert row[0] is not None
def db_with_data(test_db, request):
    """Fixture handing out a db connection after loading INIT_SQL into it.

    On teardown, pending changes are rolled back and the connection closed.
    """
    database = BaseDb.connect(test_db.dsn)
    with database.cursor() as cur:
        psycopg2.extras.register_default_jsonb(cur)
        cur.execute(INIT_SQL)
    yield database
    # teardown: discard whatever the test did, then release the connection
    database.conn.rollback()
    database.conn.close()
def main(storage_dbconn, storage_url, deposit_dbconn, first_id, limit, dry_run):
    """Iterate over revision rows starting at ``first_id`` and feed each to
    handle_row(), printing progress every 1000 rows.
    """
    storage_db = BaseDb.connect(storage_dbconn)
    deposit_db = BaseDb.connect(deposit_dbconn)

    postgresql_step = {
        "cls": "postgresql",
        "db": storage_dbconn,
        "objstorage": {"cls": "memory", "args": {}},
    }
    storage = get_storage("pipeline", steps=[{"cls": "retry"}, postgresql_step])

    if not dry_run:
        create_fetchers(storage_db)
        # Not creating authorities, as the loaders are presumably already running
        # and created them already.
        # This also helps make sure this script doesn't accidentally create
        # authorities that differ from what the loaders use.

    total_rows = 0
    with deposit_db.cursor() as deposit_cur:
        row_iter = iter_revision_rows(storage_dbconn, first_id)
        if limit is not None:
            row_iter = itertools.islice(row_iter, limit)
        for row in row_iter:
            handle_row(row, storage, deposit_cur, dry_run)
            total_rows += 1
            if total_rows % 1000 == 0:
                # rough progress estimate from the 4 leading bytes of the id
                percents = (
                    int.from_bytes(row["id"][0:4], byteorder="big") * 100 / (1 << 32)
                )
                print(
                    f"Processed {total_rows/1000000.:.2f}M rows "
                    f"(~{percents:.1f}%, last revision: {row['id'].hex()})"
                )
def __init__(self, db, min_pool_conns=1, max_pool_conns=10):
    """
    Args:
        db: either a libpq connection string, or a psycopg2 connection
        min_pool_conns: minimum number of pooled connections (only used when
            ``db`` is a connection string)
        max_pool_conns: maximum number of pooled connections (only used when
            ``db`` is a connection string)
    """
    # Note: the docstring previously documented a nonexistent "db_conn"
    # parameter; the actual parameter name is "db".
    if isinstance(db, psycopg2.extensions.connection):
        # already-established connection: wrap it directly, no pool managed
        self._pool = None
        self._db = BaseDb(db)
    else:
        # libpq conninfo string: connections are handed out by a
        # thread-safe pool
        self._pool = psycopg2.pool.ThreadedConnectionPool(
            min_pool_conns,
            max_pool_conns,
            db,
            cursor_factory=psycopg2.extras.RealDictCursor,
        )
        self._db = None
def test_smoke_test_people_db_up(postgres_people):
    """'people' db is up and configured"""
    row_counts = [
        ("select count(*) from dbversion", 5),
        ("select count(*) from people", 2),
        ("select count(*) from fun", 3),
    ]
    with BaseDb.connect(postgres_people.dsn).cursor() as cur:
        for query, expected in row_counts:
            cur.execute(query)
            assert cur.fetchone()[0] == expected
        # next sequence value is 2
        cur.execute("select nextval('serial')")
        assert cur.fetchone()[0] == 2
def test_smoke_test_fun2_db_is_up(postgres_fun2):
    """This ensures the db is created and configured according to its dumps files."""
    row_counts = [
        ("select count(*) from dbversion", 5),
        ("select count(*) from fun", 3),
        ("select count(*) from people", 2),
    ]
    with BaseDb.connect(postgres_fun2.dsn).cursor() as cur:
        for query, expected in row_counts:
            cur.execute(query)
            assert cur.fetchone()[0] == expected
        # in data, we requested a value already so it starts at 2
        cur.execute("select nextval('serial')")
        assert cur.fetchone()[0] == 2
def test_cli_swh_db_initialization_with_env(
    swh_db_cli, mock_import_swhmodule, postgresql
):
    """Init commands with standard environment variables works"""
    module_name = "test.cli"  # it's mocked here
    cli_runner, db_params = swh_db_cli

    for subcommand in ("init-admin", "init"):
        result = cli_runner.invoke(
            swhdb, [subcommand, module_name, "--dbname", db_params["dbname"]]
        )
        assert result.exit_code == 0, f"Unexpected output: {result.output}"

    # the origin values in the scripts uses a hash function (which implementation wise
    # uses a function from the pgcrypt extension, init-admin calls installs it)
    with BaseDb.connect(postgresql.dsn).cursor() as cur:
        cur.execute("select * from origin")
        assert len(cur.fetchall()) == 1
def test_db_utils_swh_db_upgrade_sanity_checks(
    cli_runner, postgresql, mock_import_swhmodule, module, datadir
):
    """Check swh_db_upgrade error handling when the db module/version state
    is missing or inconsistent (missing dbmodule table, mismatched module,
    missing dbversion table).
    """
    conninfo = craft_conninfo(postgresql)
    result = cli_runner.invoke(swhdb, ["init-admin", module, "--dbname", conninfo])
    assert result.exit_code == 0, f"Unexpected output: {result.output}"
    result = cli_runner.invoke(swhdb, ["init", module, "--dbname", conninfo])
    assert result.exit_code == 0, f"Unexpected output: {result.output}"
    cnx = BaseDb.connect(conninfo)
    # simulate a db without the module registration table
    with cnx.transaction() as cur:
        cur.execute("drop table dbmodule")

    # try to upgrade with a unset module
    with pytest.raises(ValueError):
        swh_db_upgrade(conninfo, module)

    # check the dbmodule is unset
    assert swh_db_module(conninfo) is None

    # set the stored module to something else
    swh_set_db_module(conninfo, f"{module}2")
    assert swh_db_module(conninfo) == f"{module}2"

    # try to upgrade with a different module
    with pytest.raises(ValueError):
        swh_db_upgrade(conninfo, module)

    # revert to the proper module in the db (force is needed to overwrite)
    swh_set_db_module(conninfo, module, force=True)
    assert swh_db_module(conninfo) == module

    # trying again is a noop
    swh_set_db_module(conninfo, module)
    assert swh_db_module(conninfo) == module

    # drop the dbversion table
    with cnx.transaction() as cur:
        cur.execute("drop table dbversion")
    # an upgrade should fail due to missing stored version
    with pytest.raises(ValueError):
        swh_db_upgrade(conninfo, module)
def test_cli_swh_db_initialization_works_with_flags(
    cli_runner, postgresql, mock_import_swhmodule
):
    """Init commands with carefully crafted libpq conninfo works"""
    module_name = "test.cli"  # it's mocked here
    conninfo = craft_conninfo(postgresql)

    for subcommand in ("init-admin", "init"):
        result = cli_runner.invoke(
            swhdb, [subcommand, module_name, "--dbname", conninfo]
        )
        assert result.exit_code == 0, f"Unexpected output: {result.output}"

    # the origin values in the scripts uses a hash function (which implementation wise
    # uses a function from the pgcrypt extension, init-admin calls installs it)
    with BaseDb.connect(postgresql.dsn).cursor() as cur:
        cur.execute("select * from origin")
        assert len(cur.fetchall()) == 1
def test_smoke_test_people_db_up_and_reset(postgres_people):
    """'people' db is up and got reset on every tables and sequences"""
    truncated = [
        "select count(*) from dbversion",
        "select count(*) from people",
        "select count(*) from fun",
    ]
    with BaseDb.connect(postgres_people.dsn).cursor() as cur:
        # every table (dbversion and people included) is truncated after the
        # first round
        for query in truncated:
            cur.execute(query)
            assert cur.fetchone()[0] == 0
        # the sequence is reset as well
        cur.execute("select nextval('serial')")
        assert cur.fetchone()[0] == 1
def directory_converter(db: BaseDb, directory_d: Dict[str, Any]) -> Directory:
    """Build a :class:`Directory` model object from its flat db representation,
    fetching its file/dir/rev entries from the corresponding entry tables.
    """
    columns = ["target", "name", "perms"]
    query_template = """
    select %(columns)s
    from directory_entry_%(type)s
    where id in %%s
    """

    entries = []
    with db.cursor() as cur:
        for entry_type in ("file", "dir", "rev"):
            entry_ids = directory_d.pop("%s_entries" % entry_type)
            if not entry_ids:
                continue
            cur.execute(
                query_template % {"columns": ",".join(columns), "type": entry_type},
                (tuple(entry_ids),),
            )
            # column order is fixed by `columns`: target, name, perms
            for target, name, perms in cur:
                entries.append(
                    DirectoryEntry(
                        name=name,
                        type=entry_type,
                        target=target,
                        perms=perms,
                    )
                )

    return Directory(
        id=directory_d["id"],
        entries=tuple(entries),
        raw_manifest=directory_d["raw_manifest"],
    )
def test_cli_swh_db_create_and_init_db(cli_runner, test_db, mock_package_sql):
    """Create a db then initializing it should be ok"""
    module_name = "something"
    conninfo = craft_conninfo(test_db, "new-db")

    # "create" sets up the db and the necessary admin extensions; "init" then
    # loads the schema and data
    for subcommand in ("create", "init"):
        result = cli_runner.invoke(
            swhdb, [subcommand, module_name, "--db-name", conninfo]
        )
        assert result.exit_code == 0, f"Unexpected output: {result.output}"

    # the origin value in the scripts uses a hash function (which implementation wise
    # uses a function from the pgcrypt extension, installed during db creation step)
    with BaseDb.connect(conninfo).cursor() as cur:
        cur.execute("select * from origin")
        assert len(cur.fetchall()) == 1
def test_db_utils_upgrade(
    cli_runner, postgresql, mock_import_swhmodule, module, datadir
):
    """Check swh_db_upgrade: runs every pending upgrade script and records
    one dbversion row per applied step.
    """
    conninfo = craft_conninfo(postgresql)
    result = cli_runner.invoke(swhdb, ["init-admin", module, "--dbname", conninfo])
    assert result.exit_code == 0, f"Unexpected output: {result.output}"
    result = cli_runner.invoke(swhdb, ["init", module, "--dbname", conninfo])
    assert result.exit_code == 0, f"Unexpected output: {result.output}"

    # freshly initialized db starts at version 1; the upgrade brings it to 6
    assert swh_db_version(conninfo) == 1
    new_version = swh_db_upgrade(conninfo, module)
    assert new_version == 6
    assert swh_db_version(conninfo) == 6

    versions = swh_db_versions(conninfo)
    # get rid of dates to ease checking
    versions = [(v[0], v[2]) for v in versions]
    # versions are returned most-recent first; the oldest entry is the init
    assert versions[-1] == (1, "DB initialization")
    sqlbasedir = path.join(datadir, module.split(".", 1)[1], "sql", "upgrades")
    # intermediate steps come from the per-version upgrade scripts
    assert versions[1:-1] == [
        (i, f"Upgraded to version {i} using {sqlbasedir}/{i:03d}.sql")
        for i in range(5, 1, -1)
    ]
    # the last step's description was set by the upgrade script itself
    assert versions[0] == (6, "Updated version from upgrade script")

    cnx = BaseDb.connect(conninfo)
    with cnx.transaction() as cur:
        # each executed upgrade script inserted a marker origin row
        cur.execute("select url from origin where url like 'version%'")
        result = cur.fetchall()
        assert result == [("version%03d" % i,) for i in range(2, 7)]
        # this marker would only exist if an upgrade script ran twice or
        # out of range — it must not be there
        cur.execute(
            "select url from origin where url = 'this should never be executed'"
        )
        result = cur.fetchall()
        assert not result
def test_db_utils_versions(cli_runner, postgresql, mock_import_swhmodule, module):
    """Check get_database_info, swh_db_versions and swh_db_module work ok

    This test checks db versions for both a db with "new style" set of sql init
    scripts (i.e. the dbversion table is not created in these scripts, but by
    the populate_database_for_package() function directly, via the 'swh db
    init' command) and an "old style" set (dbversion created in the scripts).
    """
    conninfo = craft_conninfo(postgresql)
    result = cli_runner.invoke(swhdb, ["init-admin", module, "--dbname", conninfo])
    assert result.exit_code == 0, f"Unexpected output: {result.output}"
    result = cli_runner.invoke(
        swhdb, ["init", module, "--dbname", conninfo, "--initial-version", 10]
    )
    assert result.exit_code == 0, f"Unexpected output: {result.output}"

    # check the swh_db_module() function
    assert swh_db_module(conninfo) == module

    # the dbversion and dbmodule tables exists and are populated
    dbmodule, dbversion, dbflavor = get_database_info(conninfo)
    # check also the swh_db_versions() function
    # NOTE(review): this call's result is overwritten below before being
    # used — it looks redundant; kept as-is
    versions = swh_db_versions(conninfo)

    assert dbmodule == module
    assert dbversion == 10
    assert dbflavor is None
    # check also the swh_db_versions() function
    versions = swh_db_versions(conninfo)
    assert len(versions) == 1
    assert versions[0][0] == 10
    if module == "test.cli":
        # old-style scripts insert a fixed date and description
        assert versions[0][1] == datetime.fromisoformat(
            "2016-02-22T15:56:28.358587+00:00"
        )
        assert versions[0][2] == "Work In Progress"
    else:
        # new scheme but with no datastore (so no version support from there)
        assert versions[0][2] == "DB initialization"

    # add a few versions in dbversion
    cnx = BaseDb.connect(conninfo)
    with cnx.transaction() as cur:
        cur.executemany(
            "insert into dbversion(version, release, description) values (%s, %s, %s)",
            [(i, now(), f"Upgrade to version {i}") for i in range(11, 15)],
        )

    # the reported current version follows the highest inserted row
    dbmodule, dbversion, dbflavor = get_database_info(conninfo)
    assert dbmodule == module
    assert dbversion == 14
    assert dbflavor is None

    versions = swh_db_versions(conninfo)
    assert len(versions) == 5
    for i, (version, ts, desc) in enumerate(versions):
        assert version == (14 - i)  # these are in reverse order
        if version > 10:
            assert desc == f"Upgrade to version {version}"
            # rows were just inserted with now() timestamps
            assert (now() - ts) < timedelta(seconds=1)
def test_cli_swh_db_upgrade_new_api(cli_runner, postgresql, datadir, mocker, tmp_path):
    """Upgrade scenario for a "new style" datastore"""
    module_name = "test.cli_new"

    # the `current_version` variable is the version that will be returned by
    # any call to `get_current_version()` in this test session, thanks to the
    # local mocked version of import_swhmodule() below (the lambda closes over
    # this variable, so reassigning it below changes the advertised version).
    current_version = 1

    # custom version of the mockup to make it easy to change the
    # current_version returned by get_current_version()
    # TODO: find a better solution for this...
    def import_swhmodule_mock(modname):
        if modname.startswith("test."):
            dirname = modname.split(".", 1)[1]

            def get_datastore(cls, **kw):
                return mocker.MagicMock(get_current_version=lambda: current_version)

            return mocker.MagicMock(
                __name__=modname,
                __file__=os.path.join(datadir, dirname, "__init__.py"),
                name=modname,
                get_datastore=get_datastore,
            )
        return import_swhmodule(modname)

    mocker.patch("swh.core.db.db_utils.import_swhmodule", import_swhmodule_mock)
    conninfo = craft_conninfo(postgresql)

    # This initializes the schema and data
    cfgfile = tmp_path / "config.yml"
    cfgfile.write_text(yaml.dump({module_name: {"cls": "postgresql", "db": conninfo}}))
    result = cli_runner.invoke(swhdb, ["init-admin", module_name, "--dbname", conninfo])
    assert result.exit_code == 0, f"Unexpected output: {result.output}"
    result = cli_runner.invoke(swhdb, ["-C", cfgfile, "init", module_name])
    assert (
        result.exit_code == 0
    ), f"Unexpected output: {traceback.print_tb(result.exc_info[2])}"

    assert swh_db_version(conninfo) == 1

    # the upgrade should not do anything because the datastore does advertise
    # version 1
    result = cli_runner.invoke(swhdb, ["-C", cfgfile, "upgrade", module_name])
    assert swh_db_version(conninfo) == 1

    # advertise current version as 3, a simple upgrade should get us there, but
    # no further
    current_version = 3
    result = cli_runner.invoke(swhdb, ["-C", cfgfile, "upgrade", module_name])
    assert swh_db_version(conninfo) == 3

    # an attempt to go further should not do anything
    result = cli_runner.invoke(
        swhdb, ["-C", cfgfile, "upgrade", module_name, "--to-version", 5]
    )
    assert swh_db_version(conninfo) == 3

    # an attempt to go lower should not do anything
    result = cli_runner.invoke(
        swhdb, ["-C", cfgfile, "upgrade", module_name, "--to-version", 2]
    )
    assert swh_db_version(conninfo) == 3

    # advertise current version as 6, an upgrade with --to-version 4 should
    # stick to the given version 4 and no further
    current_version = 6
    result = cli_runner.invoke(
        swhdb, ["-C", cfgfile, "upgrade", module_name, "--to-version", 4]
    )
    assert swh_db_version(conninfo) == 4
    assert "migration was not complete" in result.output

    # attempt to upgrade to a newer version than current code version fails
    result = cli_runner.invoke(
        swhdb,
        [
            "-C",
            cfgfile,
            "upgrade",
            module_name,
            "--to-version",
            current_version + 1,
        ],
    )
    assert result.exit_code != 0
    assert swh_db_version(conninfo) == 4

    cnx = BaseDb.connect(conninfo)
    with cnx.transaction() as cur:
        cur.execute("drop table dbmodule")
    assert swh_db_module(conninfo) is None

    # db migration should recreate the missing dbmodule table
    result = cli_runner.invoke(swhdb, ["-C", cfgfile, "upgrade", module_name])
    assert result.exit_code == 0
    assert "Warning: the database does not have a dbmodule table." in result.output
    assert (
        "Write the module information (test.cli_new) in the database? [Y/n]"
        in result.output
    )
    assert swh_db_module(conninfo) == module_name
def get_db(self):
    """Return the wrapped db handle, or borrow one from the connection pool."""
    return self._db if self._db else BaseDb.from_pool(self._pool)