def dataset_db_with_swap_table(metadata_db):
    database = factories.DatabaseFactory(memorable_name="my_database")
    with psycopg2.connect(database_dsn(settings.DATABASES_DATA["my_database"])) as conn:
        conn.cursor().execute(
            """
            CREATE TABLE IF NOT EXISTS dataset_test (
                id INT,
                name VARCHAR(255),
                date DATE
            );
            DELETE FROM dataset_test;
            INSERT INTO dataset_test values(1, 'test', '2022-01-01');

            CREATE TABLE IF NOT EXISTS dataset_test_20220101t000000_swap (
                id INT,
                name VARCHAR(255),
                date DATE
            );
            DELETE FROM dataset_test_20220101t000000_swap;
            INSERT INTO dataset_test_20220101t000000_swap values(1, 'test', '2022-01-01');
            INSERT INTO dataset_test_20220101t000000_swap values(2, 'test_2', '2022-01-02');
            """
        )
    return database

def test_dataset(db):
    with psycopg2.connect(
        database_dsn(settings.DATABASES_DATA["my_database"])
    ) as conn, conn.cursor() as cursor:
        cursor.execute(
            "CREATE TABLE IF NOT EXISTS foo AS SELECT a,b FROM (VALUES ('test',30)) AS temp_table(a,b);"
        )
        cursor.execute(
            """
            CREATE SCHEMA IF NOT EXISTS dataflow;
            CREATE TABLE IF NOT EXISTS dataflow.metadata (
                id SERIAL,
                table_schema TEXT,
                table_name TEXT,
                source_data_modified_utc TIMESTAMP WITHOUT TIME ZONE,
                dataflow_swapped_tables_utc TIMESTAMP WITHOUT TIME ZONE,
                table_structure JSONB,
                data_ids TEXT[],
                data_type INTEGER NOT NULL,
                data_hash_v1 TEXT
            );
            TRUNCATE TABLE dataflow.metadata;
            INSERT INTO dataflow.metadata
                (table_schema, table_name, source_data_modified_utc, table_structure, data_type)
            VALUES
                ('public', 'foo', '2021-01-01 00:00:00.0', '{"a":"text","b":"int"}', 1);
            """
        )
        conn.commit()
    return ("public", "foo")

def dataset_api_view_GET(request, dataset_id, source_table_id):
    source_table = get_object_or_404(
        SourceTable, id=source_table_id, dataset__id=dataset_id, dataset__deleted=False
    )
    search_after = request.GET.getlist("$searchAfter")

    with psycopg2.connect(
        database_dsn(settings.DATABASES_DATA[source_table.database.memorable_name])
    ) as connection:
        primary_key = _get_dataset_primary_key(
            connection, source_table.schema, source_table.table
        )
        if not primary_key:
            raise ValueError(
                f"Cannot order response without a primary key on the table: "
                f"`{source_table.schema}`.`{source_table.table}`"
            )

        if search_after == []:
            sql = psycopg2.sql.SQL(
                """
                select * from {}.{} order by {}
                """
            ).format(
                psycopg2.sql.Identifier(source_table.schema),
                psycopg2.sql.Identifier(source_table.table),
                psycopg2.sql.SQL(",").join(map(psycopg2.sql.Identifier, primary_key)),
            )
        else:
            sql = psycopg2.sql.SQL(
                """
                select * from {}.{} where ({}) > ({}) order by {}
                """
            ).format(
                psycopg2.sql.Identifier(source_table.schema),
                psycopg2.sql.Identifier(source_table.table),
                psycopg2.sql.SQL(",").join(map(psycopg2.sql.Identifier, primary_key)),
                psycopg2.sql.SQL(",").join(psycopg2.sql.Placeholder() * len(search_after)),
                psycopg2.sql.SQL(",").join(map(psycopg2.sql.Identifier, primary_key)),
            )

        columns = _get_dataset_columns(connection, source_table)
        rows = _get_dataset_rows(connection, sql, query_args=search_after)

    return _get_streaming_http_response(
        StreamingHttpResponseWithoutDjangoDbConnection,
        request,
        primary_key,
        columns,
        rows,
    )

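# `_get_dataset_primary_key` is not shown in this section. Below is a minimal sketch of what
# it might look like, assuming it mirrors the pg_index lookup used in `get_rows` later in
# this section and returns the primary key column names in index order. The real helper also
# appears to reject relations that are not ordinary tables (see
# test_friendly_exception_from_non_standard_table below); that check is omitted here.
def _get_dataset_primary_key(connection, schema, table):
    with connection.cursor() as cursor:
        cursor.execute(
            """
            SELECT pg_attribute.attname AS column_name
            FROM pg_catalog.pg_class pg_class_table
            INNER JOIN pg_catalog.pg_index ON pg_index.indrelid = pg_class_table.oid
            INNER JOIN pg_catalog.pg_class pg_class_index ON pg_class_index.oid = pg_index.indexrelid
            INNER JOIN pg_catalog.pg_namespace ON pg_namespace.oid = pg_class_table.relnamespace
            INNER JOIN pg_catalog.pg_attribute ON pg_attribute.attrelid = pg_class_index.oid
            WHERE pg_namespace.nspname = %s
              AND pg_class_table.relname = %s
              AND pg_index.indisprimary
            ORDER BY pg_attribute.attnum
            """,
            (schema, table),
        )
        return [row[0] for row in cursor.fetchall()]
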
def test_search_after(self):
    # create django objects
    memorable_name = self.memorable_name
    table = self.table
    database = Database.objects.get_or_create(memorable_name=memorable_name)[0]
    data_grouping = DataGrouping.objects.get_or_create()[0]
    dataset = DataSet.objects.get_or_create(grouping=data_grouping)[0]
    source_table = SourceTable.objects.get_or_create(
        dataset=dataset, database=database, table=table
    )[0]

    # create external source table
    with psycopg2.connect(
        database_dsn(settings.DATABASES_DATA[memorable_name])
    ) as conn, conn.cursor() as cur:
        sql = '''
            create table {table} (id int primary key, name varchar(100))
        '''.format(table=table)
        cur.execute(sql)

        sql = '''insert into {table} values (%s, %s)'''.format(table=self.table)
        values = [(0, 'abigail'), (1, 'romeo')]
        cur.executemany(sql, values)

    url = '/api/v1/dataset/{}/{}?$searchAfter=0'.format(dataset.id, source_table.id)
    response = self.client.get(url)
    expected = {'headers': ['id', 'name'], 'values': [[1, 'romeo']], 'next': None}

    output = b''
    for streaming_output in response.streaming_content:
        output = output + streaming_output
    output_dict = json.loads(output.decode('utf-8'))
    self.assertEqual(output_dict, expected)

def refresh_chart_data(chart_id):
    chart = models.ChartBuilderChart.objects.get(id=chart_id)
    original_table_name = chart.get_temp_table_name()
    query_log_params = {
        "run_by_user": chart.created_by,
        "page": 1,
        "page_size": None,
    }
    if chart.related_source is not None:
        # If the chart is created from a dataset source, get the latest copy
        # of the query from the source before rerunning
        query_log_params.update(
            {
                "sql": chart.related_source.get_chart_builder_query(),
                "connection": chart.related_source.database.memorable_name,
            }
        )
    else:
        # If there is no related source, rerun the original query
        query_log_params.update(
            {
                "sql": chart.query_log.sql,
                "query_id": chart.query_log.query_id,
                "connection": chart.query_log.connection,
            }
        )
    chart.query_log = QueryLog.objects.create(**query_log_params)
    chart.save()
    run_chart_builder_query(chart.id)
    chart.refresh_thumbnail()
    with psycopg2.connect(database_dsn(DATABASES_DATA[query_log_params["connection"]])) as conn:
        cursor = conn.cursor()
        cursor.execute(f"DROP TABLE IF EXISTS {original_table_name}")

def test_materialized_view_download(self, request_client, published):
    dsn = database_dsn(settings.DATABASES_DATA['my_database'])
    with psycopg2.connect(dsn) as conn, conn.cursor() as cursor:
        cursor.execute(
            '''
            CREATE TABLE IF NOT EXISTS materialized_test_table (field2 int, field1 varchar(255));
            TRUNCATE TABLE materialized_test_table;
            INSERT INTO materialized_test_table VALUES(1, 'record1');
            INSERT INTO materialized_test_table VALUES(2, 'record2');
            DROP MATERIALIZED VIEW IF EXISTS materialized_test_view;
            CREATE MATERIALIZED VIEW materialized_test_view AS
                SELECT * FROM materialized_test_table;
            '''
        )
    dataset = factories.DataSetFactory(
        user_access_type='REQUIRES_AUTHENTICATION', published=published
    )
    source_view = factories.SourceViewFactory(
        dataset=dataset,
        database=factories.DatabaseFactory(memorable_name='my_database'),
        schema='public',
        view='materialized_test_view',
    )
    log_count = EventLog.objects.count()
    download_count = dataset.number_of_downloads

    response = request_client.get(source_view.get_absolute_url())
    assert response.status_code == 200
    assert (
        b''.join(response.streaming_content)
        == b'"field2","field1"\r\n1,"record1"\r\n2,"record2"\r\n"Number of rows: 2"\r\n'
    )
    assert EventLog.objects.count() == log_count + 1
    assert EventLog.objects.latest().event_type == EventLog.TYPE_DATASET_SOURCE_VIEW_DOWNLOAD
    assert DataSet.objects.get(pk=dataset.id).number_of_downloads == download_count + 1

def test_view_download(self, access_type, request_client, published):
    dsn = database_dsn(settings.DATABASES_DATA["my_database"])
    with psycopg2.connect(dsn) as conn, conn.cursor() as cursor:
        cursor.execute(
            """
            CREATE TABLE IF NOT EXISTS download_test_table (field2 int, field1 varchar(255));
            TRUNCATE TABLE download_test_table;
            INSERT INTO download_test_table VALUES(1, 'record1');
            INSERT INTO download_test_table VALUES(2, 'record2');
            CREATE OR REPLACE VIEW download_test_view AS
                SELECT * FROM download_test_table;
            """
        )
    dataset = factories.DataSetFactory(user_access_type=access_type, published=published)
    source_view = factories.SourceViewFactory(
        dataset=dataset,
        database=factories.DatabaseFactory(memorable_name="my_database"),
        schema="public",
        view="download_test_view",
    )
    log_count = EventLog.objects.count()
    download_count = dataset.number_of_downloads

    response = request_client.get(source_view.get_absolute_url())
    assert response.status_code == 200
    assert (
        b"".join(response.streaming_content)
        == b'"field2","field1"\r\n1,"record1"\r\n2,"record2"\r\n"Number of rows: 2"\r\n'
    )
    assert EventLog.objects.count() == log_count + 1
    assert EventLog.objects.latest().event_type == EventLog.TYPE_DATASET_SOURCE_VIEW_DOWNLOAD
    assert DataSet.objects.get(pk=dataset.id).number_of_downloads == download_count + 1

def metadata_db(db):
    database = factories.DatabaseFactory(memorable_name="my_database")
    with psycopg2.connect(
        database_dsn(settings.DATABASES_DATA["my_database"])
    ) as conn, conn.cursor() as cursor:
        cursor.execute(
            """
            CREATE SCHEMA IF NOT EXISTS dataflow;
            CREATE TABLE IF NOT EXISTS dataflow.metadata (
                id SERIAL,
                table_schema TEXT,
                table_name TEXT,
                source_data_modified_utc TIMESTAMP WITHOUT TIME ZONE,
                dataflow_swapped_tables_utc TIMESTAMP WITHOUT TIME ZONE,
                table_structure JSONB,
                data_ids TEXT[],
                data_type INTEGER NOT NULL,
                data_hash_v1 TEXT,
                primary_keys TEXT[]
            );
            TRUNCATE TABLE dataflow.metadata;
            INSERT INTO dataflow.metadata (
                table_schema, table_name, source_data_modified_utc,
                dataflow_swapped_tables_utc, table_structure, data_type
            )
            VALUES
                ('public', 'table1', '2020-09-02 00:01:00.0', '2020-09-02 00:01:00.0', '{"field1":"int","field2":"varchar"}', 1),
                ('public', 'table2', '2020-09-01 00:01:00.0', '2020-09-02 00:01:00.0', NULL, 1),
                ('public', 'table1', '2020-01-01 00:01:00.0', '2020-09-02 00:01:00.0', NULL, 1),
                ('public', 'table4', NULL, '2021-12-01 00:00:00.0', NULL, 1);
            """
        )
        conn.commit()
    return database

def test_friendly_exception_from_table_without_primary_key(self):
    """This test is in place to assert existing behaviour. It may well be
    reasonable to make this view work without primary keys on the table."""
    # create django objects
    memorable_name = self.memorable_name
    table = self.table
    database = Database.objects.get_or_create(memorable_name=memorable_name)[0]
    data_grouping = DataGrouping.objects.get_or_create()[0]
    dataset = DataSet.objects.get_or_create(grouping=data_grouping)[0]
    source_table = SourceTable.objects.get_or_create(
        dataset=dataset, database=database, table=table
    )[0]

    # create external source table
    with psycopg2.connect(
        database_dsn(settings.DATABASES_DATA[memorable_name])
    ) as conn, conn.cursor() as cur:
        sql = '''
            create table {table} (id int, name varchar(100), timestamp timestamp)
        '''.format(table=table)
        cur.execute(sql)

    url = '/api/v1/dataset/{}/{}'.format(dataset.id, source_table.id)
    with pytest.raises(ValueError) as e:
        self.client.get(url)
    assert str(e.value) == (
        f"Cannot order response without a primary key on the table: "
        f"`{source_table.schema}`.`{source_table.table}`"
    )

def test_view_download(self):
    dsn = database_dsn(settings.DATABASES_DATA['my_database'])
    with connect(dsn) as conn, conn.cursor() as cursor:
        cursor.execute(
            '''
            CREATE TABLE IF NOT EXISTS download_test_table (field2 int, field1 varchar(255));
            TRUNCATE TABLE download_test_table;
            INSERT INTO download_test_table VALUES(1, 'record1');
            INSERT INTO download_test_table VALUES(2, 'record2');
            CREATE OR REPLACE VIEW download_test_view AS
                SELECT * FROM download_test_table;
            '''
        )
    dataset = factories.DataSetFactory(user_access_type='REQUIRES_AUTHENTICATION')
    source_view = factories.SourceViewFactory(
        dataset=dataset,
        database=factories.DatabaseFactory(memorable_name='my_database'),
        schema='public',
        view='download_test_view',
    )
    log_count = EventLog.objects.count()
    download_count = dataset.number_of_downloads

    response = self._authenticated_get(source_view.get_absolute_url())
    self.assertEqual(response.status_code, 200)
    self.assertEqual(
        b''.join(response.streaming_content),
        b'"field2","field1"\r\n1,"record1"\r\n2,"record2"\r\n"Number of rows: 2"\r\n',
    )
    self.assertEqual(EventLog.objects.count(), log_count + 1)
    self.assertEqual(
        EventLog.objects.latest().event_type,
        EventLog.TYPE_DATASET_SOURCE_VIEW_DOWNLOAD,
    )
    self.assertEqual(
        DataSet.objects.get(pk=dataset.id).number_of_downloads, download_count + 1
    )

def test_friendly_exception_from_non_standard_table(self):
    # create django objects
    memorable_name = self.memorable_name
    table = self.table
    database = Database.objects.get_or_create(memorable_name=memorable_name)[0]
    data_grouping = DataGrouping.objects.get_or_create()[0]
    dataset = DataSet.objects.get_or_create(grouping=data_grouping)[0]
    _ = SourceTable.objects.get_or_create(
        dataset=dataset, database=database, table=table
    )[0]
    view_source_table = SourceTable.objects.get_or_create(
        dataset=dataset, database=database, table=f"view_{table}"
    )[0]

    # create external source table
    with psycopg2.connect(
        database_dsn(settings.DATABASES_DATA[memorable_name])
    ) as conn, conn.cursor() as cur:
        sql = '''
            create table {table} (id int primary key, name varchar(100), timestamp timestamp)
        '''.format(table=table)
        cur.execute(sql)

        sql = '''
            create view view_{table} as (select * from {table})
        '''.format(table=table)
        cur.execute(sql)

    url = '/api/v1/dataset/{}/{}'.format(dataset.id, view_source_table.id)
    with pytest.raises(ValueError) as e:
        self.client.get(url)
    assert str(e.value) == (
        "Cannot get primary keys from something other than an ordinary table. "
        "`public`.`view_test_source_table` is a: view"
    )

def get_postgres_column_names_data_types(sourcetable):
    with connect(
        database_dsn(settings.DATABASES_DATA[sourcetable.database.memorable_name])
    ) as conn, conn.cursor() as cur:
        cur.execute(
            """
            SELECT
                pg_attribute.attname AS column_name,
                pg_catalog.format_type(pg_attribute.atttypid, pg_attribute.atttypmod) AS data_type
            FROM
                pg_catalog.pg_attribute
            INNER JOIN
                pg_catalog.pg_class ON pg_class.oid = pg_attribute.attrelid
            INNER JOIN
                pg_catalog.pg_namespace ON pg_namespace.oid = pg_class.relnamespace
            WHERE
                pg_attribute.attnum > 0
                AND NOT pg_attribute.attisdropped
                AND pg_namespace.nspname = %s
                AND pg_class.relname = %s
            ORDER BY
                attnum ASC;
            """,
            (sourcetable.schema, sourcetable.table),
        )
        return cur.fetchall()

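# Illustrative only: for a table like the `test_source_table` created in the tests in this
# section (id int primary key, name varchar(100), timestamp timestamp), the helper above
# would return something along the lines of the following, with the exact type spellings
# coming from pg_catalog.format_type:
#
#   [
#       ('id', 'integer'),
#       ('name', 'character varying(100)'),
#       ('timestamp', 'timestamp without time zone'),
#   ]
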
def test_empty_data(self):
    # create django objects
    memorable_name = self.memorable_name
    table = self.table
    database = Database.objects.get_or_create(memorable_name=memorable_name)[0]
    data_grouping = DataGrouping.objects.get_or_create()[0]
    dataset = DataSet.objects.get_or_create(grouping=data_grouping)[0]
    source_table = SourceTable.objects.get_or_create(
        dataset=dataset, database=database, table=table
    )[0]

    # create external source table
    with psycopg2.connect(
        database_dsn(settings.DATABASES_DATA[memorable_name])
    ) as conn, conn.cursor() as cur:
        sql = '''
            create table {table} (id int primary key, name varchar(100), timestamp timestamp)
        '''.format(table=table)
        cur.execute(sql)

    url = '/api/v1/dataset/{}/{}'.format(dataset.id, source_table.id)
    response = self.client.get(url)
    expected = {
        'headers': ['id', 'name', 'timestamp'],
        'next': None,
        'values': [],
    }
    output = b''
    for streaming_output in response.streaming_content:
        output = output + streaming_output
    output_dict = json.loads(output.decode('utf-8'))
    self.assertEqual(output_dict, expected)

def test_db(self, db):
    database = factories.DatabaseFactory(memorable_name='my_database')
    with psycopg2.connect(
        database_dsn(settings.DATABASES_DATA['my_database'])
    ) as conn:
        conn.cursor().execute(
            '''
            CREATE TABLE IF NOT EXISTS test_table AS (
                SELECT 1 as a, 2 as b
            );
            '''
        )
        conn.commit()
        yield database
        conn.cursor().execute('DROP TABLE test_table')

def run_chart_builder_query(chart_id):
    chart = models.ChartBuilderChart.objects.get(id=chart_id)
    query_log = chart.query_log
    with psycopg2.connect(database_dsn(DATABASES_DATA[query_log.connection])) as conn:
        _run_query(
            conn,
            query_log,
            query_log.page,
            query_log.page_size,
            settings.EXPLORER_QUERY_TIMEOUT_MS,
            chart.get_temp_table_name(),
        )

def dataset_api_view_GET(request, dataset_id, source_table_id):
    source_table = get_object_or_404(
        SourceTable, id=source_table_id, dataset__id=dataset_id, dataset__deleted=False
    )
    search_after = request.GET.getlist('$searchAfter')

    with psycopg2.connect(
        database_dsn(settings.DATABASES_DATA[source_table.database.memorable_name])
    ) as connection:
        primary_key = _get_dataset_primary_key(
            connection, source_table.schema, source_table.table
        )

        if search_after == []:
            sql = psycopg2.sql.SQL(
                '''
                select * from {}.{} order by {}
                '''
            ).format(
                psycopg2.sql.Identifier(source_table.schema),
                psycopg2.sql.Identifier(source_table.table),
                psycopg2.sql.SQL(',').join(map(psycopg2.sql.Identifier, primary_key)),
            )
        else:
            sql = psycopg2.sql.SQL(
                '''
                select * from {}.{} where ({}) > ({}) order by {}
                '''
            ).format(
                psycopg2.sql.Identifier(source_table.schema),
                psycopg2.sql.Identifier(source_table.table),
                psycopg2.sql.SQL(',').join(map(psycopg2.sql.Identifier, primary_key)),
                psycopg2.sql.SQL(',').join(
                    psycopg2.sql.Placeholder() * len(search_after)
                ),
                psycopg2.sql.SQL(',').join(map(psycopg2.sql.Identifier, primary_key)),
            )

        columns = _get_dataset_columns(connection, source_table)
        rows = _get_dataset_rows(connection, sql, query_args=search_after)

    return _get_streaming_http_response(request, primary_key, columns, rows)

def get_columns(database_name, schema=None, table=None, query=None):
    if table is not None and schema is not None:
        source = sql.SQL("{}.{}").format(sql.Identifier(schema), sql.Identifier(table))
    elif query is not None:
        source = sql.SQL("({}) AS custom_query".format(query.rstrip(";")))
    else:
        raise ValueError("Either table or query are required")

    with psycopg2.connect(
        database_dsn(settings.DATABASES_DATA[database_name])
    ) as connection:
        try:
            return query_columns(connection, source)
        except Exception:
            logger.error("Failed to get dataset fields", exc_info=True)
            return []

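# `query_columns` is not defined in this section. A hedged sketch of one way it could work,
# assuming it only needs the column names of the composed `source`: run the source with a
# false predicate so no rows are fetched, then read names from cursor.description. The real
# implementation may differ.
def query_columns(connection, source):
    query = sql.SQL("SELECT * FROM {} WHERE false").format(source)
    with connection.cursor() as cursor:
        cursor.execute(query)
        # cursor.description holds one entry per output column; the first element is the name
        return [column_desc[0] for column_desc in cursor.description]
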
def setUp(self):
    super().setUp()
    self.database = factories.DatabaseFactory(memorable_name='my_database')
    self.dsn = database_dsn(settings.DATABASES_DATA['my_database'])
    with connect(self.dsn) as conn, conn.cursor() as cursor:
        cursor.execute(
            '''
            CREATE TABLE IF NOT EXISTS custom_query_test (
                id INT,
                name VARCHAR(255),
                date DATE
            );
            TRUNCATE TABLE custom_query_test;
            INSERT INTO custom_query_test VALUES(1, 'the first record', NULL);
            INSERT INTO custom_query_test VALUES(2, 'the second record', '2019-01-01');
            INSERT INTO custom_query_test VALUES(3, 'the last record', NULL);
            '''
        )

def metadata_db(db):
    database = factories.DatabaseFactory(memorable_name='my_database')
    with psycopg2.connect(
        database_dsn(settings.DATABASES_DATA['my_database'])
    ) as conn, conn.cursor() as cursor:
        cursor.execute(
            '''
            CREATE SCHEMA IF NOT EXISTS dataflow;
            CREATE TABLE IF NOT EXISTS dataflow.metadata (
                id int,
                table_schema text,
                table_name text,
                source_data_modified_utc timestamp
            );
            TRUNCATE TABLE dataflow.metadata;
            INSERT INTO dataflow.metadata VALUES(1, 'public', 'table1', '2020-09-02 00:01:00.0');
            INSERT INTO dataflow.metadata VALUES(1, 'public', 'table2', '2020-09-01 00:01:00.0');
            INSERT INTO dataflow.metadata VALUES(1, 'public', 'table1', '2020-01-01 00:01:00.0');
            INSERT INTO dataflow.metadata VALUES(1, 'public', 'table4', NULL);
            '''
        )
        conn.commit()
    return database

def dataset_finder_db(metadata_db):
    database = factories.DatabaseFactory(memorable_name="my_database")
    with psycopg2.connect(database_dsn(settings.DATABASES_DATA["my_database"])) as conn:
        conn.cursor().execute(
            """
            CREATE TABLE IF NOT EXISTS dataworkspace__source_tables (
                id INT,
                name VARCHAR(255),
                dataset_id UUID,
                schema VARCHAR(255),
                "table" VARCHAR(255)
            );
            CREATE TABLE IF NOT EXISTS dataworkspace__catalogue_items (
                id UUID,
                name VARCHAR(255),
                slug VARCHAR(255)
            );
            INSERT INTO dataworkspace__source_tables VALUES(
                1, 'public.data', '0dea6147-d355-4b6d-a140-0304ef9cfeca', 'public', 'data'
            );
            INSERT INTO dataworkspace__catalogue_items VALUES(
                '0dea6147-d355-4b6d-a140-0304ef9cfeca', 'public.data', '1'
            );
            CREATE SCHEMA IF NOT EXISTS public;
            CREATE TABLE IF NOT EXISTS data (
                id int,
                name VARCHAR(255),
                database VARCHAR(255),
                schema VARCHAR(255),
                frequency VARCHAR(255),
                "table" VARCHAR(255)
            );
            CREATE TABLE IF NOT EXISTS country_stats (
                date DATE,
                driving NUMERIC,
                country VARCHAR(255)
            );
            """
        )
    return database

def test_data(self):
    # create django objects
    memorable_name = self.memorable_name
    table = self.table
    database = Database.objects.get_or_create(memorable_name=memorable_name)[0]
    data_grouping = DataGrouping.objects.get_or_create()[0]
    dataset = DataSet.objects.get_or_create(grouping=data_grouping)[0]
    source_table = SourceTable.objects.get_or_create(
        dataset=dataset, database=database, table=table
    )[0]

    # create external source table
    with psycopg2.connect(
        database_dsn(settings.DATABASES_DATA[memorable_name])
    ) as conn, conn.cursor() as cur:
        sql = """
            create table {table} (id int primary key, name varchar(100), timestamp timestamp)
        """.format(table=table)
        cur.execute(sql)

        sql = """insert into {table} values (%s, %s, %s)""".format(table=self.table)
        values = [
            (0, "abigail", "2019-01-01 01:00"),
            (1, "romeo", "2019-01-01 02:00"),
        ]
        cur.executemany(sql, values)

    url = "/api/v1/dataset/{}/{}".format(dataset.id, source_table.id)
    response = self.client.get(url)
    expected = {
        "headers": ["id", "name", "timestamp"],
        "next": None,
        "values": [
            [0, "abigail", "2019-01-01 01:00:00"],
            [1, "romeo", "2019-01-01 02:00:00"],
        ],
    }
    output = b""
    for streaming_output in response.streaming_content:
        output = output + streaming_output
    output_dict = json.loads(output.decode("utf-8"))
    self.assertEqual(output_dict, expected)

def test_db(self, db):
    database = factories.DatabaseFactory(memorable_name="my_database")
    with psycopg2.connect(
        database_dsn(settings.DATABASES_DATA["my_database"])
    ) as conn:
        conn.cursor().execute(
            """
            CREATE TABLE IF NOT EXISTS test_sample AS (
                with data (x, y, z) as (values
                    (NULL, NULL, NULL),
                    ('a', 'b', 'c'),
                    ('a', 'b', NULL),
                    ('a', NULL, NULL)
                )
                SELECT * from data
            );
            """
        )
        conn.commit()
        yield database.memorable_name
        conn.cursor().execute("DROP TABLE test_sample")

def dataset_db(metadata_db):
    database = factories.DatabaseFactory(memorable_name='my_database')
    with psycopg2.connect(database_dsn(settings.DATABASES_DATA['my_database'])) as conn:
        conn.cursor().execute(
            '''
            CREATE TABLE IF NOT EXISTS dataset_test (
                id INT,
                name VARCHAR(255),
                date DATE
            );
            CREATE TABLE IF NOT EXISTS dataset_test2 (
                id INT,
                name VARCHAR(255)
            );
            CREATE OR REPLACE VIEW dataset_view AS (SELECT * FROM dataset_test);
            '''
        )
    return database

def handle(self, *args, **options):
    self.stdout.write("ensure_database_configured...")

    for database_name, database_data in settings.DATABASES_DATA.items():
        self.stdout.write(f"ensure_database_configured ensuring {database_name} is in db")
        Database.objects.get_or_create(memorable_name=database_name)

        self.stdout.write(
            f"ensure_database_configured {database_name} revoking public access..."
        )
        with connect(database_dsn(database_data)) as conn, conn.cursor() as cur:
            cur.execute("REVOKE ALL ON schema public FROM public;")
        self.stdout.write(
            f"ensure_database_configured {database_name} revoking public access... (done)"
        )

    self.stdout.write(self.style.SUCCESS("ensure_database_configured... (done)"))

def handle(self, *args, **options):
    self.stdout.write("deleting temporary Data Explorer result tables...")

    for _database_name, database_data in settings.DATABASES_DATA.items():
        with connect(database_dsn(database_data)) as conn, conn.cursor() as cur:
            cur.execute(
                """
                SELECT table_schema, table_name
                FROM information_schema.tables
                WHERE table_name LIKE '%_tmp_query_%'
                """
            )
            for table_schema, table_name in cur.fetchall():
                cur.execute(
                    psycopg2.sql.SQL("DROP TABLE {}.{}").format(
                        psycopg2.sql.Identifier(table_schema),
                        psycopg2.sql.Identifier(table_name),
                    )
                )

    self.stdout.write(
        self.style.SUCCESS("deleting temporary Data Explorer result tables (done)")
    )

def get_rows(sourcetable, schema_value_funcs, query_var):
    cursor_itersize = 1000

    # Order the rows by primary key so
    # - multiple requests are consistent with each other;
    # - and specifically and more importantly, so paginated results are
    #   consistent with each other
    #
    # We make no assumption on the name or number of columns in the primary
    # key, other than it exists
    #
    # We _could_ use `oid` to order rows if there is no primary key, but we
    # would like all tables to have a primary key, so we deliberately don't
    # implement this
    with connect(
        database_dsn(settings.DATABASES_DATA[sourcetable.database.memorable_name])
    ) as conn, conn.cursor() as cur:
        cur.execute(
            """
            SELECT
                pg_attribute.attname AS column_name
            FROM
                pg_catalog.pg_class pg_class_table
            INNER JOIN
                pg_catalog.pg_index ON pg_index.indrelid = pg_class_table.oid
            INNER JOIN
                pg_catalog.pg_class pg_class_index ON pg_class_index.oid = pg_index.indexrelid
            INNER JOIN
                pg_catalog.pg_namespace ON pg_namespace.oid = pg_class_table.relnamespace
            INNER JOIN
                pg_catalog.pg_attribute ON pg_attribute.attrelid = pg_class_index.oid
            WHERE
                pg_namespace.nspname = %s
                AND pg_class_table.relname = %s
                AND pg_index.indisprimary
            ORDER BY
                pg_attribute.attnum
            """,
            (sourcetable.schema, sourcetable.table),
        )
        primary_key_column_names = [row[0] for row in cur.fetchall()]

    with connect(
        database_dsn(settings.DATABASES_DATA[sourcetable.database.memorable_name])
    ) as conn, conn.cursor(
        name="google_data_studio_all_table_data"
    ) as cur:  # Named cursor => server-side cursor
        cur.itersize = cursor_itersize
        cur.arraysize = cursor_itersize

        fields_sql = sql.SQL(",").join(
            [sql.Identifier(schema["name"]) for schema, _ in schema_value_funcs]
        )
        primary_key_sql = sql.SQL(",").join(
            [sql.Identifier(column_name) for column_name in primary_key_column_names]
        )
        schema_sql = sql.Identifier(sourcetable.schema)
        table_sql = sql.Identifier(sourcetable.table)
        query_sql, vars_sql = query_var(fields_sql, schema_sql, table_sql, primary_key_sql)

        cur.execute(query_sql, vars_sql)

        while True:
            rows = cur.fetchmany(cursor_itersize)
            for row in rows:
                primary_key_values = row[:len(primary_key_column_names)]
                requested_field_values = row[len(primary_key_column_names):]
                values = [
                    schema_value_funcs[i][1](value)
                    for i, value in enumerate(requested_field_values)
                ]
                yield {"values": values}, primary_key_values
            if not rows:
                break

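# A minimal sketch of the kind of callable `query_var` above is expected to be, inferred
# from how it is used: it receives already-composed SQL fragments and must return a
# (query, params) pair whose result set has the primary key columns first, then the
# requested fields, ordered by the primary key. The name `all_rows_query` is illustrative
# only and not taken from the source.
def all_rows_query(fields_sql, schema_sql, table_sql, primary_key_sql):
    query_sql = sql.SQL("SELECT {},{} FROM {}.{} ORDER BY {}").format(
        primary_key_sql, fields_sql, schema_sql, table_sql, primary_key_sql
    )
    # No query parameters are needed for the "all rows" case
    return query_sql, ()
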
def flush_database(self):
    with psycopg2.connect(
        database_dsn(settings.DATABASES_DATA[self.memorable_name])
    ) as conn:
        flush_database(conn)

def delete_unused_datasets_users():
    logger.info('delete_unused_datasets_users: Start')

    for memorable_name, database_data in settings.DATABASES_DATA.items():
        database_obj = Database.objects.get(memorable_name=memorable_name)
        database_name = database_data['NAME']

        with connect(database_dsn(database_data)) as conn, conn.cursor() as cur:
            logger.info('delete_unused_datasets_users: finding database users')
            cur.execute(
                """
                SELECT usename FROM pg_catalog.pg_user
                WHERE valuntil != 'infinity' AND usename LIKE 'user_%'
                ORDER BY usename;
                """
            )
            usenames = [result[0] for result in cur.fetchall()]

            logger.info('delete_unused_datasets_users: finding schemas')
            cur.execute(
                """
                SELECT nspname FROM pg_catalog.pg_namespace
                WHERE nspname != 'pg_catalog' AND nspname != 'information_schema'
                ORDER BY nspname
                """
            )
            schemas = [result[0] for result in cur.fetchall()]

        logger.info('delete_unused_datasets_users: waiting in case they were just created')
        gevent.sleep(15)

        # We want to be able to delete db users that were created but are _not_ associated
        # with a running application, such as those from a STOPPED application, but also
        # those that were created but then the server went down before the application was
        # created.
        in_use_usenames = set(
            ApplicationInstanceDbUsers.objects.filter(
                db=database_obj,
                db_username__in=usenames,
                application_instance__state__in=['RUNNING', 'SPAWNING'],
            ).values_list('db_username', flat=True)
        )
        not_in_use_usernames = [
            usename for usename in usenames if usename not in in_use_usenames
        ]

        schema_revokes = [
            'REVOKE USAGE ON SCHEMA {} FROM {};',
            'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA {} FROM {};',
            'REVOKE ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA {} FROM {};',
            'REVOKE ALL PRIVILEGES ON ALL FUNCTIONS IN SCHEMA {} FROM {};',
            'ALTER DEFAULT PRIVILEGES IN SCHEMA {} REVOKE ALL PRIVILEGES ON TABLES FROM {}',
            'ALTER DEFAULT PRIVILEGES IN SCHEMA {} REVOKE ALL PRIVILEGES ON SEQUENCES FROM {}',
            'ALTER DEFAULT PRIVILEGES IN SCHEMA {} REVOKE ALL PRIVILEGES ON FUNCTIONS FROM {}',
        ]

        with connect(database_dsn(database_data)) as conn:
            conn.autocommit = True
            with conn.cursor() as cur:
                for usename in not_in_use_usernames:
                    try:
                        logger.info(
                            'delete_unused_datasets_users: revoking credentials for %s',
                            usename,
                        )

                        # Multiple concurrent GRANT CONNECT on the same database can cause
                        # "tuple concurrently updated" errors
                        with cache.lock(
                            f'database-grant-connect-{database_name}--v2',
                            blocking_timeout=3,
                        ):
                            cur.execute(
                                sql.SQL('REVOKE CONNECT ON DATABASE {} FROM {};').format(
                                    sql.Identifier(database_name),
                                    sql.Identifier(usename),
                                )
                            )
                            cur.execute(
                                sql.SQL('REVOKE ALL PRIVILEGES ON DATABASE {} FROM {};').format(
                                    sql.Identifier(database_name),
                                    sql.Identifier(usename),
                                )
                            )

                        for schema in schemas:
                            with cache.lock(
                                f'database-grant--{database_name}--{schema}--v2',
                                blocking_timeout=3,
                            ):
                                for schema_revoke in schema_revokes:
                                    try:
                                        cur.execute(
                                            sql.SQL(schema_revoke).format(
                                                sql.Identifier(schema),
                                                sql.Identifier(usename),
                                            )
                                        )
                                    except Exception:
                                        # This is likely to happen for private schemas where
                                        # the current user does not have revoke privileges.
                                        # We carry on in a best effort to remove the user
                                        logger.info(
                                            'delete_unused_datasets_users: Unable to %s %s %s',
                                            schema_revoke,
                                            schema,
                                            usename,
                                        )

                        logger.info('delete_unused_datasets_users: dropping user %s', usename)
                        cur.execute(sql.SQL('DROP USER {};').format(sql.Identifier(usename)))
                    except Exception:
                        logger.exception(
                            'delete_unused_datasets_users: Failed deleting %s', usename
                        )
                    else:
                        logger.info(
                            'delete_unused_datasets_users: revoked credentials for and dropped %s',
                            usename,
                        )

    logger.info('delete_unused_datasets_users: End')

def _get_dsn(self):
    return database_dsn(settings.DATABASES_DATA['my_database'])

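# `database_dsn` itself is not shown in this section. A minimal sketch under the assumption
# that it simply turns a Django-style DATABASES_DATA entry into a libpq keyword/value
# connection string for psycopg2; the exact keys used and any sslmode handling in the real
# helper may differ.
def database_dsn(database_data):
    return (
        f"host={database_data['HOST']} port={database_data['PORT']} "
        f"dbname={database_data['NAME']} user={database_data['USER']} "
        f"password={database_data['PASSWORD']}"
    )
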