def list_all_tables(db_uri=None):
    '''
    Print the names of every table stored in anon.db

    Parameters
    ----------
    db_uri : string or None
        optional URI of the database; defaults to the packaged anon.db.
        Added so that the function can be tested against an in-memory
        SQLite DB.

    Returns
    -------
    None; a plain Python list of table names is printed to the console
    '''

    if db_uri is None:
        db_uri = "file:" + package_dir("db", "anon.db") + "?mode=rw"

    with closing(sqlite3.connect(db_uri, uri=True)) as conn:
        cursor = conn.cursor()
        cursor.execute('SELECT name from sqlite_master where type= "table"')
        print([row[0] for row in cursor.fetchall()])
def number_of_table_columns(table_name, db_uri=None):
    '''
    Return the number of columns of a given table

    Parameters
    ----------
    table_name : str
        table in anon.db to query
    db_uri : string or None
        optional URI of the database; defaults to the packaged anon.db

    Returns
    -------
    Count of columns (0 if the table doesn't exist)
    '''

    if db_uri is None:
        db_uri = "file:" + package_dir("db", "anon.db") + "?mode=rw"

    #PRAGMA TABLE_INFO yields one row per column of the table
    with closing(sqlite3.connect(db_uri, uri=True)) as conn:
        cursor = conn.cursor()
        cursor.execute(f"PRAGMA TABLE_INFO({table_name})")
        return len(cursor.fetchall())
def test_overlapping_hierarchical_and_predefined_linked_columns(self):
    '''
    When a user-defined linked columns list conflicts with hierarchical
    linkage, the user-defined list takes priority: its columns are
    excluded from the discovery phase of hierarchical linking, so only
    one linked group should remain.

    hb_code is deliberately included - without it, hb_code would be
    linked to loc_name (correctly, but unexpectedly).
    '''

    linked_cols = ["hb_name", "hb_code", "age"]

    args = {
        "command": "fromdata",
        "source": Path(package_dir("sample", "_data", "inpatients.csv")),
        "verbose": True,
        "inline_limit": 30,
        "equal_weights": True,
        "skip_columns": [],
        "linked_columns": linked_cols,
    }

    exhibit = tm.newExhibit(**args)
    exhibit.read_data()
    exhibit.generate_spec()

    #remember the spec ID so temp tables are cleaned up after the run
    self._temp_tables.append(exhibit.spec_dict["metadata"]["id"])

    self.assertEqual(len(exhibit.spec_dict["linked_columns"]), 1)
    self.assertListEqual(
        exhibit.spec_dict["linked_columns"][0][1], linked_cols)
def number_of_table_rows(table_name, column=None, db_uri=None):
    '''
    Return the number of rows in the given table

    Parameters
    ----------
    table_name : str
        table in anon.db to query; a dotted "table.column" string is
        also accepted and overrides the column argument
    column : str
        optional. column name in the given table; when given, distinct
        values of that column are counted instead of rows
    db_uri : string or None
        optional URI of the database; defaults to the packaged anon.db

    Returns
    -------
    Count of rows
    '''

    if db_uri is None:
        db_uri = "file:" + package_dir("db", "anon.db") + "?mode=rw"

    #a dotted table name wins over an explicitly passed column
    if "." in table_name:
        table_name, column = table_name.split(".")

    if column:
        sql = f"SELECT COUNT(DISTINCT {column}) FROM {table_name}"
    else:
        sql = f"SELECT COUNT() FROM {table_name}"

    #the query produces a single tuple: (count, )
    with closing(sqlite3.connect(db_uri, uri=True)) as conn:
        cursor = conn.cursor()
        cursor.execute(sql)
        return cursor.fetchall()[0][0]
def test_spec_generation_with_predefined_linked_columns(self):
    '''
    User-defined linked columns must always occupy position 0 in the
    linked_columns section of the YAML specification.
    '''

    linked_cols = ["sex", "age"]

    args = {
        "command": "fromdata",
        "source": Path(package_dir("sample", "_data", "inpatients.csv")),
        "verbose": True,
        "inline_limit": 30,
        "equal_weights": True,
        "skip_columns": [],
        "linked_columns": linked_cols,
    }

    exhibit = tm.newExhibit(**args)
    exhibit.read_data()
    exhibit.generate_spec()

    #remember the spec ID so temp tables are cleaned up after the run
    self._temp_tables.append(exhibit.spec_dict["metadata"]["id"])

    self.assertListEqual(
        exhibit.spec_dict["linked_columns"][0][1], linked_cols)
def test_connection_to_sqlite(self):
    '''
    Connecting to the packaged anon.db must yield a live
    sqlite3.Connection object.
    '''

    uri = "file:" + package_dir("db", "anon.db") + "?mode=rw"

    with closing(sqlite3.connect(uri, uri=True)) as conn:
        assert isinstance(conn, sqlite3.Connection)
def test_read_data_func_reads_csv_from_source_path(self):
    '''
    Feeding "mock" command line arguments to the tool must load the
    source .csv into a pandas DataFrame on the instance.
    '''

    args = {
        "command": "fromdata",
        "source": Path(package_dir("sample", "_data", "inpatients.csv")),
        "verbose": True,
        "skip_columns": [],
    }

    exhibit = tm.newExhibit(**args)
    exhibit.read_data()

    assert isinstance(exhibit.df, pd.DataFrame)
def insert_table(file_path, table_name=None, db_uri=None):
    '''
    Parse a .csv file and insert it into anon.db under its stem name

    Parameters
    ----------
    file_path : string
        Any format that Pandas can read is potentially suitable,
        but only .csv is currently implemented
    table_name : string
        Optional parameter if you don't want to use filename's
        stem part as the table name
    db_uri : string or None
        optional URI of the database; defaults to the packaged anon.db

    Returns
    -------
    No return; prints out confirmation if insertion is successful.
    If file_path fails path_checker validation, nothing happens.
    '''

    if db_uri is None:
        db_uri = "file:" + package_dir("db", "anon.db") + "?mode=rw"

    #validate the path once and reuse the result; previously
    #path_checker was called twice for the same path
    checked_path = path_checker(file_path)

    if checked_path:

        if table_name is None:
            table_name = checked_path.stem

        #when creating a .csv from piping it from console on Windows,
        #encoding is changed from UTF-8 to ANSI
        #NOTE(review): "ANSI" is not a registered codec on all platforms -
        #presumably this fallback is only reachable on Windows; confirm,
        #or consider "cp1252" / "mbcs" instead.
        try:
            table_df = pd.read_csv(file_path)
        except UnicodeDecodeError:
            table_df = pd.read_csv(file_path, encoding="ANSI")

        conn = sqlite3.connect(db_uri, uri=True)

        #if_exists="replace" silently overwrites any existing table
        with closing(conn):
            table_df.to_sql(
                name=table_name,
                con=conn,
                if_exists="replace",
                index=False,
            )

        print(f"Successfully inserted a new table {table_name}")
def test_less_than_two_predefined_linked_columns_raiser_error(self):
    '''
    Supplying fewer than two user-defined linked columns makes no
    sense and must raise at instantiation.
    '''

    args = {
        "command": "fromdata",
        "source": Path(package_dir("sample", "_data", "inpatients.csv")),
        "verbose": True,
        "inline_limit": 30,
        "equal_weights": True,
        "skip_columns": [],
        "linked_columns": ["hb_name"],
    }

    self.assertRaises(Exception, tm.newExhibit, **args)
def table_info(table_name, db_uri=None):
    '''
    Print out basic information about a given table

    Parameters
    ----------
    table_name : string
        the name of a single table in the database
    db_uri : string or None
        optional URI of the database; defaults to the packaged anon.db.
        Added so that the function can be tested with an in-memory DB.

    Returns
    -------
    None; prints the headers followed by every row of the table.
    Values are comma separated so the output can be piped straight
    into a new .csv file. Unknown tables print a short message instead.
    '''

    if db_uri is None:
        db_uri = "file:" + package_dir("db", "anon.db") + "?mode=rw"

    with closing(sqlite3.connect(db_uri, uri=True)) as conn:

        cursor = conn.cursor()
        cursor.execute('SELECT name from sqlite_master where type= "table"')
        existing = [row[0] for row in cursor.fetchall()]

        #guard clause: only query tables that actually exist
        if table_name not in existing:
            print(f"{table_name} not in schema")
            return

        cursor.execute(f"SELECT * FROM {table_name}")
        rows = cursor.fetchall()

        #PRAGMA table_info puts the column name at index 1 of each row
        cursor.execute(f"PRAGMA table_info({table_name})")
        print(",".join(col[1] for col in cursor.fetchall()))
        print(*[",".join(str(value) for value in row) for row in rows],
              sep="\n")
def drop_tables(table_names, db_uri=None):
    '''
    Drop named table(s) from anon.db

    Parameters
    ----------
    table_names : list of table names or regex strings
        a bare string is also accepted and treated as a one-element list
    db_uri : string or None
        optional URI of the database; defaults to the packaged anon.db

    Returns
    -------
    Prints outcome (if successful) to console

    Note that in CLI, multiple table names must be separated with a space
    '''

    if db_uri is None:
        db_uri = "file:" + package_dir("db", "anon.db") + "?mode=rw"

    if not isinstance(table_names, list):
        table_names = [table_names]

    conn = sqlite3.connect(db_uri, uri=True)

    with closing(conn):
        c = conn.cursor()
        c.execute('SELECT name from sqlite_master where type= "table"')
        source_tables = [tbl[0] for tbl in c.fetchall()]

        #track what's been dropped: previously, two patterns matching the
        #same table issued DROP TABLE twice, raising OperationalError
        dropped = set()

        for pattern in table_names:
            for source_table in source_tables:
                if source_table in dropped:
                    continue
                if re.search(pattern, source_table):
                    c.execute(f"DROP TABLE {source_table}")
                    dropped.add(source_table)
                    print(f"Successfully deleted table {source_table}")

        #commit and VACUUM once at the end rather than after every drop
        if dropped:
            conn.commit()
            conn.execute("VACUUM")
def purge_temp_tables(db_uri=None):
    '''
    Delete all tables whose name contains "temp" from anon.db

    Note that matching is by substring, not by "temp_" prefix, so a
    table called e.g. "attempts" would also be removed.

    Parameters
    ----------
    db_uri : string or None
        added so that we can test the function using SQLite
        in-memory DB.

    Returns
    -------
    Prints out confirmation with the number of tables dropped
    '''

    if db_uri is None:
        db_uri = "file:" + package_dir("db", "anon.db") + "?mode=rw"

    conn = sqlite3.connect(db_uri, uri=True)

    with closing(conn):
        c = conn.cursor()
        c.execute('SELECT name from sqlite_master where type= "table"')
        table_names = c.fetchall()

        count = 0

        for table in table_names:
            if "temp" in table[0]:
                c.execute(f"DROP TABLE {table[0]}")
                count += 1

        #commit and VACUUM once after the loop; previously both ran
        #for every dropped table, re-building the DB file each time
        if count:
            conn.commit()
            conn.execute("VACUUM")

        print(f"Successfully deleted {count} tables")
def test_output_spec_respectes_equal_weights_argument(self):
    '''
    When equal_weights=True is passed, the weights recorded against each
    original value in the generated spec are identical (0.100 for every
    numerical column in the expected row below) rather than derived
    from the data.
    '''

    args = dict(command="fromdata",
                source=Path(package_dir("sample", "_data", "inpatients.csv")),
                verbose=True,
                inline_limit=30,
                equal_weights=True,
                skip_columns=[])

    xA = tm.newExhibit(**args)
    xA.read_data()
    xA.generate_spec()

    #row 2 of original_values holds the "10-19" age band with its weights
    expected = "10-19 | 0.100 | 0.100 | 0.100 | 0.100"
    result = xA.spec_dict["columns"]["age"]["original_values"][2]

    # save the spec ID to delete temp tables after tests finish
    self._temp_tables.append(xA.spec_dict["metadata"]["id"])

    self.assertEqual(expected, result)
''' Module referencing sample data for export inpatients.csv is sourced from ISD Scotland Open Data page: http://www.isdscotland.org/Health-Topics/Hospital-Care/Inpatient-and-Day-Case-Activity/ prescribing.csv is sourced from NHS Scotland Open Data page: https://www.opendata.nhs.scot/dataset/prescriptions-in-the-community ''' # External imports import pandas as pd import yaml # Exhibit imports from exhibit.core.utils import package_dir #Load data inpatients_data = pd.read_csv(package_dir("sample", "_data", "inpatients.csv")) inpatients_anon = pd.read_csv(package_dir("sample", "_data", "inpatients_anon.csv"), parse_dates=["quarter_date"]) prescribing_data = pd.read_csv(package_dir("sample", "_data", "prescribing.csv"), parse_dates=["PaidDateMonth"]) #Load specs with open(package_dir("sample", "_spec", "inpatients_demo.yml")) as f: inpatients_spec = yaml.safe_load(f)
def create_temp_table(table_name, col_names, data, strip_whitespace=True,
                      db_uri=None, return_table=False):
    '''
    Create a lookup table in the anon.db SQLite3 database

    Parameters
    ----------
    table_name : str
        make sure there are no spaces in the table_name as they are not
        allowed
    col_names : list or any other iterable
        column names also can't contain spaces
    data : list of tuples
        each tuple containing row's worth of data
    strip_whitespace : bool
        if the table is for user defined linked column, don't try to
        strip whitespace
    db_uri : str
        optional. During testing can pass an in-memory uri
    return_table : bool
        optional. Sometimes useful to return all values from the newly
        created table

    Occasionally it's useful to create a temporary table for linked
    columns that user doesn't want to anonymise, like Specialty and
    Specialty Group. To ensure that each Specialty has the correct
    Specialty Group, we can store this information in a temporary table
    in the anon.db

    The "1" in the "1-to-many" should always be the first column.

    Make sure you add "temp_" prefix to your table if you want it to be
    discovered by the automatic clean-up.

    On success returns True or fetches all records if return_table
    optional parameter is set to True.
    '''

    if db_uri is None:
        db_uri = "file:" + package_dir("db", "anon.db") + "?mode=rw"

    #materialise col_names so any iterable works even though it's
    #consumed more than once below
    col_names = list(col_names)

    if strip_whitespace:
        #drop any row containing a NaN value, then strip whitespace.
        #The previous check (np.NaN not in x) relied on identity/equality,
        #which never matches other float NaN objects (NaN != NaN), so such
        #rows survived the filter and crashed y.strip() with AttributeError;
        #np.NaN is also removed in NumPy 2.0.
        data = [
            tuple(y.strip() for y in x)
            for x in data
            if not any(isinstance(y, float) and np.isnan(y) for y in x)
        ]

    #", ".join handles the single-column case correctly, so no special
    #branch is needed
    col_list = ", ".join(col_names)
    params = ", ".join(["?"] * len(col_names))

    drop_sql = f"DROP TABLE IF EXISTS {table_name}"
    create_sql = f"CREATE TABLE {table_name} ({col_list})"
    insert_sql = f"INSERT INTO {table_name} VALUES ({params})"

    conn = sqlite3.connect(db_uri, uri=True)

    with closing(conn):
        c = conn.cursor()
        c.execute(drop_sql)
        c.execute(create_sql)
        c.executemany(insert_sql, data)
        conn.commit()

        if return_table:
            c.execute(f"SELECT * FROM {table_name}")
            return c.fetchall()

    return True
def query_anon_database(table_name, column=None, size=None, order="rowid",
    db_uri=None, exclude_missing=False):
    '''
    Query anon.db and return a nice dataframe or series

    Parameters
    ----------
    table_name : str
        table_name comes in a fixed format with temp_ prefix followed
        by the spec id and then either the linked group number or the
        column name in case of non-linked, many-valued columns
    column : str
        optional. Single column to be extracted from the given table
    size : int
        optional. The parameter to go into LIMIT statement
    order : str
        optional. The column to order the results by; defaults to rowid
    db_uri : str
        optional. For testing.
    exclude_missing : bool
        optional. Set to True to exclude the missing data placeholder
        value from the column, if SQL is for the single column only

    Returns
    -------
    A dataframe with original column names
    '''

    if db_uri is None:
        db_uri = "file:" + package_dir("db", "anon.db") + "?mode=rw"

    #column can come in as a string, an empty list or as ["string"]
    if column and isinstance(column, list):
        column = column[0]

    #assemble the SQL clause by clause
    select_target = str(column or "*")
    order_clause = f"ORDER BY {order}"
    limit_clause = f"LIMIT {size}" if size else ""

    if column and exclude_missing:
        filter_clause = f"WHERE {column} != '{MISSING_DATA_STR}'"
    else:
        filter_clause = ""

    sql = f"""
    SELECT DISTINCT {select_target}
    FROM {table_name}
    {filter_clause}
    {order_clause}
    {limit_clause}
    """

    with closing(sqlite3.connect(db_uri, uri=True)) as conn:
        cursor = conn.cursor()
        cursor.execute(sql)
        headers = [description[0] for description in cursor.description]
        rows = cursor.fetchall()

    if len(headers) == 1:
        frame = pd.DataFrame(data={headers[0]: [row[0] for row in rows]})
    else:
        frame = pd.DataFrame(data=rows, columns=headers)

    #translate "$" back into spaces in the column names
    frame.rename(columns=lambda name: name.replace("$", " "), inplace=True)

    return frame