def query(self, q): _conn_args = { 's3_staging_dir': self.staging_dir, 'database': self.database } if self.access_key is not None and self.secret_key is not None: _conn_args['aws_access_key_id'] = self.access_key _conn_args['aws_secret_access_key'] = self.secret_key if self.region_name is not None: _conn_args['region_name'] = self.region_name conn = connect(**_conn_args) cursor = conn.cursor() cursor.execute(q) # Load query results into Pandas DataFrame and show results df = as_pandas(cursor) col_map = {} for col in df.columns: col_map[col] = col return df, col_map
def test_as_pandas_boolean_na_values(self, cursor): cursor.execute(""" SELECT * FROM boolean_na_values """) df = as_pandas(cursor) rows = [tuple([row["a"], row["b"]]) for _, row in df.iterrows()] self.assertEqual(rows, [(True, False), (False, None), (None, None)])
def query_esbl_bacteria_label(observation_window_hours): query = """ select hadm_id, max(RESISTANT_BACT) resistant_label from ( WITH admissions AS ( SELECT admits.subject_id, admits.hadm_id, admits.admittime, admits.deathtime, microb.charttime, CASE WHEN admits.deathtime < microb.charttime THEN 1 ELSE 0 END AS death_before_rslt, date_diff('hour', admits.admittime, microb.charttime) AS time_to_rslt, CASE WHEN microb.interpretation in ('R','I') THEN 1 ELSE 0 END AS RESISTANT_YN, CASE WHEN microb.interpretation in ('R','I','S') THEN org_itemid ELSE 0 END AS RESISTANT_BACT, CASE WHEN microb.interpretation in ('S') THEN org_itemid ELSE 0 END AS SENSITIVE_BACT FROM mimiciii.admissions admits INNER JOIN mimiciii.microbiologyevents microb ON microb.hadm_id = admits.hadm_id WHERE ab_name in -- ('CEFTAZIDIME') ('CEFAZOLIN', 'CEFEPIME', 'CEFPODOXIME', 'CEFTAZIDIME', 'CEFTRIAXONE', 'CEFUROXIME') AND org_itemid in ( 80004, -- KLEBSIELLA PNEUMONIAE 80026, -- PSEUDOMONAS AERUGINOSA 80005, -- KLEBSIELLA OXYTOCA 80017, -- PROTEUS MIRABILIS 80040, -- NEISSERIA GONORRHOEAE 80008, -- ENTEROBACTER CLOACAE 80007, -- ENTEROBACTER AEROGENES 80002 -- ESCHERICHIA COLI ) ) SELECT admissions.subject_id, admissions.hadm_id, admissions.admittime, admissions.charttime, admissions.time_to_rslt, RESISTANT_YN, RESISTANT_BACT, SENSITIVE_BACT FROM admissions WHERE admissions.time_to_rslt is not null and admissions.time_to_rslt > %(time_window_hours)d ) a group by 1 """ params = { 'time_window_hours': observation_window_hours } cursor.execute(query, params) df = as_pandas(cursor) return df
def get_data_from_session(self, session_id: id): cursor = self.connection.cursor() query = f"select * from {self.TABLE_NAME} where sessionid = {session_id}" cursor.execute(query, cache_expiration_time=self.CACHE_EXPIRATION_SEC) df = as_pandas(cursor) df["time"] = df["time"] + pd.Timedelta(hours=self.UTC_OFFSET) return df
def query_all_pts_within_observation_window(observation_window_hours): """ Query to select all patients and link them to patient's admission time """ table_name = create_all_pts_within_observation_window(observation_window_hours) query = f"select * from {table_name}" cursor.execute(query) df = as_pandas(cursor) return df, table_name
def query_pts_multi_bacteria(observation_window_hours): """ Query to select multi bacteria microbiology tests and link them to patient's admission time """ query = """ WITH admissions AS ( SELECT admits.subject_id, admits.hadm_id, admits.admittime, admits.deathtime, microb.charttime, microb.org_itemid org_id, CASE WHEN admits.deathtime < microb.charttime THEN 1 ELSE 0 END AS death_before_rslt, date_diff('hour', admits.admittime, microb.charttime) AS time_to_rslt FROM mimiciii.admissions admits INNER JOIN mimiciii.microbiologyevents microb ON microb.hadm_id = admits.hadm_id WHERE microb.spec_itemid is not null and charttime is not null and ( org_itemid in ( 80293, -- positive for MRSA 80004, -- KLEBSIELLA PNEUMONIAE 80026, -- PSEUDOMONAS AERUGINOSA 80005, -- KLEBSIELLA OXYTOCA 80017, -- PROTEUS MIRABILIS 80040, -- NEISSERIA GONORRHOEAE 80008, -- ENTEROBACTER CLOACAE 80007, -- ENTEROBACTER AEROGENES 80002 ) or (spec_type_desc = 'MRSA SCREEN' and org_itemid is null) -- negative for mrsa ) ) SELECT admissions.subject_id, admissions.hadm_id, admissions.admittime, admissions.charttime, admissions.time_to_rslt, org_id FROM admissions WHERE admissions.time_to_rslt is not null and admissions.time_to_rslt > %(time_window_hours)d """ params = { 'time_window_hours': observation_window_hours } cursor.execute(query, params) df = as_pandas(cursor) return df
def test_as_pandas_integer_na_values(self, cursor): cursor.execute(""" SELECT * FROM integer_na_values """) df = as_pandas(cursor, coerce_float=True) rows = [tuple([row["a"], row["b"]]) for _, row in df.iterrows()] # TODO AssertionError: Lists differ: # [(1.0, 2.0), (1.0, nan), (nan, nan)] != [(1.0, 2.0), (1.0, nan), (nan, nan)] # self.assertEqual(rows, [ # (1.0, 2.0), # (1.0, np.nan), # (np.nan, np.nan), # ]) np.testing.assert_array_equal(rows, [(1, 2), (1, np.nan), (np.nan, np.nan)])
def query_all_pts(observation_window_hours): """ load all patients that are possible for consideration and link them to patient's admission time """ query = """ WITH admissions AS ( SELECT admits.subject_id, admits.hadm_id, admits.admittime, admits.deathtime, (admits.admittime + interval %(time_window_hours)s hour + interval '1' hour) charttime, (admits.admittime + interval %(time_window_hours)s hour) as diff, CASE WHEN admits.deathtime < (admits.admittime + interval %(time_window_hours)s hour) THEN 1 ELSE 0 END AS death_before_rslt, 1 time_to_rslt, null org_itemid, null org_name, 0 RESISTANT_YN, 1 SENSITIVE_YN FROM mimiciii.admissions admits ) SELECT admissions.subject_id, admissions.hadm_id, admissions.admittime, admissions.charttime, admissions.diff, admissions.time_to_rslt, RESISTANT_YN, SENSITIVE_YN FROM admissions WHERE admissions.death_before_rslt != 1 order by random() limit 10000 """ params = { 'time_window_hours': str(observation_window_hours) } cursor.execute(query, params) df = as_pandas(cursor) return df
def get_sessions(self): cursor = self.connection.cursor() query = f""" select sessionid, min(time) as startTime, max(time) as endTime, date_diff('minute', min(time), max(time)) as durationMin from {self.TABLE_NAME} group by sessionid order by startTime desc """ cursor.execute(query, cache_expiration_time=self.CACHE_EXPIRATION_SEC) df = as_pandas(cursor) df["startTime"] = df["startTime"] + pd.Timedelta(hours=self.UTC_OFFSET) df["endTime"] = df["endTime"] + pd.Timedelta(hours=self.UTC_OFFSET) return df
def test_as_pandas(self, cursor): cursor.execute(""" SELECT col_boolean , col_tinyint , col_smallint , col_int , col_bigint , col_float , col_double , col_string , col_timestamp , CAST(col_timestamp AS time) AS col_time , col_date , col_binary , col_array , CAST(col_array AS json) AS col_array_json , col_map , CAST(col_map AS json) AS col_map_json , col_struct , col_decimal FROM one_row_complex """) df = as_pandas(cursor) rows = [ tuple([ row["col_boolean"], row["col_tinyint"], row["col_smallint"], row["col_int"], row["col_bigint"], row["col_float"], row["col_double"], row["col_string"], row["col_timestamp"], row["col_time"], row["col_date"], row["col_binary"], row["col_array"], row["col_array_json"], row["col_map"], row["col_map_json"], row["col_struct"], row["col_decimal"], ]) for _, row in df.iterrows() ] expected = [( True, 127, 32767, 2147483647, 9223372036854775807, 0.5, 0.25, "a string", datetime(2017, 1, 1, 0, 0, 0), datetime(2017, 1, 1, 0, 0, 0).time(), date(2017, 1, 2), b"123", "[1, 2]", [1, 2], "{1=2, 3=4}", { "1": 2, "3": 4 }, "{a=1, b=2}", Decimal("0.1"), )] self.assertEqual(rows, expected)