示例#1
0
    def query(self, q):
        _conn_args = {
            's3_staging_dir': self.staging_dir,
            'database': self.database
        }

        if self.access_key is not None and self.secret_key is not None:
            _conn_args['aws_access_key_id'] = self.access_key
            _conn_args['aws_secret_access_key'] = self.secret_key

        if self.region_name is not None:
            _conn_args['region_name'] = self.region_name

        conn = connect(**_conn_args)
        cursor = conn.cursor()

        cursor.execute(q)

        # Load query results into Pandas DataFrame and show results
        df = as_pandas(cursor)

        col_map = {}
        for col in df.columns:
            col_map[col] = col

        return df, col_map
示例#2
0
 def test_as_pandas_boolean_na_values(self, cursor):
     cursor.execute("""
         SELECT * FROM boolean_na_values
         """)
     df = as_pandas(cursor)
     rows = [tuple([row["a"], row["b"]]) for _, row in df.iterrows()]
     self.assertEqual(rows, [(True, False), (False, None), (None, None)])
def query_esbl_bacteria_label(observation_window_hours):
    query = """
    select hadm_id, max(RESISTANT_BACT) resistant_label from (
    WITH admissions AS (
    SELECT
        admits.subject_id,
        admits.hadm_id,
        admits.admittime,
        admits.deathtime, 
        microb.charttime,
        CASE WHEN admits.deathtime < microb.charttime THEN 1 ELSE 0 END AS death_before_rslt,
        date_diff('hour', admits.admittime, microb.charttime) AS time_to_rslt,
        CASE WHEN microb.interpretation in ('R','I') THEN 1 ELSE 0 END AS RESISTANT_YN,
        CASE WHEN microb.interpretation in ('R','I','S') THEN org_itemid ELSE 0 END AS RESISTANT_BACT,
        CASE WHEN microb.interpretation in ('S') THEN org_itemid ELSE 0 END AS SENSITIVE_BACT
    
    FROM mimiciii.admissions admits
    INNER JOIN mimiciii.microbiologyevents microb
        ON microb.hadm_id = admits.hadm_id 
    WHERE ab_name in 
        -- ('CEFTAZIDIME') 
        ('CEFAZOLIN', 'CEFEPIME', 'CEFPODOXIME', 'CEFTAZIDIME', 'CEFTRIAXONE', 'CEFUROXIME')
        AND
          org_itemid in (
                    80004, -- KLEBSIELLA PNEUMONIAE
                    80026, -- PSEUDOMONAS AERUGINOSA
                    80005, -- KLEBSIELLA OXYTOCA
                    80017, -- PROTEUS MIRABILIS
                    80040, -- NEISSERIA GONORRHOEAE
                    80008, -- ENTEROBACTER CLOACAE
                    80007, -- ENTEROBACTER AEROGENES
                    80002 -- ESCHERICHIA COLI
                    )
        )
    SELECT         
        admissions.subject_id,
        admissions.hadm_id,
        admissions.admittime,
        admissions.charttime,
        admissions.time_to_rslt,
        RESISTANT_YN,
        RESISTANT_BACT,
        SENSITIVE_BACT
    FROM admissions
    WHERE admissions.time_to_rslt is not null
          and admissions.time_to_rslt > %(time_window_hours)d
    
    ) a
    group by 1
    """
    params = {
        'time_window_hours': observation_window_hours
    }
    cursor.execute(query, params)
    df = as_pandas(cursor)

    return df
示例#4
0
    def get_data_from_session(self, session_id: id):
        cursor = self.connection.cursor()
        query = f"select * from {self.TABLE_NAME} where sessionid = {session_id}"

        cursor.execute(query, cache_expiration_time=self.CACHE_EXPIRATION_SEC)

        df = as_pandas(cursor)
        df["time"] = df["time"] + pd.Timedelta(hours=self.UTC_OFFSET)
        return df
def query_all_pts_within_observation_window(observation_window_hours):
    """
    Query to select all patients
    and link them to patient's admission time
    """
    table_name = create_all_pts_within_observation_window(observation_window_hours)
    query = f"select * from {table_name}"
    cursor.execute(query)
    df = as_pandas(cursor)
    return df, table_name
def query_pts_multi_bacteria(observation_window_hours):
    """
    Query to select multi bacteria microbiology tests
    and link them to patient's admission time
    """
    query = """
    WITH admissions AS (
    SELECT
        admits.subject_id,
        admits.hadm_id,
        admits.admittime,
        admits.deathtime, 
        microb.charttime,
        microb.org_itemid org_id,
        CASE WHEN admits.deathtime < microb.charttime THEN 1 ELSE 0 END AS death_before_rslt,
        date_diff('hour', admits.admittime, microb.charttime) AS time_to_rslt
    FROM mimiciii.admissions admits
    INNER JOIN mimiciii.microbiologyevents microb
        ON microb.hadm_id = admits.hadm_id 
    WHERE microb.spec_itemid is not null
            and charttime is not null
            and (
                     org_itemid in (
                    80293, -- positive for MRSA
                    80004, -- KLEBSIELLA PNEUMONIAE
                    80026, -- PSEUDOMONAS AERUGINOSA
                    80005, -- KLEBSIELLA OXYTOCA
                    80017, -- PROTEUS MIRABILIS
                    80040, -- NEISSERIA GONORRHOEAE
                    80008, -- ENTEROBACTER CLOACAE
                    80007, -- ENTEROBACTER AEROGENES
                    80002
                    )
                    or (spec_type_desc = 'MRSA SCREEN' and org_itemid is null) -- negative for mrsa
                 )
      )
    SELECT         
        admissions.subject_id,
        admissions.hadm_id,
        admissions.admittime,
        admissions.charttime,
        admissions.time_to_rslt,
        org_id
    FROM admissions
    WHERE admissions.time_to_rslt is not null
          and admissions.time_to_rslt > %(time_window_hours)d

"""
    params = {
        'time_window_hours': observation_window_hours
    }
    cursor.execute(query, params)
    df = as_pandas(cursor)

    return df
示例#7
0
 def test_as_pandas_integer_na_values(self, cursor):
     cursor.execute("""
         SELECT * FROM integer_na_values
         """)
     df = as_pandas(cursor, coerce_float=True)
     rows = [tuple([row["a"], row["b"]]) for _, row in df.iterrows()]
     # TODO AssertionError: Lists differ:
     #  [(1.0, 2.0), (1.0, nan), (nan, nan)] != [(1.0, 2.0), (1.0, nan), (nan, nan)]
     # self.assertEqual(rows, [
     #     (1.0, 2.0),
     #     (1.0, np.nan),
     #     (np.nan, np.nan),
     # ])
     np.testing.assert_array_equal(rows, [(1, 2), (1, np.nan),
                                          (np.nan, np.nan)])
def query_all_pts(observation_window_hours):
    """
    load all patients that are possible for consideration
    
    and link them to patient's admission time
    """
    query = """
    WITH admissions AS (
    SELECT
        admits.subject_id,
        admits.hadm_id,
        admits.admittime,
        admits.deathtime, 
        (admits.admittime + interval %(time_window_hours)s hour + interval '1' hour) charttime,
        (admits.admittime + interval %(time_window_hours)s hour) as diff,
        CASE WHEN admits.deathtime < (admits.admittime + interval %(time_window_hours)s hour) THEN 1 ELSE 0 END AS death_before_rslt,
        1 time_to_rslt,
        null org_itemid, 
        null org_name,
        0 RESISTANT_YN,
        1 SENSITIVE_YN
    FROM mimiciii.admissions admits
    )
    SELECT         
        admissions.subject_id,
        admissions.hadm_id,
        admissions.admittime,
        admissions.charttime,
        admissions.diff,
        admissions.time_to_rslt,
        RESISTANT_YN,
        SENSITIVE_YN
    FROM admissions
    WHERE 
          admissions.death_before_rslt != 1 
    order by random()
    limit 10000
    
    """
    params = {
        'time_window_hours': str(observation_window_hours)
    }
    cursor.execute(query, params)
    df = as_pandas(cursor)

    return df
示例#9
0
    def get_sessions(self):
        cursor = self.connection.cursor()

        query = f"""
        select sessionid,
        min(time) as startTime,
        max(time) as endTime,
        date_diff('minute', min(time), max(time)) as durationMin
        from {self.TABLE_NAME} group by sessionid
        order by startTime desc
        """

        cursor.execute(query, cache_expiration_time=self.CACHE_EXPIRATION_SEC)

        df = as_pandas(cursor)
        df["startTime"] = df["startTime"] + pd.Timedelta(hours=self.UTC_OFFSET)
        df["endTime"] = df["endTime"] + pd.Timedelta(hours=self.UTC_OFFSET)
        return df
示例#10
0
 def test_as_pandas(self, cursor):
     cursor.execute("""
         SELECT
           col_boolean
           , col_tinyint
           , col_smallint
           , col_int
           , col_bigint
           , col_float
           , col_double
           , col_string
           , col_timestamp
           , CAST(col_timestamp AS time) AS col_time
           , col_date
           , col_binary
           , col_array
           , CAST(col_array AS json) AS col_array_json
           , col_map
           , CAST(col_map AS json) AS col_map_json
           , col_struct
           , col_decimal
         FROM one_row_complex
         """)
     df = as_pandas(cursor)
     rows = [
         tuple([
             row["col_boolean"],
             row["col_tinyint"],
             row["col_smallint"],
             row["col_int"],
             row["col_bigint"],
             row["col_float"],
             row["col_double"],
             row["col_string"],
             row["col_timestamp"],
             row["col_time"],
             row["col_date"],
             row["col_binary"],
             row["col_array"],
             row["col_array_json"],
             row["col_map"],
             row["col_map_json"],
             row["col_struct"],
             row["col_decimal"],
         ]) for _, row in df.iterrows()
     ]
     expected = [(
         True,
         127,
         32767,
         2147483647,
         9223372036854775807,
         0.5,
         0.25,
         "a string",
         datetime(2017, 1, 1, 0, 0, 0),
         datetime(2017, 1, 1, 0, 0, 0).time(),
         date(2017, 1, 2),
         b"123",
         "[1, 2]",
         [1, 2],
         "{1=2, 3=4}",
         {
             "1": 2,
             "3": 4
         },
         "{a=1, b=2}",
         Decimal("0.1"),
     )]
     self.assertEqual(rows, expected)