Example #1
    def extract(self):
        query = """
            SELECT hsp_acct_study_id, common_name, ord_num_value, result_flag_name
              FROM {}.bayes_m_vw_account_lab_results
        """.format(self._schema)
        engine = postgres.get_connection()

        res = pd.read_sql(query, engine)
        log.info('The queried table has %d rows.' % len(res))

        tests = res.pivot(index='hsp_acct_study_id',
                          columns='common_name',
                          values='ord_num_value')

        # Start with the Tabak features.
        scores = calculate_tabak_mortality_features(tests)

        # Boolean features for HOSPITAL score.
        scores['hosp_low_hemoglobin'] = tests['HEMOGLOBIN'] < 12
        scores['hosp_low_sodium'] = tests['SODIUM'] < 135

        # Result of most recent cocaine test (probably not as useful as "history of cocaine usage").
        scores['if_cocaine_bool'] = tests['COCAINE'].notna()

        # Simple statistics on abnormal test results.
        scores['num_total_results'] = res.groupby(
            'hsp_acct_study_id').common_name.count()
        scores['num_abnormal_results'] = \
            res[res.result_flag_name != ""].groupby("hsp_acct_study_id").result_flag_name.count()
        scores['pct_abnormal_results'] = \
            100.0 * scores['num_abnormal_results'] / scores['num_total_results']

        scores.fillna(0, inplace=True)

        return self.emit_df(scores)
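
To make the reshaping concrete: pivot turns the long per-test rows into one wide row per account, with one column per distinct common_name. A minimal sketch with invented values:

import pandas as pd

# Invented long-format lab results: one row per (account, test).
res = pd.DataFrame({
    'hsp_acct_study_id': [1, 1, 2],
    'common_name': ['HEMOGLOBIN', 'SODIUM', 'HEMOGLOBIN'],
    'ord_num_value': [11.2, 133.0, 14.1],
})
tests = res.pivot(index='hsp_acct_study_id',
                  columns='common_name',
                  values='ord_num_value')
print(tests['HEMOGLOBIN'] < 12)   # the same boolean feature as above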
Example #2
    def extract(self):
        query = """
            SELECT *
              FROM {}.bayes_m_vw_feature_comorbidities
        """.format(self._schema)

        engine = postgres.get_connection()

        res = pd.read_sql(query, engine)
        log.info('The queried table has %d rows.' % len(res))

        pivoted = pd.pivot_table(data=res,
                                 index='hsp_acct_study_id',
                                 values='weight', columns='condition_cat')
        log.info('The pivoted table has %d rows.' % len(pivoted))
        pivoted.columns = ['comor_' + c.lower() for c in pivoted.columns]

        df = pd.DataFrame(index=res.hsp_acct_study_id.unique())
        df[pivoted.columns] = pivoted
        df['charlson_index'] = df.apply(find_cci, axis=1)
        df['charlson_index_lace'] = df.charlson_index.apply(lambda s: s if s <= 3 else 5)

        # We needed the raw weight values to compute the CCI. Now replace every
        # non-null weight with 1 and every null with 0, so that each comorbid
        # condition becomes a boolean flag.
        df.fillna(0, inplace=True)
        for col in df.columns[:-2]:
            df[col] = df[col].astype('bool')

        return self.emit_df(df)
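
find_cci is defined elsewhere in the repository. Assuming the Charlson Comorbidity Index is computed as the sum of the per-condition weights (a common formulation, but an assumption about this codebase), a minimal sketch:

def find_cci(row):
    # Assumed implementation: sum the Charlson weights of all conditions
    # present for this account; missing conditions contribute 0.
    return row.fillna(0).sum()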
Example #3
def create_materialized_views(schema):
    """
    Create materialized views in the database from all views in the `views/materialized` folder.

    If a materialized view already exists with a given name, we simply skip creating it.
    But note that the existing view could be different from what's currently in the file
    (in this case, you should delete the view in the database before running this method).

    Also, this method doesn't handle refreshing existing materialized views in the database.
    If you want to refresh a materialized view, do it manually.
    """
    engine = postgres.get_connection()
    files = glob.glob(os.path.join('views/materialized', '*.sql'))
    for filename in files:
        view_name = _filename_to_viewname(filename, prefix="bayes_m_")
        content = _read_view_file(filename, schema)

        if engine.has_table(view_name, schema):
            log.info("Materialized view {}.{} already exists".format(schema, view_name))
            log.info("(If you want to update it, you must do so manually!)")
        else:
            log.info("Creating materialized view {}.{} ...".format(schema, view_name))
            start_time = time.time()  # Let's time it, because materialized views can take a while!
            engine.execute(CREATE_MATERIALIZED_VIEW_STR.format(schema, view_name, content))
            end_time = time.time()
            log.info("... success! (took %.2f sec to render)" % (end_time - start_time))
Example #4
    def extract(self):
        query = """
            SELECT *
              FROM {}.bayes_vw_feature_procedures
        """.format(self._schema)

        engine = postgres.get_connection()

        res = pd.read_sql(query, engine)
        log.info('The queried table has %d rows.' % len(res))

        df = pd.DataFrame()
        df['num_px'] = res.groupby(
            'hsp_acct_study_id').ccs_category_description.count()
        df.fillna(0, inplace=True)

        categories = res.dropna() \
                        .groupby('hsp_acct_study_id') \
                        .ccs_category_description \
                        .apply(lambda px: "|".join('px_' + p for p in px)) \
                        .str.get_dummies()
        df = pd.concat([df, categories.astype('bool')], axis=1)
        df.fillna(False, inplace=True)

        return self.emit_df(df)
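
The "|".join(...) plus str.get_dummies() combination is a compact one-hot encoding: get_dummies splits each string on '|' (its default separator) and emits one indicator column per distinct token. For example:

import pandas as pd

s = pd.Series({101: 'px_biopsy|px_dialysis', 102: 'px_biopsy'})
print(s.str.get_dummies())
#      px_biopsy  px_dialysis
# 101          1            1
# 102          1            0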
Example #5
    def extract(self):
        query = """
            SELECT hsp_acct_study_id, age,
                    if_female_bool,
                    race_name, if_hispanic_bool,
                   marital_status_name, if_intrptr_needed_bool
              FROM {}.bayes_vw_feature_demographics
        """.format(self._schema)

        engine = postgres.get_connection()

        res = pd.read_sql(query, engine)
        # Occasionally (in fewer than 5% of cases) there is more than one row
        # per patient; we keep the first row for each.
        res = res.groupby('hsp_acct_study_id').first()

        res['age^2'] = res.age.apply(lambda x: x**2)
        res['age^3'] = res.age.apply(lambda x: x**3)
        res['tabak_age'] = res.age.apply(lambda x: max(0, 0.4 * (x - 45)))
        res['race_cat'] = res.race_name.apply(race_from_string)
        res['marital_status_cat'] = res.marital_status_name.apply(
            marital_status_from_string)

        res.drop(['race_name', 'marital_status_name'], axis=1, inplace=True)

        return self.emit_df(res)
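
race_from_string and marital_status_from_string are defined elsewhere; presumably they collapse free-text values into a small set of categories. A hypothetical sketch (the category labels are invented):

def race_from_string(name):
    # Hypothetical normalizer for free-text race names.
    name = (name or '').lower()
    if 'white' in name:
        return 'white'
    if 'black' in name or 'african' in name:
        return 'black'
    return 'other'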
Example #6
def update_views(schema):
    """
    Update the views in the database from all views in the `views` folder.

    We simply iterate over all the files in the folder, drop any views that
    already exist, and re-create them from the new code.
    """
    engine = postgres.get_connection()
    files = glob.glob(os.path.join('views', '*.sql'))
    for filename in files:
        view_name = _filename_to_viewname(filename)
        content = _read_view_file(filename, schema)
        # Queries containing the % character get misinterpreted as parameter
        # markers when passed to engine.execute, so we escape them here.
        content = content.replace('%', '%%')

        try:
            # Best-case scenario: if the view is "similar" enough to what is currently
            # in the db, we can REPLACE it in place to avoid having to DROP CASCADE.
            log.info("Attempting to replace {}.{} ...".format(schema, view_name))
            engine.execute(CREATE_OR_REPLACE_STR.format(schema, view_name, content))
            log.info("... success!")
        except Exception:
            # Worst-case scenario: if the view has changed significantly from what is
            # currently in the db, we have to DROP CASCADE. The sad part is that this
            # will drop any materialized views that depend on this view!
            log.info("couldn't replace it in-place. dropping and recreating it instead ...")
            engine.execute(DROP_IF_EXISTS_STR.format(schema, view_name))
            engine.execute(CREATE_VIEW_STR.format(schema, view_name, content))
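
The SQL template strings themselves are not shown, but their rough shape can be inferred from the .format(...) calls here and in create_materialized_views; approximately (the exact strings in the repository may differ):

# Inferred shapes of the SQL templates (assumptions, not the repo's code).
CREATE_OR_REPLACE_STR = "CREATE OR REPLACE VIEW {}.{} AS {}"
CREATE_VIEW_STR = "CREATE VIEW {}.{} AS {}"
DROP_IF_EXISTS_STR = "DROP VIEW IF EXISTS {}.{} CASCADE"
CREATE_MATERIALIZED_VIEW_STR = "CREATE MATERIALIZED VIEW {}.{} AS {}"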
Example #7
    def extract(self):
        query = """
          SELECT
            *
            FROM {}.bayes_vw_feature_utilization
        """.format(self._schema)

        engine = postgres.get_connection()

        res = pd.read_sql(query, engine)
        log.info('The pre-pivot table has %d rows.' % len(res))

        pivoted = pd.pivot_table(
            data=res,
            index='hsp_acct_study_id',
            columns='pre_adm_type',
            aggfunc='sum',
            dropna=True,
            fill_value=0,
            values=['pre_3_month', 'pre_6_month', 'pre_12_month'])

        df_columns = [
            top + "_" + bottom.lower()
            for top, bottom in pivoted.columns.values
        ]
        df = pd.DataFrame(index=res.hsp_acct_study_id.unique())
        df[df_columns] = pivoted
        df.fillna(0, inplace=True)
        df['er_visits_lace'] = df['pre_6_month_emergency'].apply(
            lambda cnt: min(cnt, 4))

        return self.emit_df(df)
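
Because values lists three columns, pivot_table returns a two-level column index of (value, pre_adm_type) pairs, which the list comprehension flattens into single names. A small illustration with invented data:

import pandas as pd

res = pd.DataFrame({
    'hsp_acct_study_id': [1, 1],
    'pre_adm_type': ['EMERGENCY', 'INPATIENT'],
    'pre_3_month': [2, 0],
    'pre_6_month': [3, 1],
    'pre_12_month': [4, 1],
})
pivoted = pd.pivot_table(res, index='hsp_acct_study_id',
                         columns='pre_adm_type', aggfunc='sum', fill_value=0,
                         values=['pre_3_month', 'pre_6_month', 'pre_12_month'])
print([top + '_' + bottom.lower() for top, bottom in pivoted.columns.values])
# e.g. ['pre_12_month_emergency', 'pre_12_month_inpatient', ...]
# (column order may vary by pandas version)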
Example #8
    def extract(self):
        query = """
            SELECT hsp_acct_study_id, tobacco_cat, alcohol_cat, drugs_cat
              FROM {}.bayes_m_vw_feature_health_history
        """.format(self._schema)

        engine = postgres.get_connection()

        res = pd.read_sql(query, engine, index_col="hsp_acct_study_id")
        res.fillna('na', inplace=True)

        return self.emit_df(res)
Example #9
    def extract(self):
        query = """
                SELECT
                    *
                FROM {}.bayes_vw_feature_payer
        """.format(self._schema)

        engine = postgres.get_connection()

        res = pd.read_sql(query, engine, index_col='hsp_acct_study_id')
        # Fewer than 5 duplicate accounts across all hospitals; keep the first row of each.
        res = res.groupby(res.index).first()
        return self.emit_df(res)
Example #10
    def extract(self):
        query = """
            SELECT hsp_acct_study_id,
                   disch_weekday_cat, disch_day_of_month_cat, disch_time_cat,
                   disch_location_cat, length_of_stay, length_of_stay_lace
              FROM {}.bayes_vw_feature_discharge
        """.format(self._schema)

        engine = postgres.get_connection()

        res = pd.read_sql(query, engine)
        # There are two duplicate accounts; keep only the first occurrence of each.
        res.drop_duplicates(subset='hsp_acct_study_id', inplace=True)
        res.set_index('hsp_acct_study_id', inplace=True)

        return self.emit_df(res)
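
Note that drop_duplicates(subset=...) (used here) and groupby(...).first() (used in Examples #9 and #11) are not quite equivalent: the former keeps the first duplicate row wholesale, while the latter takes the first non-null value per column across the duplicates. For instance:

import pandas as pd

df = pd.DataFrame({'id': [1, 1], 'a': [None, 'x'], 'b': ['y', None]})
print(df.drop_duplicates(subset='id'))  # keeps row 0 as-is: a=None, b='y'
print(df.groupby('id').first())         # first non-null per column: a='x', b='y'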
Example #11
    def extract(self):
        query = """
            SELECT hsp_acct_study_id,
                   admission_source_cat,
                   admission_type_cat,
                   admission_weekday_cat,
                   admission_time_cat,
                   acuity_lace,
                   hospital_name_cat
              FROM {}.bayes_vw_feature_admission
        """.format(self._schema)

        engine = postgres.get_connection()

        res = pd.read_sql(query, engine, index_col="hsp_acct_study_id")
        # Fewer than 5 duplicate accounts across all hospitals; keep the first row of each.
        res = res.groupby(res.index).first()
        return self.emit_df(res)
Example #12
    def extract(self):
        query = """
            SELECT hsp_acct_study_id,
                   temperature, pulse, respirations, bp_systolic, bp_diastolic,
                   height, weight, bmi
              FROM {}.bayes_m_vw_feature_vitals
        """.format(self._schema)

        engine = postgres.get_connection()

        res = pd.read_sql(query, engine, index_col='hsp_acct_study_id')

        res['height_in_inches'] = res.height.apply(_height_to_inches)
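        # The raw weight appears to be stored in ounces; dividing by 16 gives pounds.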
        res['weight_in_lb'] = res.weight / 16

        # Drop the raw height/weight columns now that they've been converted.
        res.drop(['height', 'weight'], axis=1, inplace=True)
        res.fillna(res.median(), inplace=True)

        return self.emit_df(res)
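
_height_to_inches is defined elsewhere; a sketch under the assumption that heights arrive as strings like 5' 10" (the input format is a guess):

import math
import re

def _height_to_inches(height):
    # Hypothetical parser for heights recorded as, e.g., 5' 10".
    if not isinstance(height, str):
        return math.nan
    match = re.match(r"(\d+)'\s*(\d+)", height)
    if not match:
        return math.nan
    feet, inches = map(int, match.groups())
    return 12 * feet + inches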
Example #13
    def extract(self):
        query = """
            SELECT *
              FROM {}.bayes_vw_feature_provider
        """.format(self._schema)

        engine = postgres.get_connection()

        res = pd.read_sql(query, engine)
        log.info('The queried table has %d rows.' % len(res))

        pivoted = res.dropna() \
                     .groupby('hsp_acct_study_id') \
                     .specialty \
                     .apply(_rename_columns) \
                     .str.get_dummies()

        df = pd.DataFrame(index=res.hsp_acct_study_id.unique())
        df[pivoted.columns] = pivoted.astype('bool')
        df.fillna(False, inplace=True)

        return self.emit_df(df)
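
Given the str.get_dummies() that follows it, _rename_columns most likely joins each account's specialties into a single '|'-separated string of normalized column names. A guess at its shape (the 'specialty_' prefix is invented):

def _rename_columns(specialties):
    # Assumed: join the group's specialties into one '|'-separated string
    # of normalized names, ready for str.get_dummies().
    return "|".join('specialty_' + s.lower().replace(' ', '_')
                    for s in specialties)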
Example #14
    def extract(self):
        query = """
            SELECT *
              FROM {}.bayes_vw_feature_encounter_reason
        """.format(self._schema)

        engine = postgres.get_connection()

        res = pd.read_sql(query, engine)
        log.info('The queried table has %d rows.' % len(res))

        dummified = res.dropna() \
                       .groupby('hsp_acct_study_id') \
                       .enc_reason_name_cat \
                       .apply(lambda reasons: "|".join([format_column_title(r) for r in reasons])) \
                       .str.get_dummies()

        df = pd.DataFrame(index=res.hsp_acct_study_id.unique())
        df_columns = dummified.columns
        df[df_columns] = dummified
        df.fillna(0, inplace=True)
        df = df.astype('bool')
        return self.emit_df(df)
Example #15
    def extract(self):
        query = """
            SELECT *
              FROM {}.bayes_vw_feature_hospital_problems
        """.format(self._schema)

        engine = postgres.get_connection()

        res = pd.read_sql(query, engine)
        log.info('The queried table has %d rows.' % len(res))

        pivoted = pd.pivot_table(res,
                                 index='hsp_acct_study_id',
                                 columns='ccs_category_description',
                                 aggfunc=len)
        pivoted.columns = ['hcup_category_' + c for c in pivoted.columns]

        df = pd.DataFrame(index=res.hsp_acct_study_id.unique())
        df[pivoted.columns] = pivoted
        df.fillna(0, inplace=True)
        df = df.astype('bool')

        df['hospital_problems_count'] = df.sum(axis=1)
        return self.emit_df(df)