Пример #1
0
def main():
    """
    Quarterly unemployment rate in state of Rhode Island taken from Bureau of Labor and Statistics
    """
    index = ["RIIPL_ID"]

    unemp = pd.read_csv(unemp_file)
    unemp["YRMO"] = unemp.YRMO.astype(str)

    sql = """
          SELECT pop.riipl_id,
                 lb.yrmo
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
           WHERE lb.timestep = {nsteps} - 1
        ORDER BY pop.riipl_id
          """.format(**globals())

    with Connection() as cxn:
        features = pd.read_sql(sql, cxn._connection)

    features = features.merge(unemp, how="left",
                              on="YRMO")[["RIIPL_ID", "UNEMP_RATE"]]
    features = features.set_index("RIIPL_ID").rename(
        columns={"UNEMP_RATE": "UNEMP_RI"})

    labels = {"UNEMP_RI": "Rhode Island monthly unemployment rate"}

    SaveFeatures(features, out, manifest, population, labels)
Пример #2
0
def main():
    """
    Indicators for DOC sentences by crime type
    """

    sql = """
          SELECT DISTINCT offense_type AS offense
            FROM {doc_sentences}
           WHERE offense_type IS NOT NULL
        ORDER BY offense
          """.format(**globals())

    with cx_Oracle.connect("/") as cxn:
        offenses = pd.read_sql(sql, cxn)

    print(offenses)
    pivot = ",".join([
        "'{0}' AS doc_sentenced_{0}".format(offense)
        for offense in offenses["OFFENSE"]
    ])

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT * 
            FROM (
                  SELECT pop.riipl_id,
                         ds.offense_type AS offense,
                         1 AS sentenced
                    FROM {population} pop
               LEFT JOIN {lookback} lb
                      ON pop.riipl_id = lb.riipl_id
               LEFT JOIN {dim_date} dd
                      ON lb.yrmo = dd.yrmo
              INNER JOIN {doc_sentences} ds
                      ON pop.riipl_id = ds.riipl_id AND 
                         dd.date_dt = ds.imposed_date
                GROUP BY pop.riipl_id, ds.offense_type
                 ) 
           PIVOT (MAX(sentenced) FOR offense IN ({pivot}))
          """.format(pivot=pivot, **globals())

    with cx_Oracle.connect("/") as cxn:
        features = features.join(pd.read_sql(sql, cxn).set_index(index))

    # Drop offense types that are missing for the entire population.
    features = features.dropna(axis=1, how="all").fillna(0).astype(int)

    labels = {}
    for offense in offenses["OFFENSE"]:
        label = "sentenced for {} during lookback period".format(offense)
        labels["DOC_SENTENCED_{}".format(offense.upper())] = label

    SaveFeatures(features,
                 out,
                 manifest,
                 population,
                 labels,
                 bool_features=list(labels))
Пример #3
0
def main():
    """
    Indicator for UI exhaustion
    """

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT DISTINCT
                 pop.riipl_id,
                 1 AS ui_exhaustion
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dim_date} dd
              ON lb.yrmo = dd.yrmo
      INNER JOIN {dlt_ui_payments} du
              ON pop.riipl_id = du.riipl_id AND
                 dd.date_dt = du.trans_date
          WHERE  void_check_flag IS NULL AND
                 delete_flag IS NULL AND
                 opening_credits > 0 AND
                 closing_credits <= 0
          """.format(**globals())

    with Connection() as cxn:
        features = features.join(
            pd.read_sql(sql, cxn._connection, index_col=index)).fillna(0)

    labels = {"UI_EXHAUSTION": "Exhausted UI benefits"}

    SaveFeatures(features, out, manifest, population, labels)
Пример #4
0
def main():
    """
    """

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT pop.riipl_id,   
                 SUM(cc.car_crash) AS car_crashes,
                 SUM(cc.injured)   AS car_crash_injuries
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
      INNER JOIN {car_crashes} cc
              ON pop.riipl_id = cc.riipl_id AND 
                 cc.yrmo = lb.yrmo
        GROUP BY pop.riipl_id
          """.format(**globals())

    with Connection() as cxn:
        features = features.join(
            pd.read_sql(sql, cxn._connection).set_index(index))

    features = features.fillna(0)

    labels = {
        "CAR_CRASHES":
        "count of car crashes involved in during lookback period",
        "CAR_CRASH_INJURIES":
        "count of car crash injuries during lookback period",
    }

    SaveFeatures(features, out, manifest, population, labels)
Пример #5
0
def main():
    sql = """
          SELECT pop.riipl_id,
                 age,
                 race,
                 sex,
                 marital_status,
                 bmi
            FROM {population} pop
       LEFT JOIN {dim_date} dd
              ON pop.initial_dt = dd.date_dt
       LEFT JOIN {mega} mega
              ON pop.riipl_id = mega.riipl_id AND
                 dd.yrmo = mega.month
        ORDER BY pop.riipl_id
          """.format(**globals())

    with cx_Oracle.connect("/") as cxn:
        features = pd.read_sql(sql, cxn, index_col="RIIPL_ID")

    labels = {
        "AGE": "Age at first prescription",
        "RACE": "Race (Asian, Black, Hispanic, Native American, Other, White)",
        "SEX": "Sex at first prescription",
        "MARITAL_STATUS": "Marital status at first prescription",
        "BMI": "Body Mass Index at first prescription"
    }

    SaveFeatures(features, outfile, manifest, population, labels)
Пример #6
0
def main():
    """
    Home addresses from MEGA table
    """

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT DISTINCT
                 pop.riipl_id,
                 address_trct
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
      INNER JOIN {mega} m 
              ON pop.riipl_id = m.riipl_id AND
                 lb.yrmo = m.month
           WHERE m.address_trct IS NOT NULL
         """.format(**globals())

    with cx_Oracle.connect("/") as cxn:
        values = pd.read_sql(sql, cxn).set_index(index)

    labels = {}
    for trct in values.ADDRESS_TRCT.unique():
        name = "TRCT_{}".format(trct)
        feature = values[values["ADDRESS_TRCT"] == trct]
        feature[name] = 1
        features = features.join(feature[name])
        labels[name] = "lived in Census tract {} during lookback period".format(trct)

    features = features.fillna(0)

    SaveFeatures(features, out, manifest, population, labels, bool_features=list(labels))
Пример #7
0
def main():
    """
    """

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT pop.riipl_id,
                 SUM(citations) AS citations,
                 SUM(fines)     AS citation_fines
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {citations} c
              ON pop.riipl_id = c.riipl_id AND 
                 c.yrmo = lb.yrmo
        GROUP BY pop.riipl_id
          """.format(**globals())

    with Connection() as cxn:
        features = features.join(
            pd.read_sql(sql, cxn._connection).set_index(index))

    features = features.fillna(0)

    labels = {
        "CITATIONS": "Number of police citations",
        "CITATION_FINES": "Total fines from police citations"
    }

    SaveFeatures(features, out, manifest, population, labels)
Пример #8
0
def main():
    """
    Indicators for commited, released, or charged from DOC events
    """

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT pop.riipl_id,
                 MAX(CASE WHEN de.event_type = 'Committed' THEN 1 ELSE 0 END) AS doc_commited,
                 MAX(CASE WHEN de.event_type = 'Released'  THEN 1 ELSE 0 END) AS doc_released,
                 MAX(CASE WHEN de.event_type = 'Charged'   THEN 1 ELSE 0 END) AS doc_charged
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dim_date} dd
              ON lb.yrmo = dd.yrmo
       LEFT JOIN {doc_events} de
              ON pop.riipl_id = de.riipl_id AND 
                 dd.date_dt = de.event_date
        GROUP BY pop.riipl_id
        """.format(**globals())

    with cx_Oracle.connect("/") as cxn:
        features = features.join(pd.read_sql(sql, cxn).set_index(index))

    labels = {
        "DOC_COMMITED" : "Committed to a corrections facility",
        "DOC_RELEASED" : "Released from a corrections facility",
        "DOC_CHARGED"  : "Charged with a crime"
    }

    SaveFeatures(features, out, manifest, population, labels)
Пример #9
0
def main():

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    # Claims payments

    sql = """
          SELECT pop.riipl_id,
                 SUM(mc.pay_amt) AS medicaid_payments,
                 SUM(CASE WHEN mc.src_id IN (4, 8, 16, 17, 26, 27) THEN mc.pay_amt ELSE 0 END) AS medicaid_prof_payments,
                 SUM(CASE WHEN mc.ed_flag = 1 THEN mc.pay_amt ELSE 0 END) AS medicaid_ed_payments
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dim_date} dd
              ON lb.yrmo = dd.yrmo
      INNER JOIN {medicaid_claims} mc
              ON pop.riipl_id = mc.riipl_id AND
                 dd.date_dt = mc.claim_dt
        GROUP BY pop.riipl_id
      """.format(**globals())

    with cx_Oracle.connect("/") as cxn:
        features = features.join(pd.read_sql(sql, cxn).set_index(index))

    # Pharmacy payments

    sql = """
          SELECT pop.riipl_id,
                 SUM(mp.pay_amt) AS medicaid_rx_payments
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dim_date} dd
              ON lb.yrmo = dd.yrmo
      INNER JOIN {medicaid_pharmacy} mp
              ON pop.riipl_id = mp.riipl_id AND
                 dd.date_dt = mp.dispensed_dt
        GROUP BY pop.riipl_id
      """.format(**globals())

    with cx_Oracle.connect("/") as cxn:
        features = features.join(pd.read_sql(sql, cxn).set_index(index))

    # Payments are comprehensive, so a missing value means $0
    features = features.fillna(0)

    labels = {
        "MEDICAID_PAYMENTS"      : "Total Medicaid payments",
        "MEDICAID_PROF_PAYMENTS" : "Total Medicaid payments to professionals",
        "MEDICAID_ED_PAYMENTS"   : "Total Medicaid payments to Emergency Departments",
        "MEDICAID_RX_PAYMENTS"   : "Total Medicaid pharmacy payments"
    }

    SaveFeatures(features, out, manifest, population, labels)
Пример #10
0
def main():
    """
    Nationwide unemployment rate by NAICS code at a yearly level from Bureau of Labor and Statistics
    """
    index = ["RIIPL_ID"]

    unemp = pd.read_csv(unemp_file)
    unemp["NAICS"] = unemp.NAICS.astype(str)

    # Calculate average unemployment by year
    unemp_avg = unemp.groupby("YR").mean().reset_index()
    print(unemp_avg)

    # Lookup all wage records in the lookback period, and order by
    # wages, to select the NAICS for the highest wage.
    sql = """
          SELECT DISTINCT
                 pop.riipl_id,
                 TO_CHAR(pop.initial_dt, 'YYYY') AS yr,
                 dw.wages,
                 SUBSTR(dw.naics4, 1, 2) AS naics
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dlt_wage} dw
              ON pop.riipl_id = dw.riipl_id AND
                 lb.yyq = dw.yyq
        ORDER BY pop.riipl_id, dw.wages
           """.format(**globals())

    with Connection() as cxn:
        features = pd.read_sql(sql, cxn._connection).drop_duplicates(
            index, keep="last").set_index(index)

    # Use the previous year for joining the unemployment rate
    features["YR"] = features.YR.astype(int) - 1

    # Join NAICS unemployment, using avg unemployment when no NAICS is available
    naics = features.merge(unemp, how="left", on=["YR",
                                                  "NAICS"]).NAICS_UNEMP_RATE

    avg = features.merge(unemp_avg, how="left", on="YR").NAICS_UNEMP_RATE

    features["UNEMP_NAICS"] = np.where(naics.notnull(), naics, avg)

    labels = {
        "UNEMP_NAICS":
        "unemployment rate by NAICS code at national level in year before index date"
    }

    SaveFeatures(features[["UNEMP_NAICS"]], out, manifest, population, labels)
Пример #11
0
def main():

    icd9 = pd.concat([
        pd.read_csv(icd9_file1, usecols=[0, 1], names=["DIAG_CDE", "DESC"]),
        pd.read_csv(icd9_file2)
    ]).drop_duplicates("DIAG_CDE").set_index("DIAG_CDE")

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT pop.riipl_id,
                 mdc.diag_cde,
                 COUNT(*) AS n
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dim_date} dd
              ON lb.yrmo = dd.yrmo
      INNER JOIN {medicaid_diag_cde} mdc
              ON pop.riipl_id = mdc.riipl_id AND
                 dd.date_dt = mdc.claim_dt
      INNER JOIN {diag_corr} dc
              ON mdc.diag_cde = dc.diag_cde AND
                 ABS(dc.corr) > 0
        GROUP BY pop.riipl_id, mdc.diag_cde
              """.format(**globals())

    with Connection() as cxn:
        values = pd.read_sql(sql, cxn._connection)

    cdes = values.DIAG_CDE.unique()

    labels = {}
    for cde in cdes:
        feature = values.loc[values.DIAG_CDE == cde, "N"]
        feature.name = "DIAG_{}".format(cde)
        features = features.join(feature)
        try:
            desc = icd9.loc[cde, "DESC"]
        except KeyError:
            desc = "unknown description"
        label = "number of diagnoses for '{}' during lookback period".format(
            desc)
        labels[feature.name] = label

    features = features.fillna(0)

    SaveFeatures(features, out, manifest, population, labels)
Пример #12
0
def main():

    procs = pd.read_csv(procsfile, sep="|", index_col="PROC_CDE")

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT pop.riipl_id,
                 mpc.proc_cde,
                 COUNT(*) AS n
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dim_date} dd
              ON lb.yrmo = dd.yrmo
      INNER JOIN {medicaid_proc_cde} mpc
              ON pop.riipl_id = mpc.riipl_id AND
                 dd.date_dt = mpc.claim_dt
      INNER JOIN {proc_corr} pc
              ON mpc.proc_cde = pc.proc_cde AND
                 ABS(pc.corr) > 0
        GROUP BY pop.riipl_id, mpc.proc_cde
          """.format(**globals())

    with Connection() as cxn:
        values = pd.read_sql(sql, cxn._connection)

    cdes = values.PROC_CDE.unique()

    labels = {}
    for cde in cdes:
        feature = values.loc[values.PROC_CDE == cde, "N"]
        feature.name = "PROC_{}".format(cde)
        features = features.join(feature)
        try:
            desc = procs.loc[cde, "DESCRIPTION"]
        except KeyError:
            desc = "unknown procedure"
        label = "number of procedures for '{}' during lookback period".format(
            desc)
        labels[feature.name] = label

    features = features.fillna(0)

    SaveFeatures(features, out, manifest, population, labels)
Пример #13
0
def main():
    """
    Opioid prescriptions and adverse outcomes among DHS household
    """

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT pop.riipl_id,
                 SUM(ABS(no.opioid))     AS dhs_hh_opioid_rx,
                 SUM(no.recovery)        AS dhs_hh_recovery_rx,
                 COUNT(o.outcome_any_dt) AS dhs_hh_outcomes
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dim_date} dd
              ON lb.yrmo = dd.yrmo
       LEFT JOIN {household} hh
              ON pop.riipl_id = hh.riipl_id
       LEFT JOIN {medicaid_pharmacy} mp
              ON hh.riipl_id_hh = mp.riipl_id AND
                 dd.date_dt = mp.dispensed_dt
       LEFT JOIN {ndc_opioids} no
              ON mp.ndc9_code = no.ndc
       LEFT JOIN {outcomes} o
              ON hh.riipl_id_hh = o.riipl_id AND
                 dd.date_dt = o.outcome_any_dt
        GROUP BY pop.riipl_id
          """.format(**globals())

    with Connection() as cxn:
        features = features.join(
            pd.read_sql(sql, cxn._connection).set_index(index))

    features = features.fillna(0)

    labels = {
        "DHS_HH_OPIOID_RX": "Number of opioid prescriptions in household",
        "DHS_HH_RECOVERY_RX": "Number of recovery prescriptions in household",
        "DHS_HH_OUTCOMES": "Number of adverse outcomes in household"
    }

    SaveFeatures(features, out, manifest, population, labels)
Пример #14
0
def main():

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT pop.riipl_id,
                 MAX(CASE WHEN rd.gender = 'M'                THEN 1 ELSE 0 END) AS sex_m,
                 MAX(CASE WHEN rd.primary_lang_cd = '01'      THEN 1 ELSE 0 END) AS lang_spanish,
                 MAX(CASE WHEN rd.primary_lang_cd = '31'      THEN 1 ELSE 0 END) AS lang_portu,
                 MAX(CASE WHEN rd.primary_lang_cd <> '00' AND
                               rd.primary_lang_cd <> '01' AND
                               rd.primary_lang_cd <> '31'     THEN 1 ELSE 0 END) AS lang_other
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {medicaid_enrollment} me
              ON pop.riipl_id = me.riipl_id AND
                 lb.yrmo = me.yrmo
       LEFT JOIN {recip_demo} rd
              ON me.re_unique_id = rd.recipient_id
        GROUP BY pop.riipl_id
          """.format(**globals())

    with cx_Oracle.connect("/") as cxn:
        features = features.join(pd.read_sql(sql, cxn).set_index(index))

    features = features.fillna(0)

    labels = {
        "SEX_M": "Sex is male",
        "LANG_SPANISH": "Primary language is Spanish",
        "LANG_PORTU": "Primary language is Portuguese",
        "LANG_OTHER":
        "Primary language is not English, Spanish, or Portuguese",
    }

    SaveFeatures(features,
                 out,
                 manifest,
                 population,
                 labels,
                 bool_features=list(labels))
Пример #15
0
def main():
    """
    Social program payments from MEGA table
    """

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT pop.riipl_id,   
                 SUM(m.snap_payments)  AS snap_payments,
                 SUM(m.tanf_payments)  AS tanf_payments,
                 SUM(m.tdi_payments)   AS tdi_payments,
                 SUM(m.ui_payments)    AS ui_payments,
                 SUM(m.ssi_supplement) AS ssi_supplement,
                 SUM(m.gpa_payments)   AS gpa_payments,
                 SUM(m.ccap_payments)  AS ccap_payments
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
      INNER JOIN {mega} m
              ON pop.riipl_id = m.riipl_id AND
                 lb.yrmo = m.month
        GROUP BY pop.riipl_id
        """.format(**globals())

    with cx_Oracle.connect("/") as cxn:
        features = features.join(pd.read_sql(sql, cxn).set_index(index))

    # Payments are comprehensive, so a missing value means $0
    features = features.fillna(0)

    labels = {
        "SNAP_PAYMENTS"  : "Total Food Stamps payments",
        "TANF_PAYMENTS"  : "Total TANF payments",
        "TDI_PAYMENTS"   : "Total TDI payments",
        "UI_PAYMENTS"    : "Total Unemployment Insurance payments",
        "SSI_SUPPLEMENT" : "Total SSI supplement",
        "GPA_PAYMENTS"   : "Total GPA payments",
        "CCAP_PAYMENTS"  : "Total CCAP payments"
    }

    SaveFeatures(features, out, manifest, population, labels)
Пример #16
0
def main():

    index = ["RIIPL_ID"]

    sql = """
          SELECT pop.riipl_id,
                 pop.injection
            FROM {population} pop
          """.format(**globals())

    with cx_Oracle.connect("/") as cxn:
        features = pd.read_sql(sql, cxn, index_col=index)

    labels = {"INJECTION": "Initial exposure was an opioid injection"}

    SaveFeatures(features,
                 out,
                 manifest,
                 population,
                 labels,
                 bool_features=list(labels))
Пример #17
0
def main():
    """
    Size of DHS household and indicator for births in the DHS household from MEGA table
    """

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT pop.riipl_id,
                 COUNT(DISTINCT m2.riipl_id) AS dhs_hh_size,
                 MAX(CASE WHEN m1.age > 1 AND m2.age < 1
                          THEN 1
                          ELSE 0 END)        AS dhs_hh_birth
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
      INNER JOIN {mega} m1
              ON pop.riipl_id = m1.riipl_id AND
                 lb.yrmo = m1.month
      INNER JOIN {mega} m2
              ON m1.dhs_hh_id = m2.dhs_hh_id AND
                 m1.month = m2.month
        GROUP BY pop.riipl_id
          """.format(**globals())

    with Connection() as cxn:
        features = features.join(pd.read_sql(sql, cxn._connection).set_index(index))

    features = features.fillna(0)

    labels = {
        "DHS_HH_SIZE"  : "Size of household",
        "DHS_HH_BIRTH" : "New birth in household"
    }

    SaveFeatures(features, out, manifest, population, labels, bool_features=["DHS_HH_BIRTH"])
Пример #18
0
def main():
    """
    Demographics variables from MEGA table
    """

    sql = """
          SELECT pop.riipl_id,
                 CASE WHEN MAX(m.age) >= 18 AND MAX(m.age) < 24 THEN 1 ELSE 0 END  AS age_1,
                 CASE WHEN MAX(m.age) >= 24 AND MAX(m.age) < 30 THEN 1 ELSE 0 END  AS age_2,
                 CASE WHEN MAX(m.age) >= 30 AND MAX(m.age) < 40 THEN 1 ELSE 0 END  AS age_3,
                 CASE WHEN MAX(m.age) >= 40 AND MAX(m.age) < 55 THEN 1 ELSE 0 END  AS age_4,
                 CASE WHEN MAX(m.age) >= 55 AND MAX(m.age) < 65 THEN 1 ELSE 0 END  AS age_5,
                 CASE WHEN MAX(m.age) >= 65                     THEN 1 ELSE 0 END  AS age_6,
                 CASE WHEN AVG(m.bmi) < 18.5                    THEN 1 ELSE 0 END  AS bmi_under,
                 CASE WHEN AVG(m.bmi) >= 25 AND AVG(m.bmi) < 30 THEN 1 ELSE 0 END  AS bmi_over,
                 CASE WHEN AVG(m.bmi) >= 30                     THEN 1 ELSE 0 END  AS bmi_obese,
                 CASE WHEN COUNT(m.bmi) = 0                     THEN 1 ELSE 0 END  AS bmi_missing,
                 MAX(CASE WHEN(m.marital_status) = 1            THEN 1 ELSE 0 END) AS married,
                 CASE WHEN COUNT(m.marital_status) = 0          THEN 1 ELSE 0 END  AS married_missing,
                 MAX(CASE WHEN m.race = 'Black'                 THEN 1 ELSE 0 END) AS race_black,
                 MAX(CASE WHEN m.race = 'Hispanic'              THEN 1 ELSE 0 END) AS race_hispanic,
                 MAX(CASE WHEN m.race = 'Asian'                 THEN 1
                          WHEN m.race = 'Native American'       THEN 1
                          WHEN m.race = 'Other'                 THEN 1 ELSE 0 END) AS race_other,
                 CASE WHEN COUNT(m.race) = 0                    THEN 1 ELSE 0 END  AS race_missing
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {mega} m
              ON pop.riipl_id = m.riipl_id AND
                 lb.yrmo = m.month
        GROUP BY pop.riipl_id
        ORDER BY pop.riipl_id
          """.format(**globals())

    with Connection() as cxn:
        features = pd.read_sql(sql, cxn._connection, index_col="RIIPL_ID")

    labels = {
        "AGE_1": "Age is 18-23",
        "AGE_2": "Age is 24-29",
        "AGE_3": "Age is 30-39",
        "AGE_4": "Age is 40-54",
        "AGE_5": "Age is 55-64",
        "AGE_6": "Age is 65+",
        "BMI_UNDER": "Body mass index is underweight",
        "BMI_OVER": "Body mass index is overweight",
        "BMI_OBESE": "Body mass index is obese",
        "BMI_MISSING": "Body mass index is missing",
        "MARRIED": "Married",
        "MARRIED_MISSING": "Marital status is missing",
        "RACE_BLACK": "Race is African American",
        "RACE_HISPANIC": "Ethnicity is Hispanic",
        "RACE_OTHER": "Race is Asian, Native American, or other",
        "RACE_MISSING": "Race is missing"
    }

    SaveFeatures(features,
                 out,
                 manifest,
                 population,
                 labels,
                 bool_features=list(labels))
Пример #19
0
def main():

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT pop.riipl_id,
                 COUNT(DISTINCT me.re_unique_id)                                           AS medicaid_n_ids,
                 MAX(CASE WHEN UPPER(dim.description) LIKE '%PREG%'     THEN 1 ELSE 0 END) AS medicaid_pregnant,
                 MAX(CASE WHEN UPPER(dim.description) LIKE '%PARTUM%'   THEN 1 ELSE 0 END) AS medicaid_postpartum,
                 MAX(CASE WHEN UPPER(dim.description) LIKE '%BLIND%'    THEN 1 ELSE 0 END) AS medicaid_blind,
                 MAX(CASE WHEN UPPER(dim.description) LIKE '%DISABLED%' THEN 1 ELSE 0 END) AS medicaid_disabled,
                 MAX(CASE WHEN UPPER(dim.description) LIKE '%ALIEN%'    THEN 1 ELSE 0 END) AS medicaid_alien,
                 MAX(CASE WHEN UPPER(dim.description) LIKE '%CHILD%'    THEN 1 ELSE 0 END) AS medicaid_child,
                 MAX(CASE WHEN dim.needy_ind = 'Medically'              THEN 1 ELSE 0 END) AS medicaid_med_needy,
                 MAX(NVL(me.managed_care, 0))                                              AS medicaid_managed_care,
                 MAX(NVL(me.premium_payment, 0))                                           AS medicaid_prem_payment,
                 MAX(CASE WHEN rxn.payer_cd = 'B'                       THEN 1 ELSE 0 END) AS medicaid_payer_bhddh,
                 MAX(CASE WHEN rxn.payer_cd = 'C'                       THEN 1 ELSE 0 END) AS medicaid_payer_cmap,
                 MAX(CASE WHEN rxn.payer_cd = 'D'                       THEN 1 ELSE 0 END) AS medicaid_payer_doc,
                 MAX(CASE WHEN rxn.payer_cd = 'H'                       THEN 1 ELSE 0 END) AS medicaid_payer_aids,
                 MAX(CASE WHEN rxn.payer_cd = 'O'                       THEN 1 ELSE 0 END) AS medicaid_payer_ors,
                 MAX(CASE WHEN rxn.payer_cd = 'R'                       THEN 1 ELSE 0 END) AS medicaid_payer_ripae
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {medicaid_enrollment} me
              ON pop.riipl_id = me.riipl_id AND
                 lb.yrmo = me.yrmo
       LEFT JOIN {dim_aid_ctg_cde} dim
              ON me.aid_category = dim.aid_ctg_cde
       LEFT JOIN {recip_x_ssn} rxn
              ON me.re_unique_id = rxn.recipient_id
        GROUP BY pop.riipl_id
          """.format(**globals())

    with cx_Oracle.connect("/") as cxn:
        features = features.join(pd.read_sql(sql, cxn).set_index(index))

    features = features.fillna(0)

    labels = {
        "MEDICAID_N_IDS":
        "Number of unique Medicaid IDs",
        "MEDICAID_PREGNANT":
        "Medicaid eligibility category contains the word 'pregnant'",
        "MEDICAID_POSTPARTUM":
        "Medicaid eligibility category contains the word 'postpartum'",
        "MEDICAID_BLIND":
        "Medicaid eligibility category contains the word 'blind'",
        "MEDICAID_DISABLED":
        "Medicaid eligibility category contains the word 'disabled'",
        "MEDICAID_ALIEN":
        "Medicaid eligibility category contains the word 'alien'",
        "MEDICAID_CHILD":
        "Medicaid eligibility category contains the word 'child'",
        "MEDICAID_MED_NEEDY":
        "Eligible for Medicaid as medically needy",
        "MEDICAID_MANAGED_CARE":
        "Enrolled in Medicaid managed care",
        "MEDICAID_PREM_PAYMENT":
        "Enrolled in Medicaid premium payment plan",
        "MEDICAID_PAYER_BHDDH":
        "Medicaid payer code is BHDDH",
        "MEDICAID_PAYER_CMAP":
        "Enrolled in RI Community Medication Assistance Program",
        "MEDICAID_PAYER_DOC":
        "Medicaid payer code is Department of Corrections",
        "MEDICAID_PAYER_AIDS":
        "Medicaid payer code is Ryan White AIDS Drug Assistance Program",
        "MEDICAID_PAYER_ORS":
        "Medicaid payer code is ORS",
        "MEDICAID_PAYER_RIPAE":
        "Enrolled in RI Pharmaceutical Assistance to the Elderly"
    }

    SaveFeatures(features,
                 out,
                 manifest,
                 population,
                 labels,
                 bool_features=[f for f in labels if f != "MEDICAID_N_IDS"])
Пример #20
0
def main():

    index = ["RIIPL_ID"]
    pop = CachePopulationSubsets(population, index)
    pop["ROW_ID"] = np.arange(len(pop))
    outcomes = pd.read_csv(outcomes_file)
    words = pd.read_csv(words_file, index_col="WORD_ID")

    # Load counts and convert to CSR sparse matrix for efficient row slicing
    counts = mmread(counts_file).tocsr()

    # Further divide training data into training and validation sets for
    # selecting the optimal number of topics
    training = (pop["SUBSET"] == "TRAINING")
    np.random.seed(seed)
    subset = np.random.choice([True, False], len(training), p=[0.25, 0.75])
    validation = (training & subset)
    training = (training & ~subset)
    print(training.sum(), "training")
    print(validation.sum(), "validation")

    # Create training and validation outcomes
    y_train = outcomes.loc[training, "OUTCOME_ANY"].values
    y_validate = outcomes.loc[validation, "OUTCOME_ANY"].values
    print(y_train.sum(), "training outcomes")
    print(y_validate.sum(), "validation outcomes")

    # Transform raw counts to TF-IDF using IDF from the training set
    training = np.where(training)[0]
    validation = np.where(validation)[0]
    counts_train = counts[training, :]
    tfidf = TfidfTransformer()
    tfidf.fit(counts_train)
    counts = tfidf.transform(counts)
    counts_train = counts[training, :]
    counts_validate = counts[validation, :]

    # Select NMF model with best AUC performance on validation data
    best = 0
    best_auc = 0
    nmfs = []
    for i, n in enumerate(ntopics):
        print(n, "topics:")
        nmf = NMF(n, random_state=seed).fit(counts_train)
        nmfs.append(nmf)
        X_train = pd.DataFrame(nmf.transform(counts_train))
        X_train["intercept"] = 1
        logit = Logit(y_train, X_train).fit(maxiter=1000, method="cg")
        print(logit.summary())
        X_validate = pd.DataFrame(nmf.transform(counts_validate))
        X_validate["intercept"] = 1
        y_pred = logit.predict(X_validate)
        auc = roc_auc_score(y_validate, y_pred)
        print("AUC:", auc)
        if (auc - best_auc) > delta:
            best = i
            best_auc = auc
        else:
            break
    print("selected", ntopics[best], "topics")

    # Turn best NMF topics into features
    features = pd.DataFrame(nmfs[best].transform(counts))
    features.columns = [
        "MEDICAID_TOPIC_{}".format(i) for i in range(ntopics[best])
    ]
    features["RIIPL_ID"] = pop["RIIPL_ID"]
    features = features.set_index("RIIPL_ID")

    # Use the top 10 words in a topic as its description
    top10words = [
        " ".join(words.loc[i, "WORD"] for i in topic.argsort()[-11:-1])
        for topic in nmfs[best].components_
    ]
    descs = [
        "Topic {} ({})".format(i, words) for i, words in enumerate(top10words)
    ]
    labels = dict(zip(features.columns, descs))

    SaveFeatures(features, out, manifest, population, labels)
Пример #21
0
def main():
    """
    Industry worked in (NAICS code) from DLT wages
    """

    sql = """
          SELECT DISTINCT SUBSTR(naics4, 0, 2) AS naics2
            FROM {dlt_wage}
           ORDER BY naics2
          """.format(**globals())

    with cx_Oracle.connect("/") as cxn:
        naics = pd.read_sql(sql, cxn)

    pivot = ",".join(["'{0}' AS naics_{0}".format(n) for n in naics["NAICS2"]])

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT *
            FROM (   
                  SELECT pop.riipl_id,
                         SUBSTR(dw.naics4, 0, 2) AS naics2,
                         1 AS worked
                    FROM {population} pop
               LEFT JOIN (
                          SELECT DISTINCT
                                 riipl_id,
                                 yyq
                            FROM {lookback}
                         ) lb
                      ON pop.riipl_id = lb.riipl_id
              INNER JOIN {dlt_wage} dw
                      ON pop.riipl_id = dw.riipl_id AND
                         lb.yyq = dw.yyq
                GROUP BY pop.riipl_id, SUBSTR(dw.naics4, 0, 2)
                 ) 
           PIVOT (MAX(worked) FOR naics2 IN ({pivot}))
         """.format(pivot=pivot, **globals())

    with cx_Oracle.connect("/") as cxn:
        features = features.join(pd.read_sql(sql, cxn).set_index(index))

    # Drop NAICS categories that are missing for the entire population.
    features = features.dropna(axis=1, how="all").fillna(0).astype(int)

    # Missing indicator not needed - provided by WAGES_MISSING

    labels = {}
    naics2_labels = {
        "00": "Unknown",
        "02": "Unknown",
        "11": "Agriculture, Forestry, Fishing, and Hunting",
        "21": "Mining, Quarrying, and Oil and Gas Extraction",
        "22": "Utilities",
        "23": "Construction",
        "24": "Unknown",
        "31": "Manufacturing",
        "32": "Manufacturing",
        "33": "Manufacturing",
        "42": "Wholesale Trade",
        "44": "Retail Trade",
        "45": "Retail Trade",
        "48": "Transportation and Warehousing",
        "49": "Transportation and Warehousing",
        "50": "Unknown",
        "51": "Information",
        "52": "Finance and Insurance",
        "53": "Real Estate and Rental and Leasing",
        "54": "Professional, Scientific, and Technical Services",
        "55": "Management of Companies and Enterprises",
        "56":
        "Administrative and Support and Waste Management and Remediation Services",
        "61": "Educational Services",
        "62": "Health Care and Social Assistance",
        "71": "Arts, Entertainment, and Recreation",
        "72": "Accommodation and Food Services",
        "81": "Other Services (except Public Administration)",
        "84": "Unknown",
        "92": "Public Administration",
        "94": "Unknown",
        "99": "Unknown"
    }
    for n, sector in naics2_labels.items():
        label = "worked in {} sector (NAICS {}) during lookback period".format(
            sector, n)
        labels["NAICS_{}".format(n)] = label

    SaveFeatures(features,
                 out,
                 manifest, (population, index),
                 labels,
                 bool_features=list(labels))
Пример #22
0
def main():
    """
    This code generates the following features based on someone's home census block group:
    * Median income
    * Share below the federal povery line
    """

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    income = pd.read_csv(
        income_file,
        skiprows=1,
        na_values=["-", "**"],
        usecols=[
            "Id2",
            "Estimate; Median household income in the past 12 months (in 2015 Inflation-adjusted dollars)"
        ])
    income.columns = ["GEO_ID", "BLKGRP_MEDIANINCOME"]

    fpl = pd.read_csv(
        fpl_file,
        skiprows=1,
        na_values=["-", "**"],
        usecols=[
            "Id2", "Estimate; Total:",
            "Estimate; Income in the past 12 months below poverty level:"
        ])
    fpl.columns = ["GEO_ID", "TOTAL", "FPL"]
    fpl["BLKGRP_BELOWFPL"] = fpl.FPL / fpl.TOTAL
    fpl = fpl[["GEO_ID", "BLKGRP_BELOWFPL"]]

    sql = """
          SELECT pop.riipl_id,
                 STATS_MODE(TO_NUMBER(a.state || a.county || a.trct || a.blkgrp)) AS geo_id,
                 1 AS geocoded
            FROM (
                  SELECT pop.riipl_id,
                         MAX(a.obs_date) AS max_dt
                    FROM {population} pop
              INNER JOIN {address} a
                      ON pop.riipl_id = a.riipl_id AND
                         a.obs_date <= pop.initial_dt
                GROUP BY pop.riipl_id
                 ) pop
      INNER JOIN {address} a
              ON pop.riipl_id = a.riipl_id AND
                 pop.max_dt = a.obs_date
        GROUP BY pop.riipl_id
          """.format(**globals())

    with Connection() as cxn:
        values = pd.read_sql(sql, cxn._connection)

    values = values.merge(income, how="left", on="GEO_ID")\
                   .merge(fpl,    how="left", on="GEO_ID")

    features = features.join(values.set_index(index))
    del features["GEO_ID"]

    # Mean imputation
    features["GEOCODED"] = features.GEOCODED.fillna(0)
    features["BLKGRP_MEDIANINCOME"] = features.BLKGRP_MEDIANINCOME.fillna(
        features.BLKGRP_MEDIANINCOME.mean())
    features["BLKGRP_BELOWFPL"] = features.BLKGRP_BELOWFPL.fillna(
        features.BLKGRP_BELOWFPL.mean())

    labels = {
        "GEOCODED": "Has a geocoded home address",
        "BLKGRP_MEDIANINCOME":
        "Block group's median annual household income (2015 dollars)",
        "BLKGRP_BELOWFPL": "Share of block group below poverty line"
    }

    SaveFeatures(features, out, manifest, population, labels)
Пример #23
0
def main():

    ccs = pd.read_csv(ccs_file,
                      usecols=[0, 1],
                      quotechar="'",
                      skiprows=3,
                      names=["DIAG_CDE", "DISEASE"])
    ccs["DIAG_CDE"] = ccs.DIAG_CDE.str.strip()

    cci = pd.read_csv(cci_file,
                      usecols=[0, 2],
                      quotechar="'",
                      skiprows=2,
                      names=["DIAG_CDE", "CHRONIC"])
    cci = cci[cci.CHRONIC == 1]
    cci["DIAG_CDE"] = cci.DIAG_CDE.str.strip()

    index = ["RIIPL_ID"]
    features = CachePopulation(population, index).set_index(index)

    sql = """
          SELECT DISTINCT
                 pop.riipl_id,
                 d.diag_cde
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dim_date} dd
              ON lb.yrmo = dd.yrmo
      INNER JOIN {diag_cde} d
              ON pop.riipl_id = d.riipl_id AND
                 dd.date_dt = d.claim_dt
          """.format(**globals())

    with Connection() as cxn:
        diags = pd.read_sql(sql, cxn._connection)

    ndisease = diags.merge(ccs, how="inner", on="DIAG_CDE")[index+["DISEASE"]]\
                    .groupby(index)\
                    .agg({"DISEASE": pd.Series.nunique})\
                    .rename(columns={"DISEASE": "MEDICAID_DISEASE_SCORE"})
    features = features.join(ndisease)

    nchronic = diags.merge(cci, how="inner", on="DIAG_CDE")[index+["CHRONIC"]]\
                    .groupby(index)\
                    .sum()\
                    .rename(columns={"CHRONIC": "MEDICAID_CHRONIC_SCORE"})
    features = features.join(nchronic)

    sql = """
          SELECT pop.riipl_id,
                 COUNT(DISTINCT p.proc_cde) AS medicaid_procedures
            FROM {population} pop
       LEFT JOIN {lookback} lb
              ON pop.riipl_id = lb.riipl_id
       LEFT JOIN {dim_date} dd
              ON lb.yrmo = dd.yrmo
      INNER JOIN {proc_cde} p
              ON pop.riipl_id = p.riipl_id AND
                 dd.date_dt = p.claim_dt
        GROUP BY pop.riipl_id
          """.format(**globals())

    with Connection() as cxn:
        nproc = pd.read_sql(sql, cxn._connection, index_col=index)

    features = features.join(nproc).fillna(0)

    # Add squared terms
    for x in [
            "MEDICAID_DISEASE_SCORE", "MEDICAID_CHRONIC_SCORE",
            "MEDICAID_PROCEDURES"
    ]:
        features["{}_SQ".format(x)] = features[x] * features[x]

    labels = {
        "MEDICAID_DISEASE_SCORE": "Number of AHRQ CCS diseases",
        "MEDICAID_CHRONIC_SCORE": "Number of AHRQ CCI chronic conditions",
        "MEDICAID_PROCEDURES": "Number of distinct procedures",
        "MEDICAID_DISEASE_SCORE_SQ": "Squared number of AHRQ CCS diseases",
        "MEDICAID_CHRONIC_SCORE_SQ":
        "Squared number of AHRQ CCI chronic conditions",
        "MEDICAID_PROCEDURES_SQ": "Squared number of distinct procedures"
    }

    SaveFeatures(features, out, manifest, population, labels)
Пример #24
0
index = ["RIIPL_ID"]

# Load features

features = reduce(lambda x, y: x.join(y), [
    pd.read_csv(feature_file, index_col=index)
    for feature_file in feature_files
])

# Load manifest

manifest = pd.read_csv(manifest_file,
                       sep="\t",
                       names=["var", "desc"],
                       index_col="var")["desc"].to_dict()

# Create pairwise interactions

interactions = pd.DataFrame(index=features.index)
labels = {}

for var1 in set1:
    for var2 in set2:
        varx = "{}_X_{}".format(var1, var2)
        interactions[varx] = features[var1] * features[var2]
        labels[varx] = "'{}' X '{}'".format(manifest[var1], manifest[var2])

SaveFeatures(interactions, out, out_manifest, population, labels)

# vim: expandtab sw=4 ts=4