示例#1
0
文件: features.py 项目: szs8/kaggle
def createFeatures(store_path="HHP_release3.h5"):
    """
    Build the combined feature table and save it back into the H5 store.

    Reads the "claim", "drug", "lab", "member", "dih" and "dih_y3" tables
    from the store at ``store_path`` and outer-joins four frames:

    * drug/lab counts produced by count_drug_lab,
    * widened claim counts produced by widen_on_fields,
    * PayDelay aggregates produced by avg_and_max_fields,
    * the days-in-hospital tables, indexed by MemberID and Year.

    The joined frame is placed under the "features" key and the whole store
    is written back to ``store_path``. The function also serves as a usage
    sample for widen_on_fields and avg_and_max_fields.

    store_path -- path of the H5 store to read and rewrite; defaults to
                  "HHP_release3.h5", the name that was previously hard-coded.

    Returns None; the result is only visible through the rewritten store.
    """
    data = clean.readH5Store(store_path)
    claim = data["claim"]
    drug = data["drug"]
    lab = data["lab"]
    member = data["member"]

    # Tag each days-in-hospital table with a Year label before stacking them.
    # NOTE(review): DataFrame.join aligns on the index, so this assumes both
    # tables carry a default 0..n-1 index -- confirm against readH5Store.
    dih = data["dih"]
    dih = dih.join(pd.DataFrame({"Year": np.repeat("Y1", len(dih.index))}))
    dih_3 = data["dih_y3"]
    dih_3 = dih_3.join(pd.DataFrame({"Year": np.repeat("Y2", len(dih_3.index))}))

    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    days_in_hospital = pd.concat([dih, dih_3])
    days_in_hospital = days_in_hospital.set_index(["MemberID", "Year"])
    # NOTE(review): assumes exactly two columns remain once MemberID and Year
    # have moved into the index -- confirm against the stored schema.
    days_in_hospital.columns = ["NextYearTruncated", "Target"]

    drug = member.merge(drug, on="MemberID")
    drug_lab = drug.merge(lab, how="outer", on=["MemberID", "Year", "DSFS"])

    # total_seconds() is used for all timings below: timedelta.seconds is only
    # the seconds component and wraps around after a day.
    drug_start = datetime.datetime.now()
    # Per the original author's note, the result is indexed by MemberID and Year.
    drug_lab_count = count_drug_lab(drug_lab)
    drug_end = datetime.datetime.now()
    logging.debug("drug_done.  IT took %d seconds",
                  (drug_end - drug_start).total_seconds())

    # The first entry is a list, so Specialty and PlaceSvc are handed to
    # widen_on_fields together as one group rather than as separate fields.
    claim_counting_fields = [
        ["Specialty", "PlaceSvc"],
        "LengthOfStay",
        "PrimaryConditionGroup",
        "CharlsonIndex",
        "ProcedureGroup",
    ]
    widen_start = datetime.datetime.now()
    claims_counted = widen_on_fields(claim, claim_counting_fields)
    widen_end = datetime.datetime.now()
    logging.debug("widen_done.  it took %d seconds",
                  (widen_end - widen_start).total_seconds())

    avg_fields = ["PayDelay"]
    avg_start = datetime.datetime.now()
    avg_frame = avg_and_max_fields(claim, avg_fields)
    avg_end = datetime.datetime.now()
    logging.debug("avg_done.  it took %d seconds",
                  (avg_end - avg_start).total_seconds())

    # Outer joins keep every index entry that appears in any of the frames.
    # NOTE(review): presumably all four frames share the (MemberID, Year)
    # index -- verify for the helper outputs.
    features_frame = (
        drug_lab_count
        .join(claims_counted, how="outer")
        .join(avg_frame, how="outer")
        .join(days_in_hospital, how="outer")
    )
    data["features"] = features_frame
    clean.storeAsH5(store_path, data)
示例#2
0
#!/usr/bin/env python

import clean

# Convert the raw files under RAW_DATA_DIR into a single H5 store.
RAW_DATA_DIR = './data'
H5_STORE_PATH = 'HHP_release3.h5'

data = clean.loadRawData(RAW_DATA_DIR)
clean.storeAsH5(H5_STORE_PATH, data)