def createFeatures(store_path="HHP_release3.h5"):
    """Build the combined feature frame and save it back into the H5 store.

    Sample driver showing how ``count_drug_lab``, ``widen_on_fields`` and
    ``avg_and_max_fields`` fit together.  It reads the tables from the
    store, derives drug/lab counts, widened claim counts and PayDelay
    aggregates, outer-joins them with the days-in-hospital targets, and
    writes the result back under the ``"features"`` key.

    Args:
        store_path: Path of the H5 store that is read and then rewritten.
            Defaults to the file name the original script hard-coded.

    Returns:
        None.  The result is persisted through ``clean.storeAsH5``.
    """
    data = clean.readH5Store(store_path)
    claim = data["claim"]
    drug = data["drug"]
    lab = data["lab"]
    member = data["member"]

    # Tag each days-in-hospital table with a Year label so both can share a
    # (MemberID, Year) index.  ``assign`` sets the label on every row
    # regardless of the table's index, whereas joining a fresh RangeIndex
    # frame leaves NaN wherever the stored index is not 0..n-1.
    # NOTE(review): "dih_y3" is labelled "Y2" -- presumably the label is the
    # year of the claims these targets pair with; confirm against the data.
    dih = data["dih"].assign(Year="Y1")
    dih_3 = data["dih_y3"].assign(Year="Y2")

    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    days_in_hospital = pd.concat([dih, dih_3])
    days_in_hospital = days_in_hospital.set_index(["MemberID", "Year"])
    # Positional rename: relies on exactly two non-index columns remaining,
    # in this order.
    days_in_hospital.columns = ["NextYearTruncated", "Target"]

    drug = member.merge(drug, on="MemberID")
    drug_lab = drug.merge(lab, how="outer", on=["MemberID", "Year", "DSFS"])

    drug_start = datetime.datetime.now()
    drug_lab_count = count_drug_lab(drug_lab)  # now indexed by MemberID and Year
    drug_end = datetime.datetime.now()
    # total_seconds() keeps whole days that the .seconds attribute drops;
    # %d still truncates to an integer, so the message text is unchanged.
    logging.debug("drug_done. IT took %d seconds",
                  (drug_end - drug_start).total_seconds())

    # A nested list is passed through to widen_on_fields as a single entry.
    # NOTE(review): presumably the nested fields are counted jointly --
    # confirm in widen_on_fields.
    claim_counting_fields = [
        ["Specialty", "PlaceSvc"],
        "LengthOfStay",
        "PrimaryConditionGroup",
        "CharlsonIndex",
        "ProcedureGroup",
    ]
    widen_start = datetime.datetime.now()
    claims_counted = widen_on_fields(claim, claim_counting_fields)
    widen_end = datetime.datetime.now()
    logging.debug("widen_done. it took %d seconds",
                  (widen_end - widen_start).total_seconds())

    avg_fields = ["PayDelay"]
    avg_start = datetime.datetime.now()
    avg_frame = avg_and_max_fields(claim, avg_fields)
    avg_end = datetime.datetime.now()
    logging.debug("avg_done. it took %d seconds",
                  (avg_end - avg_start).total_seconds())

    # Outer joins keep every index entry that appears in any of the frames.
    features_frame = (
        drug_lab_count
        .join(claims_counted, how="outer")
        .join(avg_frame, how="outer")
        .join(days_in_hospital, how="outer")
    )

    data["features"] = features_frame
    clean.storeAsH5(store_path, data)
#!/usr/bin/env python
# Conversion script: hands the result of clean.loadRawData('./data') to
# clean.storeAsH5 under the file name 'HHP_release3.h5'.
# NOTE(review): presumably this turns the raw files in ./data into the H5
# store that the feature-building code reads -- confirm in the clean module.
import clean

# Runs at module level, so importing this file triggers the load and store.
data = clean.loadRawData('./data')
clean.storeAsH5('HHP_release3.h5', data)