def createFeatures(store_path="HHP_release3.h5"):
    """
    Sample driver showing how widen_on_fields and avg_and_max_fields are used.

    Reads the tables stored in ``store_path``, builds one frame combining
    drug/lab counts, per-field claim counts, PayDelay aggregates and the
    days-in-hospital tables, stores that frame under the ``"features"`` key
    and writes everything back to ``store_path``.

    Parameters
    ----------
    store_path : str
        Path of the H5 store to read from and write back to. Defaults to the
        file name that used to be hard-coded here.

    Returns
    -------
    None
        The result is only persisted through ``clean.storeAsH5``.
    """
    data = clean.readH5Store(store_path)
    claim = data["claim"]
    drug = data["drug"]
    lab = data["lab"]
    member = data["member"]

    # Tag each days-in-hospital table with its year label before stacking.
    # NOTE(review): join() aligns on the index and the helper frame has a
    # default 0..n-1 index, so this assumes dih / dih_y3 do too - confirm.
    dih = data["dih"]
    dih = dih.join(pd.DataFrame({"Year": np.repeat("Y1", len(dih.index))}))
    dih_3 = data["dih_y3"]
    dih_3 = dih_3.join(pd.DataFrame({"Year": np.repeat("Y2", len(dih_3.index))}))

    # pd.concat stacks the rows exactly like DataFrame.append did, but is
    # also available on pandas releases where append no longer exists.
    days_in_hospital = pd.concat([dih, dih_3])
    days_in_hospital = days_in_hospital.set_index(["MemberID", "Year"])
    # Renames the columns left after indexing; requires exactly two of them.
    days_in_hospital.columns = ["NextYearTruncated", "Target"]

    drug = member.merge(drug, on="MemberID")
    drug_lab = drug.merge(lab, how="outer", on=["MemberID", "Year", "DSFS"])

    drug_start = datetime.datetime.now()
    drug_lab_count = count_drug_lab(drug_lab)  # now indexed by MemberID and Year
    drug_end = datetime.datetime.now()
    # total_seconds() keeps whole days; .seconds alone would silently drop them.
    logging.debug("drug_done. IT took %d seconds",
                  (drug_end - drug_start).total_seconds())

    claim_counting_fields = [
        ["Specialty", "PlaceSvc"],
        "LengthOfStay",
        "PrimaryConditionGroup",
        "CharlsonIndex",
        "ProcedureGroup",
    ]
    widen_start = datetime.datetime.now()
    claims_counted = widen_on_fields(claim, claim_counting_fields)
    widen_end = datetime.datetime.now()
    logging.debug("widen_done. it took %d seconds",
                  (widen_end - widen_start).total_seconds())

    avg_fields = ["PayDelay"]
    avg_start = datetime.datetime.now()
    avg_frame = avg_and_max_fields(claim, avg_fields)
    avg_end = datetime.datetime.now()
    logging.debug("avg_done. it took %d seconds",
                  (avg_end - avg_start).total_seconds())

    # Outer joins keep every index entry that appears in any of the frames.
    features_frame = (
        drug_lab_count
        .join(claims_counted, how="outer")
        .join(avg_frame, how="outer")
        .join(days_in_hospital, how="outer")
    )
    data["features"] = features_frame
    clean.storeAsH5(store_path, data)
import numpy as np
import pandas as pd  # pd.crosstab is used below but pandas was not imported here

import clean


def widen_on_fields(data, fields_counted):
    """
    Count, per MemberID and Year, how often each value of the given fields occurs.

    Parameters
    ----------
    data : DataFrame
        Must provide ``MemberID`` and ``Year`` columns plus every column
        named in ``fields_counted``.
    fields_counted : iterable
        Each entry is either a single column name, or a list/tuple/ndarray of
        column names that are cross-tabulated together as one combined field.

    Returns
    -------
    DataFrame or None
        One count column per observed value, named ``<field>_<value>`` (for a
        combined field the names are joined with ``_`` and the value part is
        the string form of the combined column label). Rows are indexed by
        MemberID and Year; the per-field tables are outer-joined. ``None`` is
        returned when ``fields_counted`` is empty.
    """
    rows = [data.MemberID, data.Year]
    res = None
    for field in fields_counted:
        is_group = isinstance(field, (list, tuple, np.ndarray))
        if is_group:
            cols = [data[name] for name in field]
            key = "_".join(field)
        else:
            cols = data[field]
            key = field
        # Positional arguments: the keywords were renamed from rows/cols to
        # index/columns across pandas versions, the positions were not.
        counts = pd.crosstab(rows, cols)
        counts.columns = [key + '_' + str(value) for value in counts.columns]
        res = counts if res is None else res.join(counts, how="outer")
    return res


# Demo run: widen the claim table on the single fields and on the
# Specialty/PlaceSvc pair, then report the size and columns of the result.
data = clean.readH5Store("HHP_release3.h5")
claim = data["claim"]
df = widen_on_fields(claim, [
    "Specialty",
    "PlaceSvc",
    "LengthOfStay",
    "PrimaryConditionGroup",
    "CharlsonIndex",
    "ProcedureGroup",
    ["Specialty", "PlaceSvc"],
])
# Written so the output is identical under Python 2 and Python 3.
print("%d rows" % len(df))
print(df.columns)
# print(df[:5])
def widen_on_fields(data, fields_counted):
    """
    Count, per MemberID and Year, how often each value of the given fields occurs.

    Parameters
    ----------
    data : DataFrame
        Must provide ``MemberID`` and ``Year`` columns plus every column
        named in ``fields_counted``.
    fields_counted : iterable
        Each entry is either a single column name, or a list/tuple/ndarray of
        column names that are cross-tabulated together as one combined field.

    Returns
    -------
    DataFrame or None
        One count column per observed value, named ``<field>_<value>`` (for a
        combined field the names are joined with ``_`` and the value part is
        the string form of the combined column label). Rows are indexed by
        MemberID and Year; the per-field tables are outer-joined. ``None`` is
        returned when ``fields_counted`` is empty.
    """
    member_year = [data.MemberID, data.Year]
    widened = None
    for field in fields_counted:
        if isinstance(field, (list, tuple, np.ndarray)):
            selected = [data[name] for name in field]
            prefix = "_".join(field)
        else:
            selected = data[field]
            prefix = field
        # Positional arguments: the keywords were renamed from rows/cols to
        # index/columns across pandas versions, the positions were not.
        table = pd.crosstab(member_year, selected)
        table.columns = [prefix + '_' + str(label) for label in table.columns]
        if widened is None:
            widened = table
        else:
            widened = widened.join(table, how="outer")
    return widened


# Demo run: widen the claim table on the single fields and on the
# Specialty/PlaceSvc pair, then report the size and columns of the result.
data = clean.readH5Store("HHP_release3.h5")
claim = data["claim"]
df = widen_on_fields(claim, [
    "Specialty",
    "PlaceSvc",
    "LengthOfStay",
    "PrimaryConditionGroup",
    "CharlsonIndex",
    "ProcedureGroup",
    ["Specialty", "PlaceSvc"],
])
# Written so the output is identical under Python 2 and Python 3.
print("%d rows" % len(df))
print(df.columns)
# print(df[:5])