def getDataICD(patientID):
    """Return all in-patient and out-patient rows belonging to the given patients.

    Both claim files are loaded through ``getData`` from ``dataFolder``, each
    is filtered to rows whose ``idName`` column value is contained in
    ``patientID``, and the two filtered frames are concatenated (in-patient
    rows first).  The number of rows kept at each step is printed.

    Args:
        patientID: list-like of patient identifiers, passed to ``Series.isin``.

    Returns:
        The concatenated DataFrame of matching in-patient and out-patient rows.
    """
    # Load both claim files up front, before any filtering is done.
    inpatientAll = getData(dataFolder, inpatientFilename)
    outpatientAll = getData(dataFolder, outpatientFilename)

    loaded = [
        (inpatientAll, 'extracted number of inpatient record is %d'),
        (outpatientAll, 'extracted number of outpatient record is %d'),
    ]

    # Keep only the rows of the requested patients, reporting each count.
    selected = []
    for claims, message in loaded:
        matching = claims.loc[claims[idName].isin(patientID)]
        print(message % (matching.shape[0]))
        selected.append(matching)

    # Stack in-patient rows on top of out-patient rows.
    dataICD = pd.concat(selected)
    print('extracted number of all records is %d' % (dataICD.shape[0]))
    return dataICD
def P2c():
    """Report 2008 stage-5 CKD patients and those recorded at stage 6 in 2009.

    Steps, all driven by module-level data and helpers:

    1. Merge the in-patient and out-patient CKD tables with
       ``mergeInOutPatientCKD`` and split the result by ``Year`` (2008 / 2009).
    2. Take the distinct patient IDs whose 2008 rows have ``CKD == 5``.
    3. Among the 2009 rows of those patients, take the distinct IDs with
       ``CKD == 6`` (the "transition" group) and print both counts.
    4. Load the 2008 beneficiary summary for the stage-5 patients.
    5. Fetch every claim row of the stage-5 patients via ``getDataICD``,
       convert it with ``convertICDtoCCS`` and print the ``diagnosisName``
       column for one hard-coded patient ID.

    Returns:
        None.  Results are only printed.
    """
    dataAll = mergeInOutPatientCKD(dataCKD_inpatient, dataCKD_outpatient)

    # Split the merged records by claim year.
    dataAll2008 = dataAll[dataAll['Year'] == 2008]
    dataAll2009 = dataAll[dataAll['Year'] == 2009]

    # Patients at stage 5 in 2008, and their distinct IDs.
    patient2008Stage5 = dataAll2008[dataAll2008['CKD'] == 5]
    idAll = patient2008Stage5[idName].drop_duplicates()

    # 2009 rows of those patients.  ``isin`` accepts any list-like, so the ID
    # Series is passed directly; ``Series.as_matrix()`` was removed in
    # pandas 1.0 and is not needed here.
    patientStage5All = dataAll2009.loc[dataAll2009[idName].isin(idAll)]

    # Patients that moved from stage 5 (2008) to stage 6 (2009).
    patientTrans = patientStage5All[patientStage5All['CKD'] == 6]
    idTrans = patientTrans[idName].drop_duplicates()

    print('Total number of patients with CKD stage 5 in 2008 is: %d' % idAll.shape[0])
    print('Total number of patients transit from stage 5 to stage 6 (final stage) is: %d' % idTrans.shape[0])

    # Load the selected patients' 2008 beneficiary summary.
    # NOTE(review): the filtered frame is not used further down in this
    # function; the load is kept so that getData's side effects are unchanged.
    beneficiaryFilename = 'DE1_0_2008_Beneficiary_Summary_File_Sample_1.csv'
    dataBeneficiary2008 = getData(dataFolder, beneficiaryFilename)
    dataBeneficiary2008 = dataBeneficiary2008.loc[dataBeneficiary2008[idName].isin(idAll)]

    # All claim rows for the 2008 stage-5 patients, converted by convertICDtoCCS.
    dataICD = getDataICD(idAll)
    dataCCD = convertICDtoCCS(dataICD)

    # Spot check: diagnosis column of one specific patient.
    print(dataCCD[dataCCD[idName] == 'FCBAB958EB8681CC'][diagnosisName])
def P3d():
    """Cluster the 2009 records on the ``chronicCond`` columns and print labels.

    The 2009 file is loaded with ``getData``, the ``sumReimburse`` column from
    ``getTotalReimbursement`` is attached to it, the ``chronicCond`` columns
    are handed to ``plot2D_PCA``, and a two-cluster k-means
    (``random_state=25``) is fitted on the same columns.  The resulting
    cluster label of every row is printed.

    Returns:
        None.
    """
    year = 2009

    # Load the yearly file and attach the per-row reimbursement total.
    records = getData(dataFolder, filePattern % year)
    totals = getTotalReimbursement(dataFolder, filePattern, saveFilePattern, year)
    records["sumReimburse"] = totals["sumReimburse"]

    plot2D_PCA(records[chronicCond])

    # k-means clustering; scikit-learn is imported only when this step runs.
    from sklearn.cluster import KMeans

    clusterer = KMeans(n_clusters=2, random_state=25)
    print(clusterer.fit_predict(records[chronicCond]))
def getTotalReimbursement(dataFolder, filePattern, saveFilePattern, year):
    """Return the per-row total reimbursement for one year, cached as CSV.

    If the cache file ``saveFilePattern % year`` already exists it is read and
    returned.  Otherwise the file ``filePattern % year`` is loaded through
    ``getData``, the ``reimName`` entries of every row are summed into a new
    ``sumReimburse`` column, and the ``idName`` / ``sumReimburse`` columns are
    written to the cache file and returned.

    Args:
        dataFolder: folder passed to ``getData``.
        filePattern: ``%``-format pattern of the input file name, taking the year.
        saveFilePattern: ``%``-format pattern of the cache file path, taking the year.
        year: value substituted into both patterns.

    Returns:
        DataFrame with the ``idName`` column and ``sumReimburse``.
    """
    saveFilename = saveFilePattern % (year)

    if os.path.isfile(saveFilename):
        # Key data was extracted on an earlier run: reuse it.
        dataCleaned = pd.read_csv(saveFilename)
        # Caches written by earlier versions also stored the DataFrame index,
        # which read_csv returns as an extra "Unnamed: 0" column.  Drop it so
        # a cache hit returns the same columns as a fresh computation.
        legacyIndexCols = [
            col for col in dataCleaned.columns if str(col).startswith('Unnamed:')
        ]
        return dataCleaned.drop(legacyIndexCols, axis=1)

    # No cache yet: preprocess the raw data and save the result as CSV.
    fileName = filePattern % (year)
    data = getData(dataFolder, fileName)

    # Sum all reimbursement entries of each row.
    data["sumReimburse"] = data.apply(lambda row: row[reimName].sum(), axis=1)
    dataCleaned = data[[idName, "sumReimburse"]]

    # index=False keeps the cache file limited to the two returned columns.
    dataCleaned.to_csv(saveFilename, index=False)
    return dataCleaned
def getCKD(dataFolder, fileName, saveFilename):
    """Return the ``idName`` / ``Year`` / ``CKD`` table for one claim file, cached as CSV.

    If ``saveFilename`` already exists it is read and returned.  Otherwise the
    claim file is loaded through ``getData``, rows whose ``CLM_FROM_DT`` is not
    finite are dropped, the year is parsed from ``CLM_FROM_DT`` (read as a
    ``YYYYMMDD`` number), ``strMatching`` is applied to every row to fill the
    ``CKD`` column, and the three columns are written to ``saveFilename``.

    Args:
        dataFolder: folder passed to ``getData``.
        fileName: claim file name passed to ``getData``.
        saveFilename: path of the CSV cache file.

    Returns:
        DataFrame with the ``idName`` column, ``Year`` and ``CKD``.
    """
    if os.path.isfile(saveFilename):
        # Key data was extracted on an earlier run: reuse it.
        dataCleaned = pd.read_csv(saveFilename)
        # Caches written by earlier versions also stored the DataFrame index,
        # which read_csv returns as an extra "Unnamed: 0" column.  Drop it so
        # a cache hit returns the same columns as a fresh computation.
        legacyIndexCols = [
            col for col in dataCleaned.columns if str(col).startswith('Unnamed:')
        ]
        return dataCleaned.drop(legacyIndexCols, axis=1)

    # No cache yet: preprocess the raw data and save the result as CSV.
    data = getData(dataFolder, fileName)
    # Copy after filtering so the column assignments below do not write into
    # a slice of the loaded frame.
    data = data[np.isfinite(data['CLM_FROM_DT'])].copy()

    print('Parse year')
    # Only the date column is needed, so apply over that Series rather than
    # building a full row object per record.
    data['Year'] = data['CLM_FROM_DT'].apply(
        lambda claimDate: time.strptime(str(int(claimDate)), '%Y%m%d').tm_year
    )

    print('Get CKD')
    data['CKD'] = data.apply(strMatching, axis=1)

    dataCleaned = data[[idName, 'Year', 'CKD']]
    # index=False keeps the cache file limited to the three returned columns.
    dataCleaned.to_csv(saveFilename, index=False)
    return dataCleaned
def P3c(): data = getData(dataFolder, filePattern % (2009)) reimbursement2009 = getTotalReimbursement(dataFolder, filePattern, saveFilePattern, 2009) data["sumReimburse"] = reimbursement2009["sumReimburse"] # split into training and testing train, test = train_test_split(data, test_size=0.3) print "Total data size: %d; training data size: %d; test data size: %d" % ( data.shape[0], train.shape[0], test.shape[0], ) # restricted linear regression print "Start training restricted linear regression" regRLS = linear_model.Ridge(alpha=0.5) regRLS.fit(train[chronicCond], train["sumReimburse"]) # print "coefficients are: %d" %clfRidge.coef_ print regRLS.predict(train[chronicCond]) - train["sumReimburse"] # neural network (perceptron) """