예제 #1
0
def getDataICD(patientID):
    """Return the in-patient and out-patient rows belonging to the given IDs.

    Both tables are loaded with ``getData`` from the module-level
    ``dataFolder`` (file names ``inpatientFilename`` / ``outpatientFilename``),
    filtered on the ``idName`` column, and concatenated in-patient first.
    The number of rows kept at each step is printed.

    Args:
        patientID: collection of IDs handed to ``Series.isin``.

    Returns:
        The ``pd.concat`` of the filtered in-patient and out-patient rows.
        (Presumably a DataFrame; depends on what ``getData`` returns.)
    """
    # Load both tables before filtering either of them.
    inpatientAll = getData(dataFolder, inpatientFilename)
    outpatientAll = getData(dataFolder, outpatientFilename)

    # In-patient rows whose ID is one of the requested ones.
    inpatientMask = inpatientAll[idName].isin(patientID)
    inpatientRows = inpatientAll.loc[inpatientMask]
    print('extracted number of inpatient record is %d' % (inpatientRows.shape[0]))

    # Same filter for the out-patient table.
    outpatientMask = outpatientAll[idName].isin(patientID)
    outpatientRows = outpatientAll.loc[outpatientMask]
    print('extracted number of outpatient record is %d' % (outpatientRows.shape[0]))

    # Stack the two selections, in-patient rows first.
    combined = pd.concat([inpatientRows, outpatientRows])
    print('extracted number of all records is %d' % (combined.shape[0]))
    return combined
예제 #2
0
def P2c():
    """Report how many 2008 CKD stage-5 patients are recorded at stage 6 in 2009.

    Merges the module-level in-patient and out-patient CKD tables, then
    counts (a) the distinct patients whose ``'CKD'`` value is 5 in 2008 and
    (b) those among them with a 2009 row whose ``'CKD'`` value is 6, and
    prints both counts.  Afterwards it loads the 2008 beneficiary summary and
    the ICD rows for the stage-5 patients, converts the ICD rows with
    ``convertICDtoCCS`` and prints the ``diagnosisName`` column for one
    hard-coded patient ID.

    Returns:
        None.  All results are printed.
    """
    dataAll = mergeInOutPatientCKD(dataCKD_inpatient, dataCKD_outpatient)

    # get 2008 and 2009 data
    dataAll2008 = dataAll[dataAll['Year'] == 2008]
    dataAll2009 = dataAll[dataAll['Year'] == 2009]

    # distinct patients recorded at stage 5 in 2008
    patient2008Stage5 = dataAll2008[dataAll2008['CKD'] == 5]
    idAll = patient2008Stage5[idName].drop_duplicates()

    # 2009 rows of those patients.  The ID Series is passed to isin() as-is
    # (same as the beneficiary filter below); Series.as_matrix() is
    # deprecated and no longer exists in pandas >= 1.0.
    patientStage5All = dataAll2009.loc[dataAll2009[idName].isin(idAll)]

    # get patient from stage 5 to stage 6
    patientTrans = patientStage5All[patientStage5All['CKD'] == 6]
    idTrans = patientTrans[idName].drop_duplicates()

    print('Total number of patients with CKD stage 5 in 2008 is: %d' % idAll.shape[0])
    print('Total number of patients transit from stage 5 to stage 6 (final stage) is: %d' % idTrans.shape[0])

    # load selected patients' beneficiary in 2008
    # NOTE(review): dataBeneficiary2008 is not used again in the visible code;
    # the load is kept so the file is still read as before.
    beneficiaryFilename = 'DE1_0_2008_Beneficiary_Summary_File_Sample_1.csv'
    dataBeneficiary2008 = getData(dataFolder, beneficiaryFilename)
    dataBeneficiary2008 = dataBeneficiary2008.loc[dataBeneficiary2008[idName].isin(idAll)]

    # get all the ICD variables for patients with stage 5 in 2008
    dataICD = getDataICD(idAll)

    # convert ICD code to CCS code
    dataCCD = convertICDtoCCS(dataICD)
    # NOTE(review): hard-coded patient ID, looks like a debugging spot check.
    print(dataCCD[dataCCD[idName] == 'FCBAB958EB8681CC'][diagnosisName])
예제 #3
0
def P3d():
    """Plot a 2-D PCA of the 2009 chronic-condition columns and print k-means labels.

    Loads the 2009 file through ``getData``, attaches the ``"sumReimburse"``
    column returned by ``getTotalReimbursement``, passes the ``chronicCond``
    columns to ``plot2D_PCA``, then fits a two-cluster ``KMeans``
    (``random_state=25``) on those same columns and prints the labels.

    Returns:
        None.
    """
    targetYear = 2009

    # read in data
    frame = getData(dataFolder, filePattern % (targetYear))
    totals = getTotalReimbursement(dataFolder, filePattern, saveFilePattern, targetYear)
    frame["sumReimburse"] = totals["sumReimburse"]
    plot2D_PCA(frame[chronicCond])

    # Local import: scikit-learn is only required once this point is reached.
    from sklearn.cluster import KMeans

    # k-means with two clusters and a fixed seed
    model = KMeans(random_state=25, n_clusters=2)
    clusterIds = model.fit_predict(frame[chronicCond])
    print(clusterIds)
예제 #4
0
def getTotalReimbursement(dataFolder, filePattern, saveFilePattern, year):
    """Return the per-row reimbursement total for one year, cached as CSV.

    If the cache file ``saveFilePattern % year`` exists it is read back;
    otherwise the source file ``filePattern % year`` is loaded with
    ``getData``, the ``reimName`` entries of every row are summed into
    ``"sumReimburse"``, and the ``idName`` / ``"sumReimburse"`` columns are
    written to the cache file.

    Args:
        dataFolder: folder passed through to ``getData``.
        filePattern: ``%``-format pattern of the source file name.
        saveFilePattern: ``%``-format pattern of the cache file path.
        year: value substituted into both patterns.

    Returns:
        DataFrame with the ``idName`` and ``"sumReimburse"`` columns.
    """
    saveFilename = saveFilePattern % (year)

    # if key data has been extracted, read in extracted data;
    # else preprocess data and save into csv format
    if os.path.isfile(saveFilename):
        # The cache is written with its index as the first column, so read
        # that column back as the index.  Without index_col a cache hit would
        # return an extra "Unnamed: 0" column and a fresh RangeIndex, i.e. a
        # differently shaped frame than the first (uncached) call.
        return pd.read_csv(saveFilename, index_col=0)

    fileName = filePattern % (year)
    data = getData(dataFolder, fileName)
    # calculate all the reimbursement
    data["sumReimburse"] = data.apply(lambda row: row[reimName].sum(), axis=1)
    dataCleaned = data[[idName, "sumReimburse"]]
    dataCleaned.to_csv(saveFilename)
    return dataCleaned
예제 #5
0
def getCKD(dataFolder, fileName, saveFilename):
    """Return the ID / year / CKD table for one claims file, cached as CSV.

    If ``saveFilename`` exists it is read back.  Otherwise ``fileName`` is
    loaded with ``getData``, rows whose ``'CLM_FROM_DT'`` is not finite are
    dropped, ``'Year'`` is parsed from ``'CLM_FROM_DT'`` (read as
    ``YYYYMMDD``), ``'CKD'`` is computed per row by ``strMatching``, and the
    ``idName`` / ``'Year'`` / ``'CKD'`` columns are written to
    ``saveFilename``.

    Args:
        dataFolder: folder passed through to ``getData``.
        fileName: source file name passed through to ``getData``.
        saveFilename: path of the CSV cache.

    Returns:
        DataFrame with the ``idName``, ``'Year'`` and ``'CKD'`` columns.

    Raises:
        ValueError: from ``time.strptime`` when a ``'CLM_FROM_DT'`` value is
            not a valid ``%Y%m%d`` date.
    """
    # if key data has been extracted, read in extracted data;
    # else preprocess data and save into csv format
    if os.path.isfile(saveFilename):
        # The cache is written with its index as the first column; reading it
        # back as the index keeps a cache hit identical in shape to the first
        # call (no stray "Unnamed: 0" column, original row labels).
        return pd.read_csv(saveFilename, index_col=0)

    data = getData(dataFolder, fileName)
    # .copy() makes the filtered rows an independent frame, so the column
    # assignments below do not write into a slice (SettingWithCopyWarning).
    data = data[np.isfinite(data['CLM_FROM_DT'])].copy()

    # parse year
    print('Parse year')
    # Only one column is needed, so map over that Series rather than
    # building a full row object for every record.
    data['Year'] = data['CLM_FROM_DT'].apply(
        lambda claimDate: time.strptime(str(int(claimDate)), '%Y%m%d').tm_year
    )

    # get CKD
    print('Get CKD')
    data['CKD'] = data.apply(strMatching, axis=1)

    dataCleaned = data[[idName, 'Year', 'CKD']]
    dataCleaned.to_csv(saveFilename)
    return dataCleaned
예제 #6
0
def P3c():
    data = getData(dataFolder, filePattern % (2009))
    reimbursement2009 = getTotalReimbursement(dataFolder, filePattern, saveFilePattern, 2009)
    data["sumReimburse"] = reimbursement2009["sumReimburse"]
    # split into training and testing
    train, test = train_test_split(data, test_size=0.3)
    print "Total data size: %d; training data size: %d; test data size: %d" % (
        data.shape[0],
        train.shape[0],
        test.shape[0],
    )

    # restricted linear regression
    print "Start training restricted linear regression"
    regRLS = linear_model.Ridge(alpha=0.5)
    regRLS.fit(train[chronicCond], train["sumReimburse"])
    # print "coefficients are: %d" %clfRidge.coef_
    print regRLS.predict(train[chronicCond]) - train["sumReimburse"]

    # neural network (perceptron)
    """