示例#1
0
    def LRFunc(self, measureType):
        '''
        Flag the rows of one measure type as plausible or as OLS outliers.

        Works on the rows of ``self.dfA`` whose MEASURE_TYPE equals
        ``measureType`` and whose FILTER_FLAG is not 'WHO':

        1. Fit MEASURE_VAL on AGE with ``OLS`` (no constant term is added
           here) and compute per-row influence diagnostics.
        2. Keep the rows that pass all three screening cut-offs, with k=1
           and n the number of selected rows:
               Cook's distance (influence): 1
               DFFITS:                      2*sqrt(k/n)
               DFBETAS for AGE:             2/sqrt(n)
           Leverage is not used as a screening criterion.
        3. If two or fewer rows survive, set FILTER_FLAG to
           'OLS_FEW_REMAIN' for every selected row and stop.
        4. Otherwise fit SDS on AGE with ``linear_model.LinearRegression``
           using the surviving rows only, compute each selected row's
           residual, and divide it by the standard deviation of the
           surviving rows' residuals.
        5. Set FILTER_FLAG to 'PLAUSIBLE' where the absolute scaled
           residual is within ``LRCutoffSD[measureType]`` and to
           'OLS_OUTLIER' otherwise.

        Side effects: writes ``self.dfA['FILTER_FLAG']`` for the selected
        rows and stores the fit in ``self.stdVal[measureType]``,
        ``self.coef[measureType]`` and ``self.intercept[measureType]``
        (the latter three only when step 4 is reached).

        NOTE(review): ``OLS`` is presumably statsmodels' OLS and
        ``linear_model`` presumably scikit-learn's -- confirm against the
        file's imports.

        :param measureType: value matched against ``self.dfA.MEASURE_TYPE``
            and used as the key into ``LRCutoffSD``.
        :return: None
        '''
        # Keep the selection mask so the flags can be written back by
        # position instead of one ``.loc`` call per row.
        selected = ((self.dfA.MEASURE_TYPE == measureType)
                    & (self.dfA.FILTER_FLAG != 'WHO'))
        dft = self.dfA[selected].copy()

        N = len(dft)
        if N == 0:
            # Nothing to fit or flag; also avoids dividing by zero below.
            return

        regression = OLS(dft.MEASURE_VAL, dft.AGE).fit()
        test = regression.outlier_test()
        # Build the influence table once; every call recomputes it.
        summary = regression.get_influence().summary_frame()

        # OLS_BONFPVAL, OLS_STUDENTRES and N are diagnostic columns on the
        # local copy; nothing later in this method reads them.
        dft['OLS_BONFPVAL'] = test['bonf(p)']
        dft['OLS_STUDENTRES'] = test['student_resid']
        dft['OLS_INFLUENCE'] = summary.cooks_d
        dft['OLS_DFFITS'] = summary.dffits
        dft['OLS_DFB_AGE'] = summary.dfb_AGE
        dft['N'] = N

        k = 1
        coI = 1
        # float() keeps k/N a true division even under integer-division
        # semantics, where it would otherwise truncate to 0.
        coDf1 = 2 * (float(k) / N) ** 0.5
        coDf2 = 2 / (N ** 0.5)

        keep = ((dft['OLS_INFLUENCE'].abs() <= coI)
                & (dft['OLS_DFFITS'].abs() <= coDf1)
                & (dft['OLS_DFB_AGE'].abs() <= coDf2))
        dft1 = dft[keep]

        if len(dft1) <= 2:
            self.dfA.loc[selected, 'FILTER_FLAG'] = 'OLS_FEW_REMAIN'
            return

        # Second fit: SDS on AGE, trained on the screened rows only but
        # evaluated on every selected row.
        reg = linear_model.LinearRegression()
        reg.fit(dft1[['AGE']], dft1['SDS'])
        dft['pred1'] = reg.predict(dft[['AGE']])
        dft['diff1'] = dft['SDS'] - dft['pred1']
        stdVal = dft.loc[keep, 'diff1'].std()
        dft['STD_FOLD'] = dft.diff1 / stdVal

        self.stdVal[measureType] = stdVal
        self.coef[measureType] = reg.coef_[0]
        self.intercept[measureType] = reg.intercept_

        # A NaN STD_FOLD compares False and is therefore flagged as an
        # outlier, matching a row-by-row ``abs(x) <= cutoff`` check.
        within = dft['STD_FOLD'].abs() <= LRCutoffSD[measureType]
        flags = within.map({True: 'PLAUSIBLE', False: 'OLS_OUTLIER'})
        # dft preserves the row order of self.dfA[selected], so a
        # positional write through the mask lines up row for row.
        self.dfA.loc[selected, 'FILTER_FLAG'] = flags.values

        return