def LRFunc(self, measureType):
    '''
    Flag measurements of one type as plausible or outlying via regression on AGE.

    Works on the rows of ``self.dfA`` whose MEASURE_TYPE equals *measureType*
    and whose FILTER_FLAG is not 'WHO'.

    Step 1 - influence screening: MEASURE_VAL is regressed on AGE with ``OLS``
    (presumably statsmodels; note that no constant column is passed, so the
    fit has a single regressor, k = 1). A row survives the screening when all
    of the following hold:
        |Cook's distance| <= 1
        |DFFITS|          <= 2 * sqrt(k / n)
        |DFBETAS (AGE)|   <= 2 / sqrt(n)
    A leverage cut-off of 3k/n was listed in the original notes, but no
    leverage filter is applied by this code.

    Step 2 - if at most two rows survive (or at most two rows were available
    to begin with), every candidate row is flagged 'OLS_FEW_REMAIN' and
    nothing else is updated.

    Step 3 - otherwise SDS is regressed on AGE using only the surviving rows.
    Residuals are computed for all candidate rows and divided by the standard
    deviation of the survivors' residuals. Rows whose absolute scaled residual
    is within ``LRCutoffSD[measureType]`` become 'PLAUSIBLE'; all others
    (including rows whose scaled residual is NaN) become 'OLS_OUTLIER'.

    Side effects: updates ``self.dfA['FILTER_FLAG']`` and, when step 3 runs,
    ``self.stdVal``, ``self.coef`` and ``self.intercept`` at key *measureType*.

    Returns None.
    '''
    import logging

    candidate_mask = ((self.dfA.MEASURE_TYPE == measureType)
                      & (self.dfA.FILTER_FLAG != 'WHO'))
    dft = self.dfA[candidate_mask].copy()

    k = 1
    N = len(dft)
    logging.getLogger(__name__).debug(
        'LRFunc(%s): %d candidate rows', measureType, N)

    # With two or fewer candidates the screening below can never leave more
    # than two rows, so the result is known up front. Returning here also
    # avoids fitting a degenerate model and dividing by N == 0.
    if N <= 2:
        self.dfA.loc[dft.index, 'FILTER_FLAG'] = 'OLS_FEW_REMAIN'
        return

    regression = OLS(dft.MEASURE_VAL, dft.AGE).fit()
    # Build the influence table once instead of once per column.
    influence = regression.get_influence().summary_frame()
    test = regression.outlier_test()

    dft['OLS_BONFPVAL'] = test['bonf(p)']
    dft['OLS_STUDENTRES'] = test['student_resid']
    dft['OLS_INFLUENCE'] = influence.cooks_d
    dft['OLS_DFFITS'] = influence.dffits
    dft['OLS_DFB_AGE'] = influence.dfb_AGE
    dft['N'] = N

    # Cut-offs for Cook's distance, DFFITS and DFBETAS respectively.
    # float(k) keeps the ratio from truncating to 0 under integer division.
    coI = 1
    coDf1 = 2 * (float(k) / N) ** 0.5
    coDf2 = 2 / (N ** 0.5)

    dft1 = dft[(dft['OLS_INFLUENCE'].abs() <= coI)
               & (dft['OLS_DFFITS'].abs() <= coDf1)
               & (dft['OLS_DFB_AGE'].abs() <= coDf2)]

    if len(dft1) <= 2:
        self.dfA.loc[dft.index, 'FILTER_FLAG'] = 'OLS_FEW_REMAIN'
        return

    # Second fit: SDS against AGE, trained only on the screened rows.
    reg = linear_model.LinearRegression()
    reg.fit(dft1[['AGE']], dft1['SDS'])
    dft['pred1'] = reg.predict(dft[['AGE']])
    dft['diff1'] = dft['SDS'] - dft['pred1']

    # Residual spread is estimated from the screened rows only.
    stdVal = dft[dft.index.isin(dft1.index)].diff1.std()
    dft['STD_FOLD'] = dft.diff1 / stdVal

    self.stdVal[measureType] = stdVal
    self.coef[measureType] = reg.coef_[0]
    self.intercept[measureType] = reg.intercept_

    # NaN compares False, so rows with an undefined STD_FOLD are treated as
    # outliers. NOTE(review): assumes index labels are unique, so the two
    # assignments below touch disjoint rows - confirm against how dfA is built.
    within_cutoff = (dft['STD_FOLD'].abs() <= LRCutoffSD[measureType]).values
    self.dfA.loc[dft.index[within_cutoff], 'FILTER_FLAG'] = 'PLAUSIBLE'
    self.dfA.loc[dft.index[~within_cutoff], 'FILTER_FLAG'] = 'OLS_OUTLIER'
    return