def test_multiple_regression_big_small(self):
     ''' test multiple regression with a small array, and a large array
     '''
     for n in [50, 500000]:
         x, y, x_with_intercept = make_arrays(n=n, k=10)
         
         lstsq_fit = lstsq(x_with_intercept, y)
         regressor_fit = linregress(x, y)
         
         # check the betas are tightly correlated
         corr = numpy.corrcoef(lstsq_fit[0][:-1], regressor_fit.coef_[:-1])[0, 1] ** 2
         self.assertTrue(corr > 0.99999)
         
         abs_diff = difference(lstsq_fit[0][:-1], regressor_fit.coef_[:-1])
         self.assertTrue(abs_diff.max() < 1e-5)
예제 #2
0
    def test_multiple_regression(self):
        ''' check a single large regression with many different linear models.
        
        Skip p-value checks for other tests, as statsmodels is too slow. Also,
        this uses arrasy of random numbers, i.e. random betas and p-values.
        '''
        x, y, x_with_intercept = make_arrays(n=5000, k=250)

        # mainly compare lstsq from scipy with regressor, but lstsq lacks p-values
        # so we need statsnmodels OLS for p-values, but that is very slow
        lstsq_fit = lstsq(x_with_intercept, y)
        regressor_fit = linregress(x, y)
        sm_fit = OLS(y, x_with_intercept).fit()
        sk_fit = LinearRegression().fit(x, y)

        # check that the betas are very tightly correlated
        corr = numpy.corrcoef(lstsq_fit[0][:-1],
                              regressor_fit.coef_[:-1])[0, 1]**2
        self.assertTrue(corr > 0.9999999)

        # check that lstsq betas correlate with sk_fit betas for extra sanity
        corr = numpy.corrcoef(lstsq_fit[0][:-1], sk_fit.coef_)[0, 1]**2
        self.assertTrue(corr > 0.9999999)

        # check the beta values are very close. They aren't identical, as this
        # package uses 32-bit floats, but the others convert to 64-bit doubles.
        # Differences should be on the order of 1e-8, which is the usual delta
        # between a 64-bit float and its 32-bit representation (for values
        # around 0.5). Float differences accumulate to around 2e-6 at most,
        # which makes a bigger relative difference for betas near zero.
        abs_diff = difference(lstsq_fit[0][:-1], regressor_fit.coef_[:-1])
        self.assertTrue(abs_diff.max() < 5e-6)

        # check the p-values are nearly identical in log10 space, and correlate
        p_delta = abs(
            numpy.log10(regressor_fit.pvalue) - numpy.log10(sm_fit.pvalues))
        self.assertTrue(p_delta.max() < 1e-3)
        corr = numpy.corrcoef(numpy.log10(regressor_fit.pvalue),
                              numpy.log10(sm_fit.pvalues))[0, 1]**2
        self.assertTrue(corr > 0.9999999)
 def test_regression_correlated(self):
     ''' check multiple regresion, where y-values depend on the x columns
     '''
     x, y, x_with_intercept = make_arrays(n=50000, k=10)
     
     # define some effect sizes (which decline across the columns)
     betas = numpy.logspace(0, -10, num=x.shape[1])
     y = (x * betas).sum(axis=1)
     
     lstsq_fit = lstsq(x_with_intercept, y)
     regressor_fit = linregress(x, y)
     
     # check differences versus the predefined betas
     diff = difference(betas, regressor_fit.coef_[:-1])
     self.assertTrue(diff.max() < 5e-6)
     
     corr = numpy.corrcoef(betas, regressor_fit.coef_[:-1])[0, 1] ** 2
     self.assertTrue(corr > 0.9999999)
     
     # and check difference versus lstsq fit
     corr = numpy.corrcoef(lstsq_fit[0][:-1], regressor_fit.coef_[:-1])[0, 1] ** 2
     self.assertTrue(corr > 0.9999999)