def test_nan_fast_cov_just_x(self):
    """Compare fast_cov.nan_fast_cov(x) with numpy.cov on NaN-containing x.

    Entries that numpy.cov reports as finite must match directly. The
    entries numpy.cov reports as NaN are recomputed pairwise on rows 1
    onward and compared with column 1 and row 1 of the result.
    """
    logger.debug("*************happy path just x")

    x, _ = TestFastCov.build_nan_containing_x_y()

    # numpy.cov propagates NaN, so only its finite entries are usable as a
    # direct reference for the NaN-aware implementation.
    reference = numpy.cov(x, rowvar=False)
    logger.debug(
        "expected with nan's - ex_with_nan:\n{}".format(reference))

    result = fast_cov.nan_fast_cov(x)
    logger.debug("r:\n{}".format(result))

    finite = ~numpy.isnan(reference)
    self.assertTrue(numpy.allclose(reference[finite], result[finite]))

    # Covariance of each of the three columns against column 1, skipping
    # row 0 (presumably the row holding the NaN in column 1 -- the fixture
    # is not visible here).
    anchor = x[1:, 1]
    pairwise = [
        numpy.cov(x[1:, col], anchor, bias=False)[0, 1]
        for col in range(3)
    ]
    logger.debug(
        "calculate entries that would be nan - check_nominal_nans: {}".
        format(pairwise))

    # The result is expected to agree along both column 1 and row 1.
    self.assertTrue(numpy.allclose(pairwise, result[:, 1]))
    self.assertTrue(numpy.allclose(pairwise, result[1, :]))
def test_nan_fast_cov_all_nan(self):
    """Feed a single all-NaN column to nan_fast_cov and expect one NaN out."""
    # Build the 3x1 column directly, then overwrite every entry with NaN.
    x = numpy.zeros((3, 1))
    x[:] = numpy.nan
    logger.debug("x:\n{}".format(x))

    result = fast_cov.nan_fast_cov(x)
    logger.debug("r:\n{}".format(result))

    nan_count = numpy.sum(numpy.isnan(result))
    self.assertEqual(1, nan_count)
def test_nan_fast_cov_x_and_y(self):
    """Compare fast_cov.nan_fast_cov(x, y) with the x-vs-y block of numpy.cov.

    Finite reference entries must match directly. Entries that numpy.cov
    reports as NaN are recomputed pairwise on row subsets and compared
    with the result, and entry [1, 0] is expected to stay NaN.
    """
    logger.debug("*************happy path x and y")

    x, y = TestFastCov.build_nan_containing_x_y()

    stacked = numpy.hstack([x, y])
    logger.debug("combined:\n{}".format(stacked))
    logger.debug("combined.shape: {}".format(stacked.shape))

    # x occupies the first half of the stacked columns and y the second,
    # so the upper-right quadrant of the full covariance is cov(x, y).
    half = stacked.shape[1] // 2
    full_cov = numpy.cov(stacked, rowvar=False)
    logger.debug(
        "raw expected produced from numpy.cov on full combined - raw_ex:\n{}"
        .format(full_cov))
    expected = full_cov[:half, half:]
    logger.debug("expected ex:\n{}".format(expected))

    result = fast_cov.nan_fast_cov(x, y)
    logger.debug("r:\n{}".format(result))

    finite = ~numpy.isnan(expected)
    logger.debug("ex[non_nan_locs]: {}".format(expected[finite]))
    logger.debug("r[non_nan_locs]: {}".format(result[finite]))
    self.assertTrue(numpy.allclose(expected[finite], result[finite]))

    # Row 1 of the result: x column 1 against y columns 1 and 2, using
    # rows 1 onward.
    x_anchor = x[1:, 1]
    row_one = [numpy.cov(x_anchor, y[1:, col])[0, 1] for col in (1, 2)]
    logger.debug(
        "calculate entries that would be nan - check_nominal_nans: {}".
        format(row_one))
    logger.debug(
        "r values to compare to - r[1, 1:]: {}".format(result[1, 1:]))
    self.assertTrue(numpy.allclose(row_one, result[1, 1:]))

    # Column 0 of the result: x columns 0 and 2 against y column 0, using
    # only the first two rows.
    y_anchor = y[:2, 0]
    col_zero = [numpy.cov(x[:2, col], y_anchor)[0, 1] for col in (0, 2)]
    logger.debug(
        "calculate entries that would be nan - check_nominal_nans: {}".
        format(col_zero))
    logger.debug("r values to compare to - r[[0,2], 0]: {}".format(
        result[[0, 2], 0]))
    self.assertTrue(numpy.allclose(col_zero, result[[0, 2], 0]))

    self.assertTrue(
        numpy.isnan(result[1, 0]),
        """expect this entry to be nan b/c for the intersection of x[:,1] and y[:,0] there is only one entry in common, therefore covariance is undefined"""
    )
def nan_fast_corr(x, y=None, destination=None):
    """Compute the Pearson correlation matrix of columns, ignoring NaN values.

    Correlates the columns of x (MxN) with each other or, when y (OxP) is
    given, the columns of x with the columns of y. In the language of
    statistics the columns are the variables and the rows are the
    observations.

    Args:
        x (numpy array-like): MxN in shape.
        y (optional, numpy array-like): OxP in shape. M (# rows in x) must
            equal O (# rows in y).
        destination (optional, numpy array-like): location where the results
            are stored as they are calculated (e.g. a numpy memmap of a
            file); it is forwarded to fast_cov.nan_fast_cov.

    Returns:
        (numpy array-like) array of the correlation values.
            With the default (y=None) the shape is NxN;
            if y is provided the shape is NxP.
    """
    # Mask NaN entries so downstream calculations can skip them.
    x_masked = numpy.ma.array(x, mask=numpy.isnan(x))
    y_masked = (
        x_masked if y is None
        else numpy.ma.array(y, mask=numpy.isnan(y))
    )

    r = fast_cov.nan_fast_cov(x_masked, y_masked, destination=destination)

    # Variance of the columns of each matrix, additionally applying the
    # other matrix's mask.
    _, _, x_variance = calculate_moments_with_additional_mask(
        x_masked, y_masked.mask)
    x_std = numpy.sqrt(x_variance)

    _, _, y_variance = calculate_moments_with_additional_mask(
        y_masked, x_masked.mask)
    y_std = numpy.sqrt(y_variance)

    # Normalise the covariance in place: first by the (transposed) standard
    # deviations from x, then by those from y. Writing into r itself avoids
    # allocating a second array for the result.
    for scale in (x_std.T, y_std):
        numpy.divide(r, scale, out=r)

    return r