def test_allowed_missing_doesnt_double_count(self):
    """A row that is missing in both the dependent and the independent
    data must count as ONE missing observation for ``allowed_missing``,
    not two.
    """
    rng = np.random.RandomState(42)
    betas = np.array([-0.5, 0.0, 0.5, 1.0, 1.5])
    xs = as_column(np.linspace(-5.0, 5.0, 30))
    jitter = as_column(rng.uniform(-0.1, 0.1, 30))
    ys = 1.0 + betas * xs + jitter

    # Each column of the dependent data gets exactly three nans.
    y_nan_mask = np.array(
        [
            [0, 1, 1, 1, 0],
            [0, 0, 1, 1, 1],
            [1, 0, 0, 1, 1],
            [1, 1, 0, 0, 1],
            [1, 1, 1, 0, 0],
        ],
        dtype=bool,
    )
    # The independent data gets two nans of its own.
    x_nan_mask = np.array([[0], [0], [1], [1], [0]], dtype=bool)
    ys[10:15][y_nan_mask] = np.nan
    xs[10:15][x_nan_mask] = np.nan

    # Expected nan pattern for each allowed_missing threshold:
    #  2 -> every column is nan (all have >= 3 dependent nans).
    #  3 -> first/last columns survive: their dependent nans overlap the
    #       independent nans, so only 3 distinct rows are missing.
    #  4 -> everything but the middle column survives; the middle
    #       column's nans don't overlap the independent nans at all,
    #       giving 5 distinct missing rows.
    #  5 -> every column survives.
    expected_nans_by_allowed = {
        2: [True, True, True, True, True],
        3: [False, True, True, True, False],
        4: [False, False, True, False, False],
        5: [False, False, False, False, False],
    }
    for allowed, expected_nans in expected_nans_by_allowed.items():
        result = vectorized_beta(ys, xs, allowed_missing=allowed)
        assert_equal(np.isnan(result), np.array(expected_nans))
def test_allowed_missing_doesnt_double_count(self):
    """Verify that ``allowed_missing`` counts a row missing from both
    the dependent and independent arrays as a single missing
    observation rather than two.
    """
    state = np.random.RandomState(42)
    target_betas = np.array([-0.5, 0.0, 0.5, 1.0, 1.5])
    indep = as_column(np.linspace(-5.0, 5.0, 30))
    eps = as_column(state.uniform(-0.1, 0.1, 30))
    deps = 1.0 + target_betas * indep + eps

    # Three nans per column of the dependent data.
    dep_mask = np.array(
        [
            [0, 1, 1, 1, 0],
            [0, 0, 1, 1, 1],
            [1, 0, 0, 1, 1],
            [1, 1, 0, 0, 1],
            [1, 1, 1, 0, 0],
        ],
        dtype=bool,
    )
    # Two more nans in the independent data.
    indep_mask = np.array([[0], [0], [1], [1], [0]], dtype=bool)
    deps[10:15][dep_mask] = np.nan
    indep[10:15][indep_mask] = np.nan

    cases = [
        # allowed=2: every column has at least 3 dependent nans.
        (2, [True, True, True, True, True]),
        # allowed=3: the first and last columns' nans coincide with the
        # independent nans, so they only have 3 distinct missing rows.
        (3, [False, True, True, True, False]),
        # allowed=4: the middle column's nans have no overlap with the
        # independent nans, for 5 distinct missing rows.
        (4, [False, False, True, False, False]),
        # allowed=5: everything produces a value.
        (5, [False, False, False, False, False]),
    ]
    for allowed_missing, nan_pattern in cases:
        out = vectorized_beta(deps, indep, allowed_missing=allowed_missing)
        assert_equal(np.isnan(out), np.array(nan_pattern))
def test_produce_nans_when_too_much_missing_data(self, nan_offset):
    """vectorized_beta should yield nan for any column whose count of
    missing observations exceeds ``allowed_missing``, and a value close
    to the true beta otherwise.

    Parameters
    ----------
    nan_offset : int
        Row offset at which the nan block is written. Parameterized so
        the position of the missing data within the window is shown not
        to matter.
    """
    rand = np.random.RandomState(42)
    true_betas = np.array([-0.5, 0.0, 0.5, 1.0, 1.5])
    independent = as_column(np.linspace(-5.0, 5.0, 30))
    noise = as_column(rand.uniform(-0.1, 0.1, 30))
    dependents = 1.0 + true_betas * independent + noise

    # Write nans in a triangular pattern into the middle of the
    # dependent array: column i gets (5 - i) nans.
    nan_grid = np.array(
        [
            [1, 0, 0, 0, 0],
            [1, 1, 0, 0, 0],
            [1, 1, 1, 0, 0],
            [1, 1, 1, 1, 0],
            [1, 1, 1, 1, 1],
        ],
        dtype=bool,
    )
    num_nans = nan_grid.sum(axis=0)

    # Move the grid around in the parameterized tests. The positions
    # shouldn't matter.
    dependents[10 + nan_offset:15 + nan_offset][nan_grid] = np.nan

    for allowed_missing in range(7):
        results = vectorized_beta(dependents, independent, allowed_missing)
        # Use the enumerated true beta directly instead of re-indexing
        # true_betas[i] (the original left the loop variable unused).
        for i, true_beta in enumerate(true_betas):
            result = results[i]
            if num_nans[i] > allowed_missing:
                # Too many missing rows: the beta must come up nan.
                self.assertTrue(np.isnan(result))
            else:
                # Enough data: the recovered beta is near the true one.
                self.assertTrue(np.abs(result - true_beta) < 0.01)
def test_produce_nans_when_too_much_missing_data(self, nan_offset):
    """Columns with more missing observations than ``allowed_missing``
    must produce nan; the rest must recover a beta near the true value.

    Parameters
    ----------
    nan_offset : int
        Row offset for the block of injected nans; parameterized to
        show the location of the missing data doesn't matter.
    """
    rand = np.random.RandomState(42)
    true_betas = np.array([-0.5, 0.0, 0.5, 1.0, 1.5])
    independent = as_column(np.linspace(-5.0, 5.0, 30))
    noise = as_column(rand.uniform(-0.1, 0.1, 30))
    dependents = 1.0 + true_betas * independent + noise

    # Triangular nan pattern written into the middle of the dependent
    # array; per-column nan counts are therefore [5, 4, 3, 2, 1].
    nan_grid = np.array(
        [
            [1, 0, 0, 0, 0],
            [1, 1, 0, 0, 0],
            [1, 1, 1, 0, 0],
            [1, 1, 1, 1, 0],
            [1, 1, 1, 1, 1],
        ],
        dtype=bool,
    )
    num_nans = nan_grid.sum(axis=0)

    # Move the grid around in the parameterized tests. The positions
    # shouldn't matter.
    dependents[10 + nan_offset:15 + nan_offset][nan_grid] = np.nan

    for allowed_missing in range(7):
        results = vectorized_beta(dependents, independent, allowed_missing)
        # The enumerated value IS the expected beta; the original bound
        # it to an unused `expected` and re-read true_betas[i].
        for i, true_beta in enumerate(true_betas):
            result = results[i]
            if num_nans[i] > allowed_missing:
                # Over the missing-data budget: expect nan.
                self.assertTrue(np.isnan(result))
            else:
                # Within budget: expect the beta to be recovered.
                self.assertTrue(np.abs(result - true_beta) < 0.01)
def compute(self, today, assets, out, closes, sectors):
    """Write each asset's beta to its sector's mean return into ``out``.

    Assets whose latest sector code does not equal ``self.sector_code``
    get 0. Removed the commented-out nan-debugging code and translated
    the comments to English; logic is unchanged.

    Parameters
    ----------
    today, assets : pipeline-provided date and asset labels (unused here).
    out : 1-d output buffer, one slot per asset.
    closes : 2-d array of closing prices, shape (window, n_assets).
    sectors : 2-d array of sector codes; only the latest row is used.
    """
    res = np.zeros(closes.shape[1])
    # Period-over-period percentage change of each close series.
    change_ratio = np.diff(closes, axis=0) / closes[:-1]
    latest_sectors = sectors[-1]
    stock_in_sector = latest_sectors == self.sector_code
    change_ratio_in_sector = change_ratio[:, stock_in_sector]
    # Sector return: nan-aware cross-sectional mean of the members'
    # returns, reshaped to a column as vectorized_beta expects.
    sector_returns = nanmean(change_ratio_in_sector, axis=1).reshape(-1, 1)
    # Tolerate up to 25% of the window being missing per asset.
    allowed_missing = int(self.window_length * 0.25)
    # Regress each member's returns on the sector's mean return to get
    # its beta, i.e. its exposure to the sector.
    beta = vectorized_beta(
        dependents=change_ratio_in_sector,
        independent=sector_returns,
        allowed_missing=allowed_missing,
    )
    # Fill in betas for sector members; everything else stays 0.
    res[stock_in_sector] = beta
    out[:] = res
def compare_with_empyrical(self, dependents, independent):
    """Assert that vectorized_beta matches empyrical's column-by-column
    beta (to 7 decimal places) and return the vectorized result.
    """
    # Large enough to effectively disable the missing-data cutoff.
    no_missing_limit = 1000000  # close enough
    result = vectorized_beta(
        dependents,
        independent,
        allowed_missing=no_missing_limit,
    )
    per_column = [
        empyrical_beta(dependents[:, col].ravel(), independent.ravel())
        for col in range(dependents.shape[1])
    ]
    assert_equal(result, np.array(per_column), array_decimal=7)
    return result
def compare_with_empyrical(self, dependents, independent):
    """Cross-check vectorized_beta against empyrical.

    Computes beta with an effectively unlimited missing-data allowance,
    compares it column-wise against ``empyrical_beta`` to 7 decimals,
    and returns the vectorized result.
    """
    unlimited = 1000000  # close enough to infinity for this purpose
    actual = vectorized_beta(dependents, independent, allowed_missing=unlimited)
    n_columns = dependents.shape[1]
    x = independent.ravel()
    reference = np.array(
        [empyrical_beta(dependents[:, i].ravel(), x) for i in range(n_columns)]
    )
    assert_equal(actual, reference, array_decimal=7)
    return actual