def compute(self, today, asset_ids, out, close):
    """
    Fill ``out`` with a combined RSI / Bollinger-band score per column.

    ``close`` is indexed as ``close[row, column]``; one score is written for
    each entry of ``out`` using the matching column of ``close``.

    The score starts at zero and is adjusted twice:

    * RSI term: ``+RSI_LOWER / rsi`` when ``rsi < RSI_LOWER``, or
      ``-rsi / RSI_UPPER`` when ``rsi > RSI_UPPER``.
    * Band term: ``+lower / last`` when the last row of ``close`` is below
      the lower band, or ``-last / upper`` when it is above the upper band.

    When the module-level ``TREND_FOLLOW`` flag is truthy the sign of every
    score is flipped.
    """
    # RSI inputs: mean positive change and absolute mean negative change.
    price_changes = np.diff(close, axis=0)
    avg_gain = nanmean(np.clip(price_changes, 0, np.inf), axis=0)
    avg_loss = abs(nanmean(np.clip(price_changes, -np.inf, 0), axis=0))

    rsi = np.zeros(len(out))
    evaluate(
        "100 - (100 / (1 + (ups / downs)))",
        local_dict={'ups': avg_gain, 'downs': avg_loss},
        global_dict={},
        out=rsi,
    )

    # Bands sit BBSTD standard deviations either side of the mean price.
    band_width = BBSTD * nanstd(close, axis=0)
    band_center = nanmean(close, axis=0)
    upper_band = band_center + band_width
    lower_band = band_center - band_width
    latest = close[-1]

    out[:] = 0

    # RSI contribution. NaN RSI values fail both comparisons and so
    # contribute nothing.
    oversold = rsi < RSI_LOWER
    overbought = rsi > RSI_UPPER
    out[oversold] += RSI_LOWER / rsi[oversold]
    out[overbought] -= rsi[overbought] / RSI_UPPER

    # Band contribution, based on the most recent row of ``close``.
    below_band = latest < lower_band
    above_band = latest > upper_band
    out[below_band] += lower_band[below_band] / latest[below_band]
    out[above_band] -= latest[above_band] / upper_band[above_band]

    if TREND_FOLLOW:
        out[:] *= -1
def fast_corr(m0, m1):
    """
    Compute column-wise Pearson correlations between ``m0`` and ``m1``.

    Parameters
    ----------
    m0 : np.array[N, M]
        First set of observations, one series per column.
    m1 : np.array
        Second set of observations. It is combined with ``m0`` through
        ``np.where``, so it must be broadcastable against ``m0``.

    Returns
    -------
    correlations : np.array[M]
        Correlation of each column pair, computed over the rows in which
        both inputs are non-NaN. Columns with more than ``int(0.25 * N)``
        unusable rows are set to NaN.
    """
    nan = np.nan
    isnan = np.isnan
    N, M = m0.shape
    out = np.full(M, nan)
    allowed_missing_count = int(0.25 * N)

    # shape: (N, M). NaN wherever either input is missing.
    independent = np.where(isnan(m0), nan, m1)
    # Mirror the combined missing pattern back onto m0. The covariance below
    # only sees rows present in both inputs, so both standard deviations must
    # be taken over exactly the same rows; otherwise the ratio is not a
    # correlation and can fall outside [-1, 1].
    dependent = np.where(isnan(independent), nan, m0)

    # Only one side needs centering: mean(X_res * Y) == cov(X, Y) because
    # mean(X_res) is zero over the paired rows.
    ind_residual = independent - nanmean(independent, axis=0)  # shape: (N, M)
    covariances = nanmean(ind_residual * dependent, axis=0)    # shape: (M,)

    # corr(x, y) = cov(x, y) / std(x) / std(y)
    np.divide(covariances, nanstd(dependent, axis=0), out=out)
    np.divide(out, nanstd(independent, axis=0), out=out)

    # Blank out columns with too many unusable rows.
    nanlocs = isnan(independent).sum(axis=0) > allowed_missing_count
    out[nanlocs] = nan
    return out
def compute(self, today, assets, out, closes):
    """
    Evaluate ``100 - (100 / (1 + (ups / downs)))`` per column into ``out``.

    ``ups`` is the NaN-ignoring mean of the positive row-to-row changes in
    ``closes`` and ``downs`` is the absolute NaN-ignoring mean of the
    negative changes, both taken along axis 0.

    Returns whatever ``evaluate`` returns for the expression.
    """
    price_changes = diff(closes, axis=0)
    # Clipping keeps one sign of the changes and zeroes the other.
    mean_gain = nanmean(clip(price_changes, 0, inf), axis=0)
    mean_loss = abs(nanmean(clip(price_changes, -inf, 0), axis=0))
    operands = {"ups": mean_gain, "downs": mean_loss}
    return evaluate(
        "100 - (100 / (1 + (ups / downs)))",
        local_dict=operands,
        global_dict={},
        out=out,
    )
def compute(self, today, assets, out, closes):
    """
    Evaluate ``100 - (100 / (1 + (ups / downs)))`` per column into ``out``.

    ``ups`` is the NaN-ignoring mean of the positive row-to-row changes in
    ``closes`` and ``downs`` is the absolute NaN-ignoring mean of the
    negative changes, both taken along axis 0.

    Returns whatever ``evaluate`` returns for the expression.
    """
    deltas = diff(closes, axis=0)
    # Positive moves only (negatives clipped to zero), averaged per column.
    gains = nanmean(clip(deltas, 0, inf), axis=0)
    # Negative moves only (positives clipped to zero), as a magnitude.
    losses = abs(nanmean(clip(deltas, -inf, 0), axis=0))
    return evaluate(
        "100 - (100 / (1 + (ups / downs)))",
        local_dict={'ups': gains, 'downs': losses},
        global_dict={},
        out=out,
    )
def compute(self, today, assets, out, closes, sectors):
    """
    Write each column's beta against its sector's mean return into ``out``.

    Columns whose latest sector code (last row of ``sectors``) equals
    ``self.sector_code`` receive the slope returned by ``vectorized_beta``
    for their returns against the sector's average return. All other
    columns receive 0.
    """
    # Row-to-row fractional change of the closing prices.
    returns = np.diff(closes, axis=0) / closes[:-1]

    in_sector = sectors[-1] == self.sector_code
    member_returns = returns[:, in_sector]

    # Sector return: NaN-ignoring mean across the sector's members, kept as
    # a column vector.
    sector_returns = nanmean(member_returns, axis=1).reshape(-1, 1)

    # Regress each member's returns on the sector's average return; the
    # resulting beta is that stock's exposure to the sector.
    betas = vectorized_beta(
        dependents=member_returns,
        independent=sector_returns,
        allowed_missing=int(self.window_length * 0.25),
    )

    # Members get their beta; every other column stays at zero.
    exposures = np.zeros(closes.shape[1])
    exposures[in_sector] = betas
    out[:] = exposures
def get_simple_transform(self, asset, transform_name, dt, data_frequency,
                         bars=None):
    """
    Compute a simple transform of ``asset``'s history as of ``dt``.

    Parameters
    ----------
    asset
        Asset whose history is read through ``self.get_history_window``.
    transform_name : str
        One of "returns", "mavg", "stddev" or "vwap". Any other name yields
        ``None`` (after the price window has been fetched).
    dt
        Passed through to the history lookups.
    data_frequency : str
        "minute" selects "1m" bars, with the bar count derived from
        ``self._get_minute_count_for_transform``. Anything else selects
        "1d" bars with ``bars`` used as-is.
    bars : int, optional
        Window length. Required for every transform except "returns".

    Raises
    ------
    ValueError
        If ``bars`` is None for a transform other than "returns".
    """
    def history(bar_count, frequency, field):
        # Single-asset, forward-filled window for the requested field.
        window = self.get_history_window(
            [asset], dt, bar_count, frequency, field, ffill=True
        )
        return window[asset]

    if transform_name == "returns":
        # Returns always span the last 2 daily bars, regardless of the
        # simulation's data frequency.
        two_bars = history(2, "1d", "price")
        first, last = two_bars.iloc[0], two_bars.iloc[-1]
        return (last - first) / first

    if bars is None:
        raise ValueError("bars cannot be None!")

    if data_frequency == "minute":
        frequency = "1m"
        bar_count = int(self._get_minute_count_for_transform(dt, bars))
    else:
        frequency, bar_count = "1d", bars

    prices = history(bar_count, frequency, "price")

    if transform_name == "mavg":
        return nanmean(prices)
    if transform_name == "stddev":
        return nanstd(prices, ddof=1)
    if transform_name == "vwap":
        volumes = history(bar_count, frequency, "volume")
        total_volume = nansum(volumes)
        try:
            return nansum(prices * volumes) / total_volume
        except ZeroDivisionError:
            return np.nan
def fast_cov(m0, m1):
    """
    Compute column-wise covariances between ``m0`` and ``m1``.

    Parameters
    ----------
    m0 : np.array[N, M]
        First set of observations, one series per column.
    m1 : np.array
        Second set of observations, broadcastable against ``m0``.

    Returns
    -------
    covariances : np.array[M]
        ``mean((m1 - mean(m1)) * m0)`` per column, ignoring rows where
        either input is NaN. Columns with more than ``int(0.25 * N)`` such
        rows are set to NaN.
    """
    row_count, _ = m0.shape
    max_missing = int(0.25 * row_count)

    # shape: (N, M). m1 with m0's missing pattern imposed on top of its own.
    paired = np.where(np.isnan(m0), np.nan, m1)

    # Centering one side is enough, since the centered side has zero mean.
    centered = paired - nanmean(paired, axis=0)  # shape: (N, M)
    result = nanmean(centered * m0, axis=0)      # shape: (M,)

    too_sparse = np.isnan(paired).sum(axis=0) > max_missing
    result[too_sparse] = np.nan
    return result
def zscore(self, mask=NotSpecified, groupby=NotSpecified):
    """
    Construct a Factor that Z-Scores each day's results.

    The Z-Score of a row is defined as::

        (row - row.mean()) / row.stddev()

    If ``mask`` is supplied, values where ``mask`` returns False are ignored
    when computing row means and standard deviations, and NaN is output
    anywhere the mask is False.

    If ``groupby`` is supplied, each row is partitioned by the values
    produced by ``groupby``, each partition is z-scored separately, and the
    sub-results are stitched back together.

    Parameters
    ----------
    mask : zipline.pipeline.Filter, optional
        A Filter defining values to ignore when Z-Scoring.
    groupby : zipline.pipeline.Classifier, optional
        A classifier defining partitions over which to compute Z-Scores.

    Returns
    -------
    zscored : zipline.pipeline.Factor
        A Factor that z-scores the output of self.

    Notes
    -----
    Mean and standard deviation are sensitive to the magnitudes of
    outliers. When working with a factor that can potentially produce large
    outliers, it is often useful to use the ``mask`` parameter to discard
    values at the extremes of the distribution::

        >>> base = MyFactor(...)
        >>> normalized = base.zscore(mask=base.percentile_between(1, 99))

    ``zscore()`` is only supported on Factors of dtype float64.

    Example
    -------
    See :meth:`~zipline.pipeline.factors.Factor.demean` for an in-depth
    example of the semantics for ``mask`` and ``groupby``.

    See Also
    --------
    :meth:`pandas.DataFrame.groupby`
    """
    def zscore_row(row):
        # NaN-aware statistics, so missing entries do not poison the row.
        return (row - nanmean(row)) / nanstd(row)

    return GroupedRowTransform(
        transform=zscore_row,
        factor=self,
        mask=mask,
        groupby=groupby,
    )
def get_simple_transform(self, asset, transform_name, dt, data_frequency,
                         bars=None):
    """
    Compute a simple transform of ``asset``'s history as of ``dt``.

    Parameters
    ----------
    asset
        Asset whose history is read through ``self.get_history_window``.
    transform_name : str
        One of "returns", "mavg", "stddev" or "vwap". Any other name yields
        ``None`` (after the price window has been fetched).
    dt
        Passed through to the history lookups.
    data_frequency : str
        "minute" selects "1m" bars, with the bar count derived from
        ``self._get_minute_count_for_transform``. Anything else selects
        "1d" bars with ``bars`` used as-is.
    bars : int, optional
        Window length. Required for every transform except "returns".

    Raises
    ------
    ValueError
        If ``bars`` is None for a transform other than "returns".
    """
    def fetch(field, count, freq):
        # Forward-filled window of ``field`` for this one asset.
        frame = self.get_history_window(
            [asset], dt, count, freq, field, ffill=True
        )
        return frame[asset]

    if transform_name == "returns":
        # Returns are always calculated over the last 2 days, regardless
        # of the simulation's data frequency.
        daily_prices = fetch("price", 2, "1d")
        start_price = daily_prices.iloc[0]
        end_price = daily_prices.iloc[-1]
        return (end_price - start_price) / start_price

    if bars is None:
        raise ValueError("bars cannot be None!")

    minute_mode = data_frequency == "minute"
    freq_label = "1m" if minute_mode else "1d"
    if minute_mode:
        window_size = int(self._get_minute_count_for_transform(dt, bars))
    else:
        window_size = bars

    price_window = fetch("price", window_size, freq_label)

    if transform_name == "mavg":
        return nanmean(price_window)
    if transform_name == "stddev":
        return nanstd(price_window, ddof=1)
    if transform_name == "vwap":
        volume_window = fetch("volume", window_size, freq_label)
        volume_total = nansum(volume_window)
        try:
            weighted = nansum(price_window * volume_window) / volume_total
        except ZeroDivisionError:
            weighted = np.nan
        return weighted
def compute(self, today, assets, out, close, volume):
    """
    Write the NaN-ignoring mean of ``close * volume`` along axis 0 to ``out``.
    """
    traded_value = close * volume
    out[:] = nanmean(traded_value, axis=0)
def compute(self, today, assets, out, data):
    """
    Write the NaN-ignoring mean of ``data`` along axis 0 into ``out``.
    """
    column_means = nanmean(data, axis=0)
    out[:] = column_means
def demean(self, mask=NotSpecified, groupby=NotSpecified):
    """
    Construct a Factor that computes ``self`` and subtracts the mean from
    each row of the result.

    If ``mask`` is supplied, values where ``mask`` returns False are ignored
    when computing row means, and NaN is output anywhere the mask is False.

    If ``groupby`` is supplied, each row is partitioned by the values
    produced by ``groupby``, each partition is de-meaned separately, and the
    sub-results are stitched back together.

    Parameters
    ----------
    mask : zipline.pipeline.Filter, optional
        A Filter defining values to ignore when computing means.
    groupby : zipline.pipeline.Classifier, optional
        A classifier defining partitions over which to compute means.

    Example
    -------
    Let ``f`` be a Factor which would produce the following output::

                     AAPL   MSFT    MCD     BK
        2017-03-13    1.0    2.0    3.0    4.0
        2017-03-14    1.5    2.5    3.5    1.0
        2017-03-15    2.0    3.0    4.0    1.5
        2017-03-16    2.5    3.5    1.0    2.0

    Let ``c`` be a Classifier producing the following output::

                     AAPL   MSFT    MCD     BK
        2017-03-13      1      1      2      2
        2017-03-14      1      1      2      2
        2017-03-15      1      1      2      2
        2017-03-16      1      1      2      2

    Let ``m`` be a Filter producing the following output::

                     AAPL   MSFT    MCD     BK
        2017-03-13  False   True   True   True
        2017-03-14   True  False   True   True
        2017-03-15   True   True  False   True
        2017-03-16   True   True   True  False

    Then ``f.demean()`` will subtract the mean from each row produced by
    ``f``.

    ::

                     AAPL   MSFT    MCD     BK
        2017-03-13 -1.500 -0.500  0.500  1.500
        2017-03-14 -0.625  0.375  1.375 -1.125
        2017-03-15 -0.625  0.375  1.375 -1.125
        2017-03-16  0.250  1.250 -1.250 -0.250

    ``f.demean(mask=m)`` will subtract the mean from each row, but means
    will be calculated ignoring values on the diagonal, and NaNs will be
    written to the diagonal in the output. Diagonal values are ignored
    because they are the locations where the mask ``m`` produced False.

    ::

                     AAPL   MSFT    MCD     BK
        2017-03-13    NaN -1.000  0.000  1.000
        2017-03-14 -0.500    NaN  1.500 -1.000
        2017-03-15 -0.166  0.833    NaN -0.666
        2017-03-16  0.166  1.166 -1.333    NaN

    ``f.demean(groupby=c)`` will subtract the group-mean of AAPL/MSFT and
    MCD/BK from their respective entries. AAPL/MSFT are grouped together
    because both assets always produce 1 in the output of the classifier
    ``c``. Similarly, MCD/BK are grouped together because they always
    produce 2.

    ::

                     AAPL   MSFT    MCD     BK
        2017-03-13 -0.500  0.500 -0.500  0.500
        2017-03-14 -0.500  0.500  1.250 -1.250
        2017-03-15 -0.500  0.500  1.250 -1.250
        2017-03-16 -0.500  0.500 -0.500  0.500

    ``f.demean(mask=m, groupby=c)`` will also subtract the group-mean of
    AAPL/MSFT and MCD/BK, but means will be calculated ignoring values on
    the diagonal, and NaNs will be written to the diagonal in the output.

    ::

                     AAPL   MSFT    MCD     BK
        2017-03-13    NaN  0.000 -0.500  0.500
        2017-03-14  0.000    NaN  1.250 -1.250
        2017-03-15 -0.500  0.500    NaN  0.000
        2017-03-16 -0.500  0.500  0.000    NaN

    Notes
    -----
    Mean is sensitive to the magnitudes of outliers. When working with a
    factor that can potentially produce large outliers, it is often useful
    to use the ``mask`` parameter to discard values at the extremes of the
    distribution::

        >>> base = MyFactor(...)
        >>> normalized = base.demean(mask=base.percentile_between(1, 99))

    ``demean()`` is only supported on Factors of dtype float64.

    See Also
    --------
    :meth:`pandas.DataFrame.groupby`
    """
    def demean_row(row):
        # NaN-aware mean, so missing entries do not poison the row.
        return row - nanmean(row)

    return GroupedRowTransform(
        transform=demean_row,
        factor=self,
        mask=mask,
        groupby=groupby,
    )
def vectorized_beta(dependents, independent, allowed_missing, out=None):
    """
    Compute slopes of linear regressions between columns of ``dependents``
    and ``independent``.

    Parameters
    ----------
    dependents : np.array[N, M]
        Array with columns of data to be regressed against ``independent``.
    independent : np.array[N, 1]
        Independent variable of the regression.
    allowed_missing : int
        Number of allowed missing (NaN) observations per column. A column
        outputs NaN as its coefficient when more than this many of its rows
        are NaN in either ``dependents`` or ``independent``.
    out : np.array[M], optional
        Array to write the slopes into. A new array is allocated when
        omitted.

    Returns
    -------
    slopes : np.array[M]
        Linear regression coefficients for each column of ``dependents``.
        This is ``out`` itself when ``out`` was supplied.
    """
    _, column_count = dependents.shape
    if out is None:
        out = np.full(column_count, np.nan)

    # Broadcast the independent variable to one copy per dependent column,
    # blanking each copy wherever its dependent column is missing so both
    # sides share one missing-value pattern.
    #
    # PERF_TODO: The space blowup could probably be avoided by doing this
    # in Cython.
    # shape: (N, M)
    x = np.where(np.isnan(dependents), np.nan, independent)

    # beta = Cov(X, Y) / Var(X).
    #
    # Covariance is normally mean((X - mean(X)) * (Y - mean(Y))), but only
    # one factor has to be centered. With X_res = X - mean(X):
    #
    #   mean(X_res * (Y - mean(Y)))
    #       = mean(X_res * Y) - mean(X_res) * mean(Y)
    #       = mean(X_res * Y) - 0 * mean(Y)
    #       = mean(X_res * Y)
    #
    # because mean(X - mean(X)) is always zero. X is the side we center
    # because the centered X is needed again for its variance.
    # shape: (N, M)
    x_centered = x - nanmean(x, axis=0)

    # shape: (M,)
    cov_xy = nanmean(x_centered * dependents, axis=0)

    # Variances differ per column because each column may have dropped a
    # different subset of rows due to missing dependent data.
    # shape: (M,)
    var_x = nanmean(x_centered ** 2, axis=0)

    np.divide(cov_xy, var_x, out=out)

    # Columns with more than the allowed number of missing rows get NaN.
    missing_per_column = np.isnan(x).sum(axis=0)
    out[missing_per_column > allowed_missing] = np.nan
    return out
def demean(row):
    """Return ``row`` with its NaN-ignoring mean subtracted."""
    row_mean = nanmean(row)
    return row - row_mean
class FactorTestCase(BasePipelineTestCase): def init_instance_fixtures(self): super(FactorTestCase, self).init_instance_fixtures() self.f = F() def test_bad_input(self): with self.assertRaises(UnknownRankMethod): self.f.rank("not a real rank method") @parameter_space(method_name=['isnan', 'notnan', 'isfinite']) def test_float64_only_ops(self, method_name): class NotFloat(Factor): dtype = datetime64ns_dtype inputs = () window_length = 0 nf = NotFloat() meth = getattr(nf, method_name) with self.assertRaises(TypeError): meth() @parameter_space(custom_missing_value=[-1, 0]) def test_isnull_int_dtype(self, custom_missing_value): class CustomMissingValue(Factor): dtype = int64_dtype window_length = 0 missing_value = custom_missing_value inputs = () factor = CustomMissingValue() data = arange(25).reshape(5, 5) data[eye(5, dtype=bool)] = custom_missing_value graph = TermGraph({ 'isnull': factor.isnull(), 'notnull': factor.notnull(), }) results = self.run_graph( graph, initial_workspace={factor: data}, mask=self.build_mask(ones((5, 5))), ) check_arrays(results['isnull'], eye(5, dtype=bool)) check_arrays(results['notnull'], ~eye(5, dtype=bool)) def test_isnull_datetime_dtype(self): class DatetimeFactor(Factor): dtype = datetime64ns_dtype window_length = 0 inputs = () factor = DatetimeFactor() data = arange(25).reshape(5, 5).astype('datetime64[ns]') data[eye(5, dtype=bool)] = NaTns graph = TermGraph({ 'isnull': factor.isnull(), 'notnull': factor.notnull(), }) results = self.run_graph( graph, initial_workspace={factor: data}, mask=self.build_mask(ones((5, 5))), ) check_arrays(results['isnull'], eye(5, dtype=bool)) check_arrays(results['notnull'], ~eye(5, dtype=bool)) @for_each_factor_dtype def test_rank_ascending(self, name, factor_dtype): f = F(dtype=factor_dtype) # Generated with: # data = arange(25).reshape(5, 5).transpose() % 4 data = array([[0, 1, 2, 3, 0], [1, 2, 3, 0, 1], [2, 3, 0, 1, 2], [3, 0, 1, 2, 3], [0, 1, 2, 3, 0]], dtype=factor_dtype) expected_ranks = { 
'ordinal': array([[1., 3., 4., 5., 2.], [2., 4., 5., 1., 3.], [3., 5., 1., 2., 4.], [4., 1., 2., 3., 5.], [1., 3., 4., 5., 2.]]), 'average': array([[1.5, 3., 4., 5., 1.5], [2.5, 4., 5., 1., 2.5], [3.5, 5., 1., 2., 3.5], [4.5, 1., 2., 3., 4.5], [1.5, 3., 4., 5., 1.5]]), 'min': array([[1., 3., 4., 5., 1.], [2., 4., 5., 1., 2.], [3., 5., 1., 2., 3.], [4., 1., 2., 3., 4.], [1., 3., 4., 5., 1.]]), 'max': array([[2., 3., 4., 5., 2.], [3., 4., 5., 1., 3.], [4., 5., 1., 2., 4.], [5., 1., 2., 3., 5.], [2., 3., 4., 5., 2.]]), 'dense': array([[1., 2., 3., 4., 1.], [2., 3., 4., 1., 2.], [3., 4., 1., 2., 3.], [4., 1., 2., 3., 4.], [1., 2., 3., 4., 1.]]), } def check(terms): graph = TermGraph(terms) results = self.run_graph( graph, initial_workspace={f: data}, mask=self.build_mask(ones((5, 5))), ) for method in terms: check_arrays(results[method], expected_ranks[method]) check({meth: f.rank(method=meth) for meth in expected_ranks}) check({ meth: f.rank(method=meth, ascending=True) for meth in expected_ranks }) # Not passing a method should default to ordinal. 
check({'ordinal': f.rank()}) check({'ordinal': f.rank(ascending=True)}) @for_each_factor_dtype def test_rank_descending(self, name, factor_dtype): f = F(dtype=factor_dtype) # Generated with: # data = arange(25).reshape(5, 5).transpose() % 4 data = array([[0, 1, 2, 3, 0], [1, 2, 3, 0, 1], [2, 3, 0, 1, 2], [3, 0, 1, 2, 3], [0, 1, 2, 3, 0]], dtype=factor_dtype) expected_ranks = { 'ordinal': array([[4., 3., 2., 1., 5.], [3., 2., 1., 5., 4.], [2., 1., 5., 4., 3.], [1., 5., 4., 3., 2.], [4., 3., 2., 1., 5.]]), 'average': array([[4.5, 3., 2., 1., 4.5], [3.5, 2., 1., 5., 3.5], [2.5, 1., 5., 4., 2.5], [1.5, 5., 4., 3., 1.5], [4.5, 3., 2., 1., 4.5]]), 'min': array([[4., 3., 2., 1., 4.], [3., 2., 1., 5., 3.], [2., 1., 5., 4., 2.], [1., 5., 4., 3., 1.], [4., 3., 2., 1., 4.]]), 'max': array([[5., 3., 2., 1., 5.], [4., 2., 1., 5., 4.], [3., 1., 5., 4., 3.], [2., 5., 4., 3., 2.], [5., 3., 2., 1., 5.]]), 'dense': array([[4., 3., 2., 1., 4.], [3., 2., 1., 4., 3.], [2., 1., 4., 3., 2.], [1., 4., 3., 2., 1.], [4., 3., 2., 1., 4.]]), } def check(terms): graph = TermGraph(terms) results = self.run_graph( graph, initial_workspace={f: data}, mask=self.build_mask(ones((5, 5))), ) for method in terms: check_arrays(results[method], expected_ranks[method]) check({ meth: f.rank(method=meth, ascending=False) for meth in expected_ranks }) # Not passing a method should default to ordinal. 
check({'ordinal': f.rank(ascending=False)}) @for_each_factor_dtype def test_rank_after_mask(self, name, factor_dtype): f = F(dtype=factor_dtype) # data = arange(25).reshape(5, 5).transpose() % 4 data = array([[0, 1, 2, 3, 0], [1, 2, 3, 0, 1], [2, 3, 0, 1, 2], [3, 0, 1, 2, 3], [0, 1, 2, 3, 0]], dtype=factor_dtype) mask_data = ~eye(5, dtype=bool) initial_workspace = {f: data, Mask(): mask_data} graph = TermGraph({ "ascending_nomask": f.rank(ascending=True), "ascending_mask": f.rank(ascending=True, mask=Mask()), "descending_nomask": f.rank(ascending=False), "descending_mask": f.rank(ascending=False, mask=Mask()), }) expected = { "ascending_nomask": array([[1., 3., 4., 5., 2.], [2., 4., 5., 1., 3.], [3., 5., 1., 2., 4.], [4., 1., 2., 3., 5.], [1., 3., 4., 5., 2.]]), "descending_nomask": array([[4., 3., 2., 1., 5.], [3., 2., 1., 5., 4.], [2., 1., 5., 4., 3.], [1., 5., 4., 3., 2.], [4., 3., 2., 1., 5.]]), # Diagonal should be all nans, and anything whose rank was less # than the diagonal in the unmasked calc should go down by 1. 
"ascending_mask": array([[nan, 2., 3., 4., 1.], [2., nan, 4., 1., 3.], [2., 4., nan, 1., 3.], [3., 1., 2., nan, 4.], [1., 2., 3., 4., nan]]), "descending_mask": array([[nan, 3., 2., 1., 4.], [2., nan, 1., 4., 3.], [2., 1., nan, 4., 3.], [1., 4., 3., nan, 2.], [4., 3., 2., 1., nan]]), } results = self.run_graph( graph, initial_workspace, mask=self.build_mask(ones((5, 5))), ) for method in results: check_arrays(expected[method], results[method]) @for_each_factor_dtype def test_grouped_rank_ascending(self, name, factor_dtype=float64_dtype): f = F(dtype=factor_dtype) c = C() str_c = C(dtype=categorical_dtype, missing_value=None) # Generated with: # data = arange(25).reshape(5, 5).transpose() % 4 data = array([[0, 1, 2, 3, 0], [1, 2, 3, 0, 1], [2, 3, 0, 1, 2], [3, 0, 1, 2, 3], [0, 1, 2, 3, 0]], dtype=factor_dtype) # Generated with: # classifier_data = arange(25).reshape(5, 5).transpose() % 2 classifier_data = array( [[0, 1, 0, 1, 0], [1, 0, 1, 0, 1], [0, 1, 0, 1, 0], [1, 0, 1, 0, 1], [0, 1, 0, 1, 0]], dtype=int64_dtype) string_classifier_data = LabelArray( classifier_data.astype(str).astype(object), missing_value=None, ) expected_grouped_ranks = { 'ordinal': array([[1., 1., 3., 2., 2.], [1., 2., 3., 1., 2.], [2., 2., 1., 1., 3.], [2., 1., 1., 2., 3.], [1., 1., 3., 2., 2.]]), 'average': array([[1.5, 1., 3., 2., 1.5], [1.5, 2., 3., 1., 1.5], [2.5, 2., 1., 1., 2.5], [2.5, 1., 1., 2., 2.5], [1.5, 1., 3., 2., 1.5]]), 'min': array([[1., 1., 3., 2., 1.], [1., 2., 3., 1., 1.], [2., 2., 1., 1., 2.], [2., 1., 1., 2., 2.], [1., 1., 3., 2., 1.]]), 'max': array([[2., 1., 3., 2., 2.], [2., 2., 3., 1., 2.], [3., 2., 1., 1., 3.], [3., 1., 1., 2., 3.], [2., 1., 3., 2., 2.]]), 'dense': array([[1., 1., 2., 2., 1.], [1., 2., 2., 1., 1.], [2., 2., 1., 1., 2.], [2., 1., 1., 2., 2.], [1., 1., 2., 2., 1.]]), } def check(terms): graph = TermGraph(terms) results = self.run_graph( graph, initial_workspace={ f: data, c: classifier_data, str_c: string_classifier_data, }, 
mask=self.build_mask(ones((5, 5))), ) for method in terms: check_arrays(results[method], expected_grouped_ranks[method]) # Not specifying the value of ascending param should default to True check({ meth: f.rank(method=meth, groupby=c) for meth in expected_grouped_ranks }) check({ meth: f.rank(method=meth, groupby=str_c) for meth in expected_grouped_ranks }) check({ meth: f.rank(method=meth, groupby=c, ascending=True) for meth in expected_grouped_ranks }) check({ meth: f.rank(method=meth, groupby=str_c, ascending=True) for meth in expected_grouped_ranks }) # Not passing a method should default to ordinal check({'ordinal': f.rank(groupby=c)}) check({'ordinal': f.rank(groupby=str_c)}) check({'ordinal': f.rank(groupby=c, ascending=True)}) check({'ordinal': f.rank(groupby=str_c, ascending=True)}) @for_each_factor_dtype def test_grouped_rank_descending(self, name, factor_dtype): f = F(dtype=factor_dtype) c = C() str_c = C(dtype=categorical_dtype, missing_value=None) # Generated with: # data = arange(25).reshape(5, 5).transpose() % 4 data = array([[0, 1, 2, 3, 0], [1, 2, 3, 0, 1], [2, 3, 0, 1, 2], [3, 0, 1, 2, 3], [0, 1, 2, 3, 0]], dtype=factor_dtype) # Generated with: # classifier_data = arange(25).reshape(5, 5).transpose() % 2 classifier_data = array( [[0, 1, 0, 1, 0], [1, 0, 1, 0, 1], [0, 1, 0, 1, 0], [1, 0, 1, 0, 1], [0, 1, 0, 1, 0]], dtype=int64_dtype) string_classifier_data = LabelArray( classifier_data.astype(str).astype(object), missing_value=None, ) expected_grouped_ranks = { 'ordinal': array([[2., 2., 1., 1., 3.], [2., 1., 1., 2., 3.], [1., 1., 3., 2., 2.], [1., 2., 3., 1., 2.], [2., 2., 1., 1., 3.]]), 'average': array([[2.5, 2., 1., 1., 2.5], [2.5, 1., 1., 2., 2.5], [1.5, 1., 3., 2., 1.5], [1.5, 2., 3., 1., 1.5], [2.5, 2., 1., 1., 2.5]]), 'min': array([[2., 2., 1., 1., 2.], [2., 1., 1., 2., 2.], [1., 1., 3., 2., 1.], [1., 2., 3., 1., 1.], [2., 2., 1., 1., 2.]]), 'max': array([[3., 2., 1., 1., 3.], [3., 1., 1., 2., 3.], [2., 1., 3., 2., 2.], [2., 2., 3., 1., 
2.], [3., 2., 1., 1., 3.]]), 'dense': array([[2., 2., 1., 1., 2.], [2., 1., 1., 2., 2.], [1., 1., 2., 2., 1.], [1., 2., 2., 1., 1.], [2., 2., 1., 1., 2.]]), } def check(terms): graph = TermGraph(terms) results = self.run_graph( graph, initial_workspace={ f: data, c: classifier_data, str_c: string_classifier_data, }, mask=self.build_mask(ones((5, 5))), ) for method in terms: check_arrays(results[method], expected_grouped_ranks[method]) check({ meth: f.rank(method=meth, groupby=c, ascending=False) for meth in expected_grouped_ranks }) check({ meth: f.rank(method=meth, groupby=str_c, ascending=False) for meth in expected_grouped_ranks }) # Not passing a method should default to ordinal check({'ordinal': f.rank(groupby=c, ascending=False)}) check({'ordinal': f.rank(groupby=str_c, ascending=False)}) @parameterized.expand([ # Test cases computed by doing: # from numpy.random import seed, randn # from talib import RSI # seed(seed_value) # data = abs(randn(15, 3)) # expected = [RSI(data[:, i])[-1] for i in range(3)] (100, array([41.032913785966, 51.553585468393, 51.022005016446])), (101, array([43.506969935466, 46.145367530182, 50.57407044197])), (102, array([46.610102205934, 47.646892444315, 52.13182788538])), ]) def test_rsi(self, seed_value, expected): rsi = RSI() today = datetime64(1, 'ns') assets = arange(3) out = empty((3, ), dtype=float) seed(seed_value) # Seed so we get deterministic results. test_data = abs(randn(15, 3)) out = empty((3, ), dtype=float) rsi.compute(today, assets, out, test_data) check_allclose(expected, out) @parameterized.expand([ (100, 15), (101, 4), (102, 100), ]) def test_returns(self, seed_value, window_length): returns = Returns(window_length=window_length) today = datetime64(1, 'ns') assets = arange(3) out = empty((3, ), dtype=float) seed(seed_value) # Seed so we get deterministic results. 
test_data = abs(randn(window_length, 3)) # Calculate the expected returns expected = (test_data[-1] - test_data[0]) / test_data[0] out = empty((3, ), dtype=float) returns.compute(today, assets, out, test_data) check_allclose(expected, out) def gen_ranking_cases(): seeds = range(int(1e4), int(1e5), int(1e4)) methods = ('ordinal', 'average') use_mask_values = (True, False) set_missing_values = (True, False) ascending_values = (True, False) return product( seeds, methods, use_mask_values, set_missing_values, ascending_values, ) @parameterized.expand(gen_ranking_cases()) def test_masked_rankdata_2d(self, seed_value, method, use_mask, set_missing, ascending): eyemask = ~eye(5, dtype=bool) nomask = ones((5, 5), dtype=bool) seed(seed_value) asfloat = (randn(5, 5) * seed_value) asdatetime = (asfloat).copy().view('datetime64[ns]') mask = eyemask if use_mask else nomask if set_missing: asfloat[:, 2] = nan asdatetime[:, 2] = NaTns float_result = masked_rankdata_2d( data=asfloat, mask=mask, missing_value=nan, method=method, ascending=True, ) datetime_result = masked_rankdata_2d( data=asdatetime, mask=mask, missing_value=NaTns, method=method, ascending=True, ) check_arrays(float_result, datetime_result) def test_normalizations_hand_computed(self): """ Test the hand-computed example in factor.demean. 
""" f = self.f m = Mask() c = C() str_c = C(dtype=categorical_dtype, missing_value=None) factor_data = array([[1.0, 2.0, 3.0, 4.0], [1.5, 2.5, 3.5, 1.0], [2.0, 3.0, 4.0, 1.5], [2.5, 3.5, 1.0, 2.0]], ) filter_data = array( [[False, True, True, True], [True, False, True, True], [True, True, False, True], [True, True, True, False]], dtype=bool, ) classifier_data = array( [[1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2]], dtype=int64_dtype, ) string_classifier_data = LabelArray( classifier_data.astype(str).astype(object), missing_value=None, ) terms = { 'vanilla': f.demean(), 'masked': f.demean(mask=m), 'grouped': f.demean(groupby=c), 'grouped_str': f.demean(groupby=str_c), 'grouped_masked': f.demean(mask=m, groupby=c), 'grouped_masked_str': f.demean(mask=m, groupby=str_c), } expected = { 'vanilla': array([[-1.500, -0.500, 0.500, 1.500], [-0.625, 0.375, 1.375, -1.125], [-0.625, 0.375, 1.375, -1.125], [0.250, 1.250, -1.250, -0.250]], ), 'masked': array( [[nan, -1.000, 0.000, 1.000], [-0.500, nan, 1.500, -1.000], [-0.166, 0.833, nan, -0.666], [0.166, 1.166, -1.333, nan]], ), 'grouped': array([[-0.500, 0.500, -0.500, 0.500], [-0.500, 0.500, 1.250, -1.250], [-0.500, 0.500, 1.250, -1.250], [-0.500, 0.500, -0.500, 0.500]], ), 'grouped_masked': array([[nan, 0.000, -0.500, 0.500], [0.000, nan, 1.250, -1.250], [-0.500, 0.500, nan, 0.000], [-0.500, 0.500, 0.000, nan]]) } # Changing the classifier dtype shouldn't affect anything. 
expected['grouped_str'] = expected['grouped'] expected['grouped_masked_str'] = expected['grouped_masked'] graph = TermGraph(terms) results = self.run_graph( graph, initial_workspace={ f: factor_data, c: classifier_data, str_c: string_classifier_data, m: filter_data, }, mask=self.build_mask(self.ones_mask(shape=factor_data.shape)), ) for key, (res, exp) in dzip_exact(results, expected).items(): check_allclose( res, exp, # The hand-computed values aren't very precise (in particular, # we truncate repeating decimals at 3 places) This is just # asserting that the example isn't misleading by being totally # wrong. atol=0.001, err_msg="Mismatch for %r" % key) @parameter_space( seed_value=range(1, 2), normalizer_name_and_func=[ ('demean', lambda row: row - nanmean(row)), ('zscore', lambda row: (row - nanmean(row)) / nanstd(row)), ], add_nulls_to_factor=( False, True, ), ) def test_normalizations_randomized(self, seed_value, normalizer_name_and_func, add_nulls_to_factor): name, func = normalizer_name_and_func shape = (7, 7) # All Trues. nomask = self.ones_mask(shape=shape) # Falses on main diagonal. eyemask = self.eye_mask(shape=shape) # Falses on other diagonal. eyemask90 = rot90(eyemask) # Falses on both diagonals. xmask = eyemask & eyemask90 # Block of random data. factor_data = self.randn_data(seed=seed_value, shape=shape) if add_nulls_to_factor: factor_data = where(eyemask, factor_data, nan) # Cycles of 0, 1, 2, 0, 1, 2, ... classifier_data = ( (self.arange_data(shape=shape, dtype=int64_dtype) + seed_value) % 3) # With -1s on main diagonal. classifier_data_eyenulls = where(eyemask, classifier_data, -1) # With -1s on opposite diagonal. classifier_data_eyenulls90 = where(eyemask90, classifier_data, -1) # With -1s on both diagonals. 
classifier_data_xnulls = where(xmask, classifier_data, -1) f = self.f c = C() c_with_nulls = OtherC() m = Mask() method = getattr(f, name) terms = { 'vanilla': method(), 'masked': method(mask=m), 'grouped': method(groupby=c), 'grouped_with_nulls': method(groupby=c_with_nulls), 'both': method(mask=m, groupby=c), 'both_with_nulls': method(mask=m, groupby=c_with_nulls), } expected = { 'vanilla': apply_along_axis( func, 1, factor_data, ), 'masked': where( eyemask, grouped_apply(factor_data, eyemask, func), nan, ), 'grouped': grouped_apply( factor_data, classifier_data, func, ), # If the classifier has nulls, we should get NaNs in the # corresponding locations in the output. 'grouped_with_nulls': where( eyemask90, grouped_apply(factor_data, classifier_data_eyenulls90, func), nan, ), # Passing a mask with a classifier should behave as though the # classifier had nulls where the mask was False. 'both': where( eyemask, grouped_apply( factor_data, classifier_data_eyenulls, func, ), nan, ), 'both_with_nulls': where( xmask, grouped_apply( factor_data, classifier_data_xnulls, func, ), nan, ) } self.check_terms( terms=terms, expected=expected, initial_workspace={ f: factor_data, c: classifier_data, c_with_nulls: classifier_data_eyenulls90, Mask(): eyemask, }, mask=self.build_mask(nomask), ) @parameter_space(method_name=['demean', 'zscore']) def test_cant_normalize_non_float(self, method_name): class DateFactor(Factor): dtype = datetime64ns_dtype inputs = () window_length = 0 d = DateFactor() with self.assertRaises(TypeError) as e: getattr(d, method_name)() errmsg = str(e.exception) expected = ( "{normalizer}() is only defined on Factors of dtype float64," " but it was called on a Factor of dtype datetime64[ns].").format( normalizer=method_name) self.assertEqual(errmsg, expected) @parameter_space(seed=[1, 2, 3]) def test_quantiles_unmasked(self, seed): permute = partial(permute_rows, seed) shape = (6, 6) # Shuffle the input rows to verify that we don't depend on the order. 
# Take the log to ensure that we don't depend on linear scaling or # integrality of inputs factor_data = permute(log1p(arange(36, dtype=float).reshape(shape))) f = self.f # Apply the same shuffle we applied to the input rows to our # expectations. Doing it this way makes it obvious that our # expectation corresponds to our input, while still testing against # a range of input orderings. permuted_array = compose(permute, partial(array, dtype=int64_dtype)) self.check_terms( terms={ '2': f.quantiles(bins=2), '3': f.quantiles(bins=3), '6': f.quantiles(bins=6), }, initial_workspace={ f: factor_data, }, expected={ # The values in the input are all increasing, so the first half # of each row should be in the bottom bucket, and the second # half should be in the top bucket. '2': permuted_array([[0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 1, 1]]), # Similar for three buckets. '3': permuted_array([[0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2]]), # In the limiting case, we just have every column different. '6': permuted_array([[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]]), }, mask=self.build_mask(self.ones_mask(shape=shape)), ) @parameter_space(seed=[1, 2, 3]) def test_quantiles_masked(self, seed): permute = partial(permute_rows, seed) # 7 x 7 so that we divide evenly into 2/3/6-tiles after including the # nan value in each row. shape = (7, 7) # Shuffle the input rows to verify that we don't depend on the order. 
# Take the log to ensure that we don't depend on linear scaling or # integrality of inputs factor_data = permute(log1p(arange(49, dtype=float).reshape(shape))) factor_data_w_nans = where( permute(rot90(self.eye_mask(shape=shape))), factor_data, nan, ) mask_data = permute(self.eye_mask(shape=shape)) f = F() f_nans = OtherF() m = Mask() # Apply the same shuffle we applied to the input rows to our # expectations. Doing it this way makes it obvious that our # expectation corresponds to our input, while still testing against # a range of input orderings. permuted_array = compose(permute, partial(array, dtype=int64_dtype)) self.check_terms( terms={ '2_masked': f.quantiles(bins=2, mask=m), '3_masked': f.quantiles(bins=3, mask=m), '6_masked': f.quantiles(bins=6, mask=m), '2_nans': f_nans.quantiles(bins=2), '3_nans': f_nans.quantiles(bins=3), '6_nans': f_nans.quantiles(bins=6), }, initial_workspace={ f: factor_data, f_nans: factor_data_w_nans, m: mask_data, }, expected={ # Expected results here are the same as in # test_quantiles_unmasked, except with diagonals of -1s # interpolated to match the effects of masking and/or input # nans. 
'2_masked': permuted_array([[-1, 0, 0, 0, 1, 1, 1], [0, -1, 0, 0, 1, 1, 1], [0, 0, -1, 0, 1, 1, 1], [0, 0, 0, -1, 1, 1, 1], [0, 0, 0, 1, -1, 1, 1], [0, 0, 0, 1, 1, -1, 1], [0, 0, 0, 1, 1, 1, -1]]), '3_masked': permuted_array([[-1, 0, 0, 1, 1, 2, 2], [0, -1, 0, 1, 1, 2, 2], [0, 0, -1, 1, 1, 2, 2], [0, 0, 1, -1, 1, 2, 2], [0, 0, 1, 1, -1, 2, 2], [0, 0, 1, 1, 2, -1, 2], [0, 0, 1, 1, 2, 2, -1]]), '6_masked': permuted_array([[-1, 0, 1, 2, 3, 4, 5], [0, -1, 1, 2, 3, 4, 5], [0, 1, -1, 2, 3, 4, 5], [0, 1, 2, -1, 3, 4, 5], [0, 1, 2, 3, -1, 4, 5], [0, 1, 2, 3, 4, -1, 5], [0, 1, 2, 3, 4, 5, -1]]), '2_nans': permuted_array([[0, 0, 0, 1, 1, 1, -1], [0, 0, 0, 1, 1, -1, 1], [0, 0, 0, 1, -1, 1, 1], [0, 0, 0, -1, 1, 1, 1], [0, 0, -1, 0, 1, 1, 1], [0, -1, 0, 0, 1, 1, 1], [-1, 0, 0, 0, 1, 1, 1]]), '3_nans': permuted_array([[0, 0, 1, 1, 2, 2, -1], [0, 0, 1, 1, 2, -1, 2], [0, 0, 1, 1, -1, 2, 2], [0, 0, 1, -1, 1, 2, 2], [0, 0, -1, 1, 1, 2, 2], [0, -1, 0, 1, 1, 2, 2], [-1, 0, 0, 1, 1, 2, 2]]), '6_nans': permuted_array([[0, 1, 2, 3, 4, 5, -1], [0, 1, 2, 3, 4, -1, 5], [0, 1, 2, 3, -1, 4, 5], [0, 1, 2, -1, 3, 4, 5], [0, 1, -1, 2, 3, 4, 5], [0, -1, 1, 2, 3, 4, 5], [-1, 0, 1, 2, 3, 4, 5]]), }, mask=self.build_mask(self.ones_mask(shape=shape)), ) def test_quantiles_uneven_buckets(self): permute = partial(permute_rows, 5) shape = (5, 5) factor_data = permute(log1p(arange(25, dtype=float).reshape(shape))) mask_data = permute(self.eye_mask(shape=shape)) f = F() m = Mask() permuted_array = compose(permute, partial(array, dtype=int64_dtype)) self.check_terms( terms={ '3_masked': f.quantiles(bins=3, mask=m), '7_masked': f.quantiles(bins=7, mask=m), }, initial_workspace={ f: factor_data, m: mask_data, }, expected={ '3_masked': permuted_array([[-1, 0, 0, 1, 2], [0, -1, 0, 1, 2], [0, 0, -1, 1, 2], [0, 0, 1, -1, 2], [0, 0, 1, 2, -1]]), '7_masked': permuted_array([[-1, 0, 2, 4, 6], [0, -1, 2, 4, 6], [0, 2, -1, 4, 6], [0, 2, 4, -1, 6], [0, 2, 4, 6, -1]]), }, 
mask=self.build_mask(self.ones_mask(shape=shape)), ) def test_quantile_helpers(self): f = self.f m = Mask() self.assertIs(f.quartiles(), f.quantiles(bins=4)) self.assertIs(f.quartiles(mask=m), f.quantiles(bins=4, mask=m)) self.assertIsNot(f.quartiles(), f.quartiles(mask=m)) self.assertIs(f.quintiles(), f.quantiles(bins=5)) self.assertIs(f.quintiles(mask=m), f.quantiles(bins=5, mask=m)) self.assertIsNot(f.quintiles(), f.quintiles(mask=m)) self.assertIs(f.deciles(), f.quantiles(bins=10)) self.assertIs(f.deciles(mask=m), f.quantiles(bins=10, mask=m)) self.assertIsNot(f.deciles(), f.deciles(mask=m))
def zscore(row):
    """
    Standardize ``row`` by centering it and scaling by its spread.

    The center is ``nanmean(row)`` and the scale is ``nanstd(row)``, so the
    result is ``(row - mean) / std`` computed element-wise.
    """
    centered = row - nanmean(row)
    spread = nanstd(row)
    return centered / spread
def compute(self, today, assets, out, close, k):
    """
    Write a center line and a symmetric band around it onto ``out``.

    The center is ``nanmean(close, axis=0)`` and the band half-width is
    ``k * nanstd(close, axis=0)``.  Results are stored as the attributes
    ``out.middle``, ``out.upper`` and ``out.lower``; nothing is returned.

    ``today`` and ``assets`` are accepted but not used here.

    NOTE(review): ``close`` is presumably a 2-D window with one column per
    asset, since both reductions run along axis 0 -- confirm against the
    enclosing class.
    """
    half_width = k * nanstd(close, axis=0)
    center = nanmean(close, axis=0)

    out.middle = center
    out.upper = center + half_width
    out.lower = center - half_width
def vectorized_beta(dependents, independent, allowed_missing, out=None):
    """
    Compute, column by column, the slope of a simple linear regression of
    ``dependents`` on ``independent``.

    Parameters
    ----------
    dependents : np.array[N, M]
        Array whose columns are each regressed against ``independent``.
    independent : np.array[N, 1]
        Independent variable of the regression.
    allowed_missing : int
        Number of missing (NaN) observations tolerated per column. A row
        counts as missing for a column when either the dependent value or
        the independent value is NaN. Columns with more than this many
        missing rows get NaN as their coefficient.
    out : np.array[M] or None, optional
        Array to write the results into. If None, a new array is allocated
        and returned.

    Returns
    -------
    slopes : np.array[M]
        Regression coefficient for each column of ``dependents``. This is
        ``out`` itself when ``out`` was supplied.
    """
    _, n_columns = dependents.shape
    if out is None:
        out = np.full(n_columns, np.nan)

    # Broadcast the independent variable against every dependent column and
    # blank out the rows where that column has no observation, so both sides
    # of each regression share one missing-value pattern.
    #
    # PERF_TODO: this materializes an (N, M) copy; a Cython implementation
    # could probably avoid the extra space.
    # shape: (N, M)
    x = np.where(np.isnan(dependents), np.nan, independent)

    # beta = Cov(X, Y) / Var(X)
    # https://en.wikipedia.org/wiki/Simple_linear_regression#Fitting_the_regression_line  # noqa
    #
    # The textbook covariance is mean((X - mean(X)) * (Y - mean(Y))), but
    # only one factor needs to be centered. With X_c = X - mean(X):
    #
    #     mean(X_c * (Y - mean(Y)))
    #         = mean(X_c * Y) - mean(X_c * mean(Y))
    #         = mean(X_c * Y) - mean(X_c) * mean(Y)
    #         = mean(X_c * Y) - 0 * mean(Y)
    #         = mean(X_c * Y)
    #
    # because mean(X - mean(X)) = mean(X) - mean(X) = 0 for any X. The
    # centered X is needed for the variance anyway, so X is the one centered.
    # shape: (N, M)
    x_centered = x - nanmean(x, axis=0)

    # shape: (M,)
    cov_xy = nanmean(x_centered * dependents, axis=0)

    # Each column can have a different variance of X, since each column may
    # have dropped a different subset of rows to missing data.
    # shape: (M,)
    var_x = nanmean(x_centered ** 2, axis=0)

    # shape: (M,)
    np.divide(cov_xy, var_x, out=out)

    # Overwrite with NaN every column that has more than the allowed number
    # of missing rows.
    too_many_missing = np.isnan(x).sum(axis=0) > allowed_missing
    out[too_many_missing] = np.nan

    return out