def unary_operator(op): """ Factory function for making unary operator methods for Factors. """ # Only negate is currently supported. valid_ops = {'-'} if op not in valid_ops: raise ValueError("Invalid unary operator %s." % op) @with_doc("Unary Operator: '%s'" % op) @with_name(unary_op_name(op)) def unary_operator(self): if self.dtype != float64_dtype: raise TypeError( "Can't apply unary operator {op!r} to instance of " "{typename!r} with dtype {dtypename!r}.\n" "{op!r} is only supported for Factors of dtype " "'float64'.".format( op=op, typename=type(self).__name__, dtypename=self.dtype.name, ) ) # This can't be hoisted up a scope because the types returned by # unary_op_return_type aren't defined when the top-level function is # invoked. if isinstance(self, NumericalExpression): return NumExprFactor( "{op}({expr})".format(op=op, expr=self._expr), self.inputs, dtype=float64_dtype, ) else: return NumExprFactor( "{op}x_0".format(op=op), (self,), dtype=float64_dtype, ) return unary_operator
class Factor(CompositeTerm): """ Pipeline API expression producing numerically-valued outputs. """ # Dynamically add functions for creating NumExprFactor/NumExprFilter # instances. clsdict = locals() clsdict.update({ method_name_for_op(op): binary_operator(op) # Don't override __eq__ because it breaks comparisons on tuples of # Factors. for op in MATH_BINOPS.union(COMPARISONS - {'=='}) }) clsdict.update({ method_name_for_op(op, commute=True): reflected_binary_operator(op) for op in MATH_BINOPS }) clsdict.update({unary_op_name(op): unary_operator(op) for op in UNARY_OPS}) clsdict.update({ funcname: function_application(funcname) for funcname in NUMEXPR_MATH_FUNCS }) __truediv__ = clsdict['__div__'] __rtruediv__ = clsdict['__rdiv__'] eq = binary_operator('==') def _validate(self): # Do superclass validation first so that `NotSpecified` dtypes get # handled. retval = super(Factor, self)._validate() if self.dtype not in FACTOR_DTYPES: raise UnsupportedDataType(typename=type(self).__name__, dtype=self.dtype) return retval def rank(self, method='ordinal', ascending=True, mask=NotSpecified): """ Construct a new Factor representing the sorted rank of each column within each row. Parameters ---------- method : str, {'ordinal', 'min', 'max', 'dense', 'average'} The method used to assign ranks to tied elements. See `scipy.stats.rankdata` for a full description of the semantics for each ranking method. Default is 'ordinal'. ascending : bool, optional Whether to return sorted rank in ascending or descending order. Default is True. mask : zipline.pipeline.Filter, optional A Filter representing assets to consider when computing ranks. If mask is supplied, ranks are computed ignoring any asset/date pairs for which `mask` produces a value of False. Returns ------- ranks : zipline.pipeline.factors.Rank A new factor that will compute the ranking of the data produced by `self`. Notes ----- The default value for `method` is different from the default for `scipy.stats.rankdata`. See that function's documentation for a full description of the valid inputs to `method`. Missing or non-existent data on a given day will cause an asset to be given a rank of NaN for that day. See Also -------- scipy.stats.rankdata zipline.lib.rank.masked_rankdata_2d zipline.pipeline.factors.factor.Rank """ return Rank(self, method=method, ascending=ascending, mask=mask) def top(self, N, mask=NotSpecified): """ Construct a Filter matching the top N asset values of self each day. Parameters ---------- N : int Number of assets passing the returned filter each day. mask : zipline.pipeline.Filter, optional A Filter representing assets to consider when computing ranks. If mask is supplied, top values are computed ignoring any asset/date pairs for which `mask` produces a value of False. Returns ------- filter : zipline.pipeline.filters.Filter """ return self.rank(ascending=False, mask=mask) <= N def bottom(self, N, mask=NotSpecified): """ Construct a Filter matching the bottom N asset values of self each day. Parameters ---------- N : int Number of assets passing the returned filter each day. mask : zipline.pipeline.Filter, optional A Filter representing assets to consider when computing ranks. If mask is supplied, bottom values are computed ignoring any asset/date pairs for which `mask` produces a value of False. Returns ------- filter : zipline.pipeline.Filter """ return self.rank(ascending=True, mask=mask) <= N def percentile_between(self, min_percentile, max_percentile, mask=NotSpecified): """ Construct a new Filter representing entries from the output of this Factor that fall within the percentile range defined by min_percentile and max_percentile. Parameters ---------- min_percentile : float [0.0, 100.0] Return True for assets falling above this percentile in the data. max_percentile : float [0.0, 100.0] Return True for assets falling below this percentile in the data. mask : zipline.pipeline.Filter, optional A Filter representing assets to consider when percentile thresholds. If mask is supplied, percentile cutoffs are computed each day using only assets for which `mask` returns True, and assets not passing `mask` will produce False in the output of this filter as well. Returns ------- out : zipline.pipeline.filters.PercentileFilter A new filter that will compute the specified percentile-range mask. See Also -------- zipline.pipeline.filters.filter.PercentileFilter """ return PercentileFilter( self, min_percentile=min_percentile, max_percentile=max_percentile, mask=mask, ) def isnan(self): """ A Filter producing True for all values where this Factor is NaN. Returns ------- nanfilter : zipline.pipeline.filters.Filter """ return self != self def notnan(self): """ A Filter producing True for values where this Factor is not NaN. Returns ------- nanfilter : zipline.pipeline.filters.Filter """ return ~self.isnan() def isfinite(self): """ A Filter producing True for values where this Factor is anything but NaN, inf, or -inf. """ return (-inf < self) & (self < inf)
class Factor(RestrictedDTypeMixin, ComputableTerm): """ Pipeline API expression producing a numerical or date-valued output. Factors are the most commonly-used Pipeline term, representing the result of any computation producing a numerical result. Factors can be combined, both with other Factors and with scalar values, via any of the builtin mathematical operators (``+``, ``-``, ``*``, etc). This makes it easy to write complex expressions that combine multiple Factors. For example, constructing a Factor that computes the average of two other Factors is simply:: >>> f1 = SomeFactor(...) >>> f2 = SomeOtherFactor(...) >>> average = (f1 + f2) / 2.0 Factors can also be converted into :class:`zipline.pipeline.Filter` objects via comparison operators: (``<``, ``<=``, ``!=``, ``eq``, ``>``, ``>=``). There are many natural operators defined on Factors besides the basic numerical operators. These include methods identifying missing or extreme-valued outputs (isnull, notnull, isnan, notnan), methods for normalizing outputs (rank, demean, zscore), and methods for constructing Filters based on rank-order properties of results (top, bottom, percentile_between). """ ALLOWED_DTYPES = FACTOR_DTYPES # Used by RestrictedDTypeMixin # Dynamically add functions for creating NumExprFactor/NumExprFilter # instances. clsdict = locals() clsdict.update({ method_name_for_op(op): binary_operator(op) # Don't override __eq__ because it breaks comparisons on tuples of # Factors. for op in MATH_BINOPS.union(COMPARISONS - {'=='}) }) clsdict.update({ method_name_for_op(op, commute=True): reflected_binary_operator(op) for op in MATH_BINOPS }) clsdict.update({unary_op_name(op): unary_operator(op) for op in UNARY_OPS}) clsdict.update({ funcname: function_application(funcname) for funcname in NUMEXPR_MATH_FUNCS }) __truediv__ = clsdict['__div__'] __rtruediv__ = clsdict['__rdiv__'] eq = binary_operator('==') @expect_types( mask=(Filter, NotSpecifiedType), groupby=(Classifier, NotSpecifiedType), ) @float64_only def demean(self, mask=NotSpecified, groupby=NotSpecified): """ Construct a Factor that computes ``self`` and subtracts the mean from row of the result. If ``mask`` is supplied, ignore values where ``mask`` returns False when computing row means, and output NaN anywhere the mask is False. If ``groupby`` is supplied, compute by partitioning each row based on the values produced by ``groupby``, de-meaning the partitioned arrays, and stitching the sub-results back together. Parameters ---------- mask : zipline.pipeline.Filter, optional A Filter defining values to ignore when computing means. groupby : zipline.pipeline.Classifier, optional A classifier defining partitions over which to compute means. Example ------- Let ``f`` be a Factor which would produce the following output:: AAPL MSFT MCD BK 2017-03-13 1.0 2.0 3.0 4.0 2017-03-14 1.5 2.5 3.5 1.0 2017-03-15 2.0 3.0 4.0 1.5 2017-03-16 2.5 3.5 1.0 2.0 Let ``c`` be a Classifier producing the following output:: AAPL MSFT MCD BK 2017-03-13 1 1 2 2 2017-03-14 1 1 2 2 2017-03-15 1 1 2 2 2017-03-16 1 1 2 2 Let ``m`` be a Filter producing the following output:: AAPL MSFT MCD BK 2017-03-13 False True True True 2017-03-14 True False True True 2017-03-15 True True False True 2017-03-16 True True True False Then ``f.demean()`` will subtract the mean from each row produced by ``f``. :: AAPL MSFT MCD BK 2017-03-13 -1.500 -0.500 0.500 1.500 2017-03-14 -0.625 0.375 1.375 -1.125 2017-03-15 -0.625 0.375 1.375 -1.125 2017-03-16 0.250 1.250 -1.250 -0.250 ``f.demean(mask=m)`` will subtract the mean from each row, but means will be calculated ignoring values on the diagonal, and NaNs will written to the diagonal in the output. Diagonal values are ignored because they are the locations where the mask ``m`` produced False. :: AAPL MSFT MCD BK 2017-03-13 NaN -1.000 0.000 1.000 2017-03-14 -0.500 NaN 1.500 -1.000 2017-03-15 -0.166 0.833 NaN -0.666 2017-03-16 0.166 1.166 -1.333 NaN ``f.demean(groupby=c)`` will subtract the group-mean of AAPL/MSFT and MCD/BK from their respective entries. The AAPL/MSFT are grouped together because both assets always produce 1 in the output of the classifier ``c``. Similarly, MCD/BK are grouped together because they always produce 2. :: AAPL MSFT MCD BK 2017-03-13 -0.500 0.500 -0.500 0.500 2017-03-14 -0.500 0.500 1.250 -1.250 2017-03-15 -0.500 0.500 1.250 -1.250 2017-03-16 -0.500 0.500 -0.500 0.500 ``f.demean(mask=m, groupby=c)`` will also subtract the group-mean of AAPL/MSFT and MCD/BK, but means will be calculated ignoring values on the diagonal , and NaNs will be written to the diagonal in the output. :: AAPL MSFT MCD BK 2017-03-13 NaN 0.000 -0.500 0.500 2017-03-14 0.000 NaN 1.250 -1.250 2017-03-15 -0.500 0.500 NaN 0.000 2017-03-16 -0.500 0.500 0.000 NaN Notes ----- Mean is sensitive to the magnitudes of outliers. When working with factor that can potentially produce large outliers, it is often useful to use the ``mask`` parameter to discard values at the extremes of the distribution:: >>> base = MyFactor(...) >>> normalized = base.demean(mask=base.percentile_between(1, 99)) ``demean()`` is only supported on Factors of dtype float64. See Also -------- :meth:`pandas.DataFrame.groupby` """ return GroupedRowTransform( transform=lambda row: row - nanmean(row), factor=self, mask=mask, groupby=groupby, ) @expect_types( mask=(Filter, NotSpecifiedType), groupby=(Classifier, NotSpecifiedType), ) @float64_only def zscore(self, mask=NotSpecified, groupby=NotSpecified): """ Construct a Factor that Z-Scores each day's results. The Z-Score of a row is defined as:: (row - row.mean()) / row.stddev() If ``mask`` is supplied, ignore values where ``mask`` returns False when computing row means and standard deviations, and output NaN anywhere the mask is False. If ``groupby`` is supplied, compute by partitioning each row based on the values produced by ``groupby``, z-scoring the partitioned arrays, and stitching the sub-results back together. Parameters ---------- mask : zipline.pipeline.Filter, optional A Filter defining values to ignore when Z-Scoring. groupby : zipline.pipeline.Classifier, optional A classifier defining partitions over which to compute Z-Scores. Returns ------- zscored : zipline.pipeline.Factor A Factor producing that z-scores the output of self. Notes ----- Mean and standard deviation are sensitive to the magnitudes of outliers. When working with factor that can potentially produce large outliers, it is often useful to use the ``mask`` parameter to discard values at the extremes of the distribution:: >>> base = MyFactor(...) >>> normalized = base.zscore(mask=base.percentile_between(1, 99)) ``zscore()`` is only supported on Factors of dtype float64. Example ------- See :meth:`~zipline.pipeline.factors.Factor.demean` for an in-depth example of the semantics for ``mask`` and ``groupby``. See Also -------- :meth:`pandas.DataFrame.groupby` """ return GroupedRowTransform( transform=lambda row: (row - nanmean(row)) / nanstd(row), factor=self, mask=mask, groupby=groupby, ) def rank(self, method='ordinal', ascending=True, mask=NotSpecified): """ Construct a new Factor representing the sorted rank of each column within each row. Parameters ---------- method : str, {'ordinal', 'min', 'max', 'dense', 'average'} The method used to assign ranks to tied elements. See `scipy.stats.rankdata` for a full description of the semantics for each ranking method. Default is 'ordinal'. ascending : bool, optional Whether to return sorted rank in ascending or descending order. Default is True. mask : zipline.pipeline.Filter, optional A Filter representing assets to consider when computing ranks. If mask is supplied, ranks are computed ignoring any asset/date pairs for which `mask` produces a value of False. Returns ------- ranks : zipline.pipeline.factors.Rank A new factor that will compute the ranking of the data produced by `self`. Notes ----- The default value for `method` is different from the default for `scipy.stats.rankdata`. See that function's documentation for a full description of the valid inputs to `method`. Missing or non-existent data on a given day will cause an asset to be given a rank of NaN for that day. See Also -------- :func:`scipy.stats.rankdata` :class:`zipline.pipeline.factors.factor.Rank` """ return Rank(self, method=method, ascending=ascending, mask=mask) def top(self, N, mask=NotSpecified): """ Construct a Filter matching the top N asset values of self each day. Parameters ---------- N : int Number of assets passing the returned filter each day. mask : zipline.pipeline.Filter, optional A Filter representing assets to consider when computing ranks. If mask is supplied, top values are computed ignoring any asset/date pairs for which `mask` produces a value of False. Returns ------- filter : zipline.pipeline.filters.Filter """ return self.rank(ascending=False, mask=mask) <= N def bottom(self, N, mask=NotSpecified): """ Construct a Filter matching the bottom N asset values of self each day. Parameters ---------- N : int Number of assets passing the returned filter each day. mask : zipline.pipeline.Filter, optional A Filter representing assets to consider when computing ranks. If mask is supplied, bottom values are computed ignoring any asset/date pairs for which `mask` produces a value of False. Returns ------- filter : zipline.pipeline.Filter """ return self.rank(ascending=True, mask=mask) <= N def percentile_between(self, min_percentile, max_percentile, mask=NotSpecified): """ Construct a new Filter representing entries from the output of this Factor that fall within the percentile range defined by min_percentile and max_percentile. Parameters ---------- min_percentile : float [0.0, 100.0] Return True for assets falling above this percentile in the data. max_percentile : float [0.0, 100.0] Return True for assets falling below this percentile in the data. mask : zipline.pipeline.Filter, optional A Filter representing assets to consider when percentile calculating thresholds. If mask is supplied, percentile cutoffs are computed each day using only assets for which ``mask`` returns True. Assets for which ``mask`` produces False will produce False in the output of this Factor as well. Returns ------- out : zipline.pipeline.filters.PercentileFilter A new filter that will compute the specified percentile-range mask. See Also -------- zipline.pipeline.filters.filter.PercentileFilter """ return PercentileFilter( self, min_percentile=min_percentile, max_percentile=max_percentile, mask=mask, ) def isnull(self): """ A Filter producing True for values where this Factor has missing data. Equivalent to self.isnan() when ``self.dtype`` is float64. Otherwise equivalent to ``self.eq(self.missing_value)``. Returns ------- filter : zipline.pipeline.filters.Filter """ if self.dtype == float64_dtype: # Using isnan is more efficient when possible because we can fold # the isnan computation with other NumExpr expressions. return self.isnan() else: return NullFilter(self) def notnull(self): """ A Filter producing True for values where this Factor has complete data. Equivalent to ``~self.isnan()` when ``self.dtype`` is float64. Otherwise equivalent to ``(self != self.missing_value)``. """ return ~self.isnull() @if_not_float64_tell_caller_to_use_isnull def isnan(self): """ A Filter producing True for all values where this Factor is NaN. Returns ------- nanfilter : zipline.pipeline.filters.Filter """ return self != self @if_not_float64_tell_caller_to_use_isnull def notnan(self): """ A Filter producing True for values where this Factor is not NaN. Returns ------- nanfilter : zipline.pipeline.filters.Filter """ return ~self.isnan() @if_not_float64_tell_caller_to_use_isnull def isfinite(self): """ A Filter producing True for values where this Factor is anything but NaN, inf, or -inf. """ return (-inf < self) & (self < inf)