def test_single_factor(self):
    """
    Test dependency resolution for a single factor.
    """
    def check_output(graph):
        # The graph should contain exactly the factor and its two inputs.
        order = list(graph.ordered())
        self.assertEqual(len(order), 3)
        # Both atomic inputs must be resolved before the factor itself;
        # their relative order is unspecified, so compare as a set.
        self.assertEqual(
            {order[0], order[1]},
            {SomeDataSet.foo, SomeDataSet.bar},
        )
        self.assertEqual(order[-1], SomeFactor())
        # Each input needs 4 extra rows of lookback for the factor's window.
        self.assertEqual(graph.node[SomeDataSet.foo]['extra_rows'], 4)
        self.assertEqual(graph.node[SomeDataSet.bar]['extra_rows'], 4)

    # Every equivalent spelling of the factor should produce the same graph.
    for foobar in gen_equivalent_factors():
        check_output(TermGraph(to_dict([foobar])))
def test_single_factor_instance_args(self):
    """
    Test dependency resolution for a single factor with arguments passed to
    the constructor.
    """
    bar, buzz = SomeDataSet.bar, SomeDataSet.buzz
    graph = TermGraph(to_dict([SomeFactor([bar, buzz], window_length=5)]))
    order = list(graph.ordered())

    # Two atomic inputs plus the factor itself.
    self.assertEqual(len(order), 3)
    # Inputs come first in unspecified relative order.
    self.assertEqual({order[0], order[1]}, {bar, buzz})
    # The factor (compared by value) is resolved last.
    self.assertEqual(
        order[-1],
        SomeFactor([bar, buzz], window_length=5),
    )
    # A window_length of 5 implies 4 extra rows of lookback per input.
    self.assertEqual(graph.extra_rows[bar], 4)
    self.assertEqual(graph.extra_rows[buzz], 4)
def run_terms(self, terms, initial_workspace, mask=None):
    """
    Compute the given terms, seeding the workspace of our FFCEngine with
    `initial_workspace`.

    Parameters
    ----------
    terms : dict
        Mapping from termname -> term object.
    initial_workspace : dict
        Mapping from term -> pre-computed array, used to seed the engine's
        workspace before computation.
    mask : pd.DataFrame, optional
        Lifetimes mask to compute over.  Defaults to `self.__mask` when not
        supplied.

    Returns
    -------
    results : dict
        Mapping from termname -> computed result.
    """
    engine = SimpleFFCEngine(
        ExplodingObject(),
        self.__calendar,
        self.__finder,
    )
    # Fall back to the fixture-wide default mask when none is given.
    mask = mask if mask is not None else self.__mask
    return engine.compute_chunk(TermGraph(terms), mask, initial_workspace)
def test_reuse_atomic_terms(self):
    """
    Test that raw inputs only show up in the dependency graph once.
    """
    f1 = SomeFactor([SomeDataSet.foo, SomeDataSet.bar])
    f2 = SomeOtherFactor([SomeDataSet.bar, SomeDataSet.buzz])

    order = list(TermGraph(to_dict([f1, f2])).ordered())

    # Four distinct atomic inputs are referenced (bar is shared), plus two
    # factors: bar appearing once means we get 5 nodes, not 6.
    self.assertEqual(len(order), 5)

    position = {term: idx for idx, term in enumerate(order)}

    # Each factor must be computed after all of its own inputs.
    self.assertLess(position[SomeDataSet.foo], position[f1])
    self.assertLess(position[SomeDataSet.bar], position[f1])
    self.assertLess(position[SomeDataSet.bar], position[f2])
    self.assertLess(position[SomeDataSet.buzz], position[f2])
def factor_matrix(self, terms, start_date, end_date):
    """
    Compute a factor matrix.

    Parameters
    ----------
    terms : dict[str -> zipline.modelling.term.Term]
        Dict mapping term names to instances.  The supplied names are
        used as column names in our output frame.
    start_date : pd.Timestamp
        Start date of the computed matrix.
    end_date : pd.Timestamp
        End date of the computed matrix.

    The algorithm implemented here can be broken down into the following
    stages:

    0. Build a dependency graph of all terms in `terms`.  Topologically
       sort the graph to determine an order in which we can compute the terms.

    1. Ask our AssetFinder for a "lifetimes matrix", which should contain,
       for each date between start_date and end_date, a boolean value for
       each known asset indicating whether the asset existed on that date.

    2. Compute each term in the dependency order determined in (0), caching
       the results in a dictionary so that they can be fed into future terms.

    3. For each date, determine the number of assets passing **all**
       filters.  The sum, N, of all these values is the total number of rows
       in our output frame, so we pre-allocate an output array of length N
       for each factor in `terms`.

    4. Fill in the arrays allocated in (3) by copying computed values from
       our output cache into the corresponding rows.

    5. Stick the values computed in (4) into a DataFrame and return it.

    Step 0 is performed by `zipline.modelling.graph.TermGraph`.
    Step 1 is performed in `self.build_lifetimes_matrix`.
    Step 2 is performed in `self.compute_chunk`.
    Steps 3, 4, and 5 are performed in self._format_factor_matrix.

    See Also
    --------
    FFCEngine.factor_matrix
    """
    if end_date <= start_date:
        raise ValueError("start_date must be before end_date \n"
                         "start_date=%s, end_date=%s"
                         % (start_date, end_date))

    graph = TermGraph(terms)
    max_extra_rows = graph.max_extra_rows

    # Extend the lifetimes window backwards so terms with lookback windows
    # have enough history to produce output on start_date.
    lifetimes = self.build_lifetimes_matrix(
        start_date,
        end_date,
        max_extra_rows,
    )
    raw_outputs = self.compute_chunk(graph, lifetimes, {})

    # Trim off the extra leading rows: only [start_date, end_date] rows
    # appear in the final output.
    lifetimes_between_dates = lifetimes[max_extra_rows:]
    dates = lifetimes_between_dates.index.values
    assets = lifetimes_between_dates.columns.values

    # We only need filters and factors to compute the final output matrix.
    filters, factors = {}, {}
    for name, term in iteritems(terms):
        if isinstance(term, Filter):
            filters[name] = raw_outputs[name]
        elif isinstance(term, Factor):
            factors[name] = raw_outputs[name]
        elif isinstance(term, Classifier):
            # Classifiers don't contribute columns to the output frame.
            continue
        else:
            raise ValueError("Unknown term type: %s" % term)

    # Treat base_mask as an implicit filter.
    # TODO: Is there a clean way to make this actually just be a filter?
    filters['base'] = lifetimes_between_dates.values

    return self._format_factor_matrix(dates, assets, filters, factors)