def intersection(df, pairwise=False, **subset_args):
    """
    Counts the size of intersections of subsets of predicted examples.
    E.g. count the overlap between the top k of two different models

    Args:
        df: the result of to_dataframe(), Predict steps of length n_steps
        pairwise: when False, returns the size of the mutual intersection
            between all subsets. Otherwise returns an n_steps x n_steps
            matrix whose i,j entry (for j >= i) is the number of examples
            in the intersection between the i and j step subsets. The
            diagonal holds each subset's own size; entries below the
            diagonal are not filled in and remain NaN.
        **subset_args: arguments to be passed to model.y_subset()
            for each predict step

    Returns: the intersection, either an integer, if pairwise is False,
        or a DataFrame, otherwise.
    """
    # Build a real list (not a lazy map object) because the pairwise branch
    # indexes into it by position; map() is only subscriptable on Python 2.
    indexes = [
        set(model.y_subset(row.step.get_result()['y'], **subset_args).index)
        for _, row in df.iterrows()
    ]

    if not pairwise:
        return len(util.intersect(indexes))

    n_steps = len(indexes)
    r = pd.DataFrame(index=df.index, columns=range(n_steps))
    for i in range(n_steps):
        # Assign through .iat rather than r.values[...]: .values may be a
        # copy (or read-only), in which case writes never reach the frame.
        r.iat[i, i] = len(indexes[i])
        for j in range(i + 1, n_steps):
            r.iat[i, j] = len(indexes[i] & indexes[j])
    return r
def test_subset_k():
    """y_subset(y, k=2) should keep exactly the rows labelled 1 and 3."""
    selected = y_subset(y, k=2)
    assert set(selected.index) == {1, 3}
def test_subset_dropna():
    """y_subset(y, dropna=True) should keep the rows labelled 0, 1, 3 and 4."""
    selected = y_subset(y, dropna=True)
    assert set(selected.index) == {0, 1, 3, 4}
def test_subset_query():
    """y_subset(y, query="attr") should keep only the row labelled 1."""
    selected = y_subset(y, query="attr")
    assert set(selected.index) == {1}
def apply_y(df, fn, **kwargs):
    """
    Call apply() on df with a per-step function that evaluates fn on the
    step's y subset.

    Args:
        df: passed through unchanged as the first argument of apply()
        fn: callable invoked with the output of model.y_subset()
        **kwargs: arguments to be passed to model.y_subset()

    Returns: whatever apply() returns.
    """
    def _fn_on_subset(step):
        # Pull the step's 'y' result, subset it, then hand it to fn.
        y = step.get_result()['y']
        return fn(model.y_subset(y, **kwargs))

    return apply(df, _fn_on_subset)
# NOTE(review): a function named test_subset_query is also defined earlier in
# this file; this later definition replaces it when the module is loaded.
def test_subset_query():
    """y_subset(y, query='attr') should keep only the row labelled 1."""
    matched = set(y_subset(y, query='attr').index)
    assert matched == {1}