예제 #1
0
def test_not_aggregated(sort_by, sort_sets_by):
    """Compare processing of aggregated data against all-ones raw data.

    ``_process_data`` is run on ``generate_data(aggregated=True)`` and on
    ``generate_data()`` with every value overwritten by 1. The resulting
    intersections and totals must be equal, ignoring dtype.
    """
    # FIXME: this is not testing if aggregation used is count or sum
    options = dict(sort_by=sort_by, sort_sets_by=sort_sets_by)

    aggregated = generate_data(aggregated=True)
    agg_intersections, agg_totals = _process_data(aggregated, **options)

    raw = generate_data()
    raw.loc[:] = 1  # every row now contributes exactly one unit
    raw_intersections, raw_totals = _process_data(raw, **options)

    assert_series_equal(agg_intersections, raw_intersections,
                        check_dtype=False)
    assert_series_equal(agg_totals, raw_totals, check_dtype=False)
예제 #2
0
def test_not_aggregated(sort_by, sort_sets_by):
    """Compare processing of aggregated data against all-ones raw data.

    ``_process_data`` is run with ``sum_over=None`` on
    ``generate_data(aggregated=True)`` and on ``generate_data()`` with every
    value overwritten by 1. Intersections and totals must match (ignoring
    dtype), and both returned frames must expose exactly the ``_value`` and
    ``_bin`` columns.
    """
    # FIXME: this is not testing if aggregation used is count or sum
    options = dict(sort_by=sort_by, sort_sets_by=sort_sets_by, sum_over=None)

    aggregated = generate_data(aggregated=True)
    agg_df, agg_intersections, agg_totals = _process_data(aggregated,
                                                          **options)

    raw = generate_data()
    raw.loc[:] = 1  # every row now contributes exactly one unit
    raw_df, raw_intersections, raw_totals = _process_data(raw, **options)

    assert_series_equal(agg_intersections, raw_intersections,
                        check_dtype=False)
    assert_series_equal(agg_totals, raw_totals, check_dtype=False)

    expected_columns = {'_value', '_bin'}
    assert set(agg_df.columns) == expected_columns
    assert set(raw_df.columns) == expected_columns

    # One output row per input row, and one distinct bin per intersection
    assert len(raw_df) == len(raw)
    assert raw_df['_bin'].nunique() == len(raw_intersections)
예제 #3
0
def test_process_data_series(x, sort_by, sort_categories_by):
    """Exercise ``_process_data`` on the named input ``x``.

    Verifies that non-None ``sum_over`` values are rejected, that the
    reported total equals ``x.sum()``, that intersections carry the same
    values as ``x`` (after aligning index levels), that the requested sort
    orders hold, and that the returned frame mirrors ``x`` row for row.
    """
    assert x.name == 'value'

    sort_kw = dict(sort_by=sort_by, sort_categories_by=sort_categories_by)

    # Every non-None sum_over tried here must be rejected, whatever the
    # subset_size mode.
    rejected = [(size, over)
                for size in ['auto', 'sum', 'count']
                for over in ['abc', False]]
    for size, over in rejected:
        with pytest.raises(ValueError, match='sum_over is not applicable'):
            _process_data(x, subset_size=size, sum_over=over, **sort_kw)

    total, df, intersections, totals = _process_data(
        x, subset_size='auto', sum_over=None, **sort_kw)

    assert total == x.sum()
    assert intersections.name == 'value'

    # Align x with the (possibly reordered) intersections index
    x_reordered = x.reorder_levels(intersections.index.names)
    x_reordered = x_reordered.reindex(index=intersections.index)
    assert len(x) == len(x_reordered)
    assert x_reordered.index.is_unique
    assert_series_equal(x_reordered, intersections, check_dtype=False)

    by_cardinality = sort_by == 'cardinality'
    if by_cardinality:
        # reversed values ascending <=> values non-increasing
        assert is_ascending(intersections.values[::-1])
    else:
        # check degree order
        degrees = intersections.index.to_frame().sum(axis=1)
        assert is_ascending(degrees)
        # TODO: within a same-degree group, the tuple of active names should
        #       be in sort-order
    if sort_categories_by:
        assert is_ascending(totals.values[::-1])

    level_names = intersections.index.names
    assert np.all(totals.index.values == level_names)
    assert np.all(df.index.names == level_names)

    assert set(df.columns) == {'_value', '_bin'}
    values = df['_value']
    assert_index_equal(values.reorder_levels(x.index.names).index, x.index)
    assert_array_equal(values, x)
    # Each row's _bin must point at the intersection sharing its index entry
    assert_index_equal(intersections.iloc[df['_bin']].index, df.index)
    assert len(df) == len(x)
예제 #4
0
def test_not_unique(sort_by, sort_categories_by):
    """Compare ``subset_size='sum'`` on counts against all-ones samples.

    ``_process_data`` is run on ``generate_counts()`` and on the ``'value'``
    column of ``generate_samples()`` with every entry overwritten by 1.
    Intersections and totals must match (ignoring dtype), and both returned
    frames must expose exactly the ``_value`` and ``_bin`` columns.
    """
    options = dict(sort_by=sort_by,
                   sort_categories_by=sort_categories_by,
                   subset_size='sum',
                   sum_over=None)

    counts = generate_counts()
    counts_df, counts_intersections, counts_totals = _process_data(
        counts, **options)

    samples = generate_samples()['value']
    samples.loc[:] = 1  # every sample now contributes exactly one unit
    samples_df, samples_intersections, samples_totals = _process_data(
        samples, **options)

    assert_series_equal(counts_intersections, samples_intersections,
                        check_dtype=False)
    assert_series_equal(counts_totals, samples_totals, check_dtype=False)

    expected_columns = {'_value', '_bin'}
    assert set(counts_df.columns) == expected_columns
    assert set(samples_df.columns) == expected_columns

    # One output row per input row, and one distinct bin per intersection
    assert len(samples_df) == len(samples)
    assert samples_df['_bin'].nunique() == len(samples_intersections)
예제 #5
0
def test_subset_size_series(x):
    """Check the ``subset_size`` modes of ``_process_data`` on ``x``.

    * ``'sum'``: the total equals the sum of the intersections.
    * ``'auto'``: matches ``'sum'`` when ``x.index`` is unique, otherwise
      raises ``ValueError``.
    * ``'count'``: matches ``'sum'`` applied to per-index-entry counts of
      ``x``.
    """
    common = dict(sort_by='cardinality',
                  sort_categories_by='cardinality',
                  sum_over=None)

    total, df_sum, intersections_sum, totals_sum = _process_data(
        x, subset_size='sum', **common)
    assert total == intersections_sum.sum()

    if not x.index.is_unique:
        with pytest.raises(ValueError):
            _process_data(x, subset_size='auto', **common)
    else:
        total, df, intersections, totals = _process_data(
            x, subset_size='auto', **common)
        assert total == intersections.sum()
        assert_frame_equal(df, df_sum)
        assert_series_equal(intersections, intersections_sum)
        assert_series_equal(totals, totals_sum)

    total, df_count, intersections_count, totals_count = _process_data(
        x, subset_size='count', **common)
    assert total == intersections_count.sum()

    # Counting rows per index entry and then summing must equal 'count'
    all_levels = list(range(len(x.index.levels)))
    counted = x.groupby(level=all_levels).count()
    total, df, intersections, totals = _process_data(
        counted, subset_size='sum', **common)
    assert total == intersections.sum()
    assert_series_equal(intersections, intersections_count, check_names=False)
    assert_series_equal(totals, totals_count)
예제 #6
0
def test_process_data_series(X, sort_by, sort_sets_by):
    """Exercise ``_process_data`` on the input ``X``.

    Verifies that ``sum_over=False`` is rejected, that intersections carry
    the same values as ``X`` (after aligning index levels), that the
    requested sort orders hold, and that the returned frame mirrors ``X``
    row for row.
    """
    sort_kw = dict(sort_by=sort_by, sort_sets_by=sort_sets_by)

    with pytest.raises(ValueError, match='sum_over is not applicable'):
        _process_data(X, sum_over=False, **sort_kw)

    df, intersections, totals = _process_data(X, sum_over=None, **sort_kw)
    assert intersections.name == 'value'

    # Align X with the (possibly reordered) intersections index
    X_reordered = X.reorder_levels(intersections.index.names)
    X_reordered = X_reordered.reindex(index=intersections.index)
    assert len(X) == len(X_reordered)
    assert X_reordered.index.is_unique
    assert_series_equal(X_reordered, intersections, check_dtype=False)

    by_cardinality = sort_by == 'cardinality'
    if by_cardinality:
        # reversed values ascending <=> values non-increasing
        assert is_ascending(intersections.values[::-1])
    else:
        # check degree order
        degrees = intersections.index.to_frame().sum(axis=1)
        assert is_ascending(degrees)
        # TODO: within a same-degree group, the tuple of active names should
        #       be in sort-order
    if sort_sets_by:
        assert is_ascending(totals.values[::-1])

    level_names = intersections.index.names
    assert np.all(totals.index.values == level_names)
    assert np.all(df.index.names == level_names)

    assert set(df.columns) == {'_value', '_bin'}
    values = df['_value']
    assert_index_equal(values.reorder_levels(X.index.names).index, X.index)
    assert_array_equal(values, X)
    # Each row's _bin must point at the intersection sharing its index entry
    assert_index_equal(intersections.iloc[df['_bin']].index, df.index)
    assert len(df) == len(X)
예제 #7
0
def test_process_data_frame(x, sort_by, sort_categories_by):
    """Check ``_process_data`` on DataFrames built around the input ``x``.

    Covers four scenarios:

    * a single-column frame summed over that column is equivalent to
      processing ``x`` directly with ``subset_size='sum'``;
    * an extra column does not affect intersections or totals and is carried
      through to the returned frame;
    * results do not depend on column order or on the summed column's name;
    * summing over a column of ones equals ``subset_size='count'``.
    """
    import warnings

    sort_kw = {'sort_by': sort_by, 'sort_categories_by': sort_categories_by}
    X = pd.DataFrame({'a': x})

    # Record (and ignore) any warnings raised by this call. This replaces
    # ``pytest.warns(None)``, which was deprecated in pytest 7 and removed in
    # pytest 8; 'always' mirrors the filter the pytest recorder installed.
    with warnings.catch_warnings(record=True):
        warnings.simplefilter('always')
        total, df, intersections, totals = _process_data(
            X, sum_over='a', subset_size='auto', **sort_kw)
    assert df is not X
    assert total == intersections.sum()

    # check equivalence to Series
    total1, df1, intersections1, totals1 = _process_data(
        x, subset_size='sum', sum_over=None, **sort_kw)

    assert intersections.name == 'a'
    assert_frame_equal(df, df1.rename(columns={'_value': 'a'}))
    assert_series_equal(intersections, intersections1, check_names=False)
    assert_series_equal(totals, totals1)

    # check effect of extra column
    X = pd.DataFrame({'a': x, 'b': np.arange(len(x))})
    total2, df2, intersections2, totals2 = _process_data(
        X, sum_over='a', subset_size='auto', **sort_kw)
    assert total2 == intersections2.sum()
    assert_series_equal(intersections, intersections2)
    assert_series_equal(totals, totals2)
    assert_frame_equal(df, df2.drop('b', axis=1))
    assert_array_equal(df2['b'], X['b'])  # disregard levels, tested above

    # check effect not dependent on order/name
    X = pd.DataFrame({'b': np.arange(len(x)), 'c': x})
    total3, df3, intersections3, totals3 = _process_data(
        X, sum_over='c', subset_size='auto', **sort_kw)
    assert total3 == intersections3.sum()
    assert_series_equal(intersections, intersections3, check_names=False)
    assert intersections.name == 'a'
    assert intersections3.name == 'c'
    assert_series_equal(totals, totals3)
    assert_frame_equal(df.rename(columns={'a': 'c'}), df3.drop('b', axis=1))
    assert_array_equal(df3['b'], X['b'])

    # check subset_size='count': summing a column of ones must give the
    # same sizes as counting rows
    X = pd.DataFrame({'b': np.ones(len(x), dtype=int), 'c': x})
    total4, df4, intersections4, totals4 = _process_data(
        X, sum_over='b', subset_size='auto', **sort_kw)
    total5, df5, intersections5, totals5 = _process_data(
        X, subset_size='count', sum_over=None, **sort_kw)
    assert total5 == intersections5.sum()
    assert_series_equal(intersections4, intersections5, check_names=False)
    assert intersections4.name == 'b'
    assert intersections5.name == 'size'
    assert_series_equal(totals4, totals5)
    assert_frame_equal(df4, df5)
예제 #8
0
def test_process_data(X, sort_by, sort_sets_by):
    """Check values and ordering of ``_process_data`` output for ``X``.

    Intersections must carry the same values as ``X`` once its index levels
    are aligned, the requested sort orders must hold, and the totals must be
    indexed by the intersections' level names.
    """
    intersections, totals = _process_data(
        X, sort_by=sort_by, sort_sets_by=sort_sets_by)

    # Align X with the (possibly reordered) intersections index
    X_reordered = X.reorder_levels(intersections.index.names)
    X_reordered = X_reordered.reindex(index=intersections.index)
    assert len(X) == len(X_reordered)
    assert X_reordered.index.is_unique
    assert_series_equal(X_reordered, intersections, check_dtype=False)

    by_cardinality = sort_by == 'cardinality'
    if by_cardinality:
        # reversed values ascending <=> values non-increasing
        assert is_ascending(intersections.values[::-1])
    else:
        # check degree order
        degrees = intersections.index.to_frame().sum(axis=1)
        assert is_ascending(degrees)
        # TODO: within a same-degree group, the tuple of active names should
        #       be in sort-order
    if sort_sets_by:
        assert is_ascending(totals.values[::-1])

    assert np.all(totals.index.values == intersections.index.names)
예제 #9
0
def test_subset_size_series(x):
    """Check the ``subset_size`` modes of ``_process_data`` on ``x``.

    * ``'legacy'``: matches ``'sum'``; must emit ``FutureWarning`` when
      ``x.index`` is not unique.
    * ``'auto'``: matches ``'sum'`` when ``x.index`` is unique, otherwise
      raises ``ValueError``.
    * ``'count'``: matches ``'sum'`` applied to per-index-entry counts of
      ``x``.
    """
    import warnings

    kw = {
        'sort_by': 'cardinality',
        'sort_categories_by': 'cardinality',
        'sum_over': None
    }
    df_sum, intersections_sum, totals_sum = _process_data(x,
                                                          subset_size='sum',
                                                          **kw)

    if x.index.is_unique:
        # No particular warning is expected here. The original passed None
        # to ``pytest.warns``, which was deprecated in pytest 7 and removed
        # in pytest 8; record and ignore warnings with the stdlib instead.
        with warnings.catch_warnings(record=True):
            warnings.simplefilter('always')
            df, intersections, totals = _process_data(x,
                                                      subset_size='legacy',
                                                      **kw)
    else:
        with pytest.warns(FutureWarning):
            df, intersections, totals = _process_data(x,
                                                      subset_size='legacy',
                                                      **kw)
    assert_frame_equal(df, df_sum)
    assert_series_equal(intersections, intersections_sum)
    assert_series_equal(totals, totals_sum)

    if x.index.is_unique:
        df, intersections, totals = _process_data(x, subset_size='auto', **kw)
        assert_frame_equal(df, df_sum)
        assert_series_equal(intersections, intersections_sum)
        assert_series_equal(totals, totals_sum)
    else:
        with pytest.raises(ValueError):
            _process_data(x, subset_size='auto', **kw)

    df_count, intersections_count, totals_count = _process_data(
        x, subset_size='count', **kw)
    # Counting rows per index entry and then summing must equal 'count'
    df, intersections, totals = _process_data(
        x.groupby(level=list(range(len(x.index.levels)))).count(),
        subset_size='sum',
        **kw)
    assert_series_equal(intersections, intersections_count, check_names=False)
    assert_series_equal(totals, totals_count)
예제 #10
0
def test_process_data_frame(x, sort_by, sort_sets_by):
    """Check ``_process_data`` on DataFrames built around the input ``x``.

    Covers five scenarios:

    * ``sum_over=None`` is rejected for a DataFrame;
    * a single-column frame summed over that column is equivalent to
      processing ``x`` directly;
    * an extra column does not affect intersections or totals and is carried
      through to the returned frame;
    * results do not depend on column order or on the summed column's name;
    * summing over a column of ones equals ``sum_over=False``.
    """
    sort_kw = dict(sort_by=sort_by, sort_sets_by=sort_sets_by)

    frame = pd.DataFrame({'a': x})

    with pytest.raises(ValueError, match='sum_over must be False or '):
        _process_data(frame, sum_over=None, **sort_kw)

    df, intersections, totals = _process_data(frame, sum_over='a', **sort_kw)
    assert df is not frame

    # check equivalence to Series
    series_df, series_intersections, series_totals = _process_data(
        x, sum_over=None, **sort_kw)

    assert intersections.name == 'a'
    assert_frame_equal(df, series_df.rename(columns={'_value': 'a'}))
    assert_series_equal(intersections, series_intersections,
                        check_names=False)
    assert_series_equal(totals, series_totals)

    # check effect of extra column
    frame = pd.DataFrame({'a': x, 'b': np.arange(len(x))})
    extra_df, extra_intersections, extra_totals = _process_data(
        frame, sum_over='a', **sort_kw)
    assert_series_equal(intersections, extra_intersections)
    assert_series_equal(totals, extra_totals)
    assert_frame_equal(df, extra_df.drop('b', axis=1))
    # disregard levels, tested above
    assert_array_equal(extra_df['b'], frame['b'])

    # check effect not dependent on order/name
    frame = pd.DataFrame({'b': np.arange(len(x)), 'c': x})
    renamed_df, renamed_intersections, renamed_totals = _process_data(
        frame, sum_over='c', **sort_kw)
    assert_series_equal(intersections, renamed_intersections,
                        check_names=False)
    assert intersections.name == 'a'
    assert renamed_intersections.name == 'c'
    assert_series_equal(totals, renamed_totals)
    assert_frame_equal(df.rename(columns={'a': 'c'}),
                       renamed_df.drop('b', axis=1))
    assert_array_equal(renamed_df['b'], frame['b'])

    # check sum_over=False: summing a column of ones must give the same
    # sizes as sum_over=False
    frame = pd.DataFrame({'b': np.ones(len(x), dtype=int), 'c': x})
    ones_df, ones_intersections, ones_totals = _process_data(
        frame, sum_over='b', **sort_kw)
    size_df, size_intersections, size_totals = _process_data(
        frame, sum_over=False, **sort_kw)
    assert_series_equal(ones_intersections, size_intersections,
                        check_names=False)
    assert ones_intersections.name == 'b'
    assert size_intersections.name == 'size'
    assert_series_equal(ones_totals, size_totals)
    assert_frame_equal(ones_df, size_df)
예제 #11
0
def test_subset_size_frame(x):
    """Check how ``subset_size`` and ``sum_over`` interact for a DataFrame.

    Builds a one-column frame ``{'x': x}`` and verifies that:

    * ``sum_over=False`` is rejected for ``'auto'``, ``'sum'`` and
      ``'count'``;
    * ``'sum'`` without a field and ``'count'`` with a field are rejected;
    * ``'auto'`` and ``'legacy'`` with a field behave like ``'sum'``;
    * ``'auto'`` with ``sum_over=None`` behaves like ``'count'``;
    * ``'legacy'`` with ``sum_over=False`` behaves like ``'count'`` and emits
      a ``DeprecationWarning``.
    """
    kw = {'sort_by': 'cardinality', 'sort_categories_by': 'cardinality'}
    X = pd.DataFrame({'x': x})
    df_sum, intersections_sum, totals_sum = _process_data(X,
                                                          subset_size='sum',
                                                          sum_over='x',
                                                          **kw)
    df_count, intersections_count, totals_count = _process_data(
        X, subset_size='count', sum_over=None, **kw)

    # error cases: sum_over=False
    # (A second copy of this check used to follow the loop; it reused the
    # leaked loop variable and so only repeated the 'count' case.)
    for subset_size in ['auto', 'sum', 'count']:
        with pytest.raises(ValueError, match='sum_over'):
            _process_data(X, subset_size=subset_size, sum_over=False, **kw)

    # error cases: sum_over incompatible with subset_size
    with pytest.raises(ValueError, match='sum_over should be a field'):
        _process_data(X, subset_size='sum', sum_over=None, **kw)
    with pytest.raises(ValueError, match='sum_over cannot be set'):
        _process_data(X, subset_size='count', sum_over='x', **kw)

    # check subset_size='auto' or 'legacy' with sum_over=str => sum
    for subset_size in ['auto', 'legacy']:
        df, intersections, totals = _process_data(X,
                                                  subset_size=subset_size,
                                                  sum_over='x',
                                                  **kw)
        assert_frame_equal(df, df_sum)
        assert_series_equal(intersections, intersections_sum)
        assert_series_equal(totals, totals_sum)

    # check subset_size='auto' with sum_over=None => count
    df, intersections, totals = _process_data(X,
                                              subset_size='auto',
                                              sum_over=None,
                                              **kw)
    assert_frame_equal(df, df_count)
    assert_series_equal(intersections, intersections_count)
    assert_series_equal(totals, totals_count)

    # check legacy use of sum_over=False
    with pytest.warns(DeprecationWarning, match='sum_over=False'):
        df, intersections, totals = _process_data(X,
                                                  subset_size='legacy',
                                                  sum_over=False,
                                                  **kw)
    assert_frame_equal(df, df_count)
    assert_series_equal(intersections, intersections_count)
    assert_series_equal(totals, totals_count)