示例#1
0
def test_search_with_undefined_labels(transactions, total_spent_fn):
    """Search with per-label example counts that do not name every label.

    The labeling function reduces the total spent modulo 3, while the
    requested example counts only mention the labels 1 and 2.
    """

    # NOTE(review): the expected CSV header contains 'total_spent';
    # presumably the column is named after this function, so keep its name.
    def total_spent(ds):
        return total_spent_fn(ds) % 3

    examples_per_label = {1: 1, 2: 1}

    expected = [
        'customer_id,time,total_spent',
        '0,2019-01-01 08:00:00,2',
        '0,2019-01-01 08:30:00,1',
        '1,2019-01-01 09:30:00,2',
        '1,2019-01-01 10:00:00,1',
        '2,2019-01-01 10:30:00,1',
        '2,2019-01-01 11:30:00,2',
        '3,2019-01-01 12:30:00,1',
    ]

    label_maker = LabelMaker(
        target_entity='customer_id',
        time_index='time',
        labeling_function=total_spent,
    )
    label_times = label_maker.search(
        transactions,
        num_examples_per_instance=examples_per_label,
        gap=1,
    )
    actual = to_csv(label_times, index=False)

    assert actual == expected
示例#2
0
def test_label_type(transactions, total_spent_fn):
    """Check the reported target type before and after binning.

    The 'total_spent' target is expected to be 'continuous' straight out of
    the search and 'discrete' once the label times are binned into 2 bins.
    """
    label_maker = LabelMaker(
        target_entity='customer_id',
        time_index='time',
        labeling_function=total_spent_fn,
    )
    label_times = label_maker.search(transactions, num_examples_per_instance=1)

    raw_type = label_times.target_types['total_spent']
    assert raw_type == 'continuous'

    binned_type = label_times.bin(2).target_types['total_spent']
    assert binned_type == 'discrete'
示例#3
0
def test_search_offset_mix_2(transactions, total_spent_fn):
    """
    Search with a '30min' window size and a minimum_data of 2.

    No gap is passed to the search.
    NOTE(review): presumably the gap then falls back to the window size - confirm.
    """
    expected = [
        'customer_id,cutoff_time,total_spent',
        '1,2019-01-01 10:00:00,1',
        '2,2019-01-01 11:30:00,1',
        '2,2019-01-01 12:00:00,1',
    ]

    maker_options = dict(
        target_entity='customer_id',
        time_index='time',
        labeling_function=total_spent_fn,
        window_size='30min',
    )
    label_maker = LabelMaker(**maker_options)

    label_times = label_maker.search(
        transactions,
        num_examples_per_instance=2,
        minimum_data=2,
        verbose=False,
    )
    actual = to_csv(label_times, index=False)

    assert actual == expected
示例#4
0
def test_search_offset_mix_7(transactions, total_spent_fn):
    """
    Search with an integer window size of 10 and an unbounded example count.

    Neither minimum_data nor gap is passed to the search.
    NOTE(review): the defaults used for them are not visible here - confirm.
    """
    expected = [
        'customer_id,cutoff_time,total_spent',
        '0,2019-01-01 08:00:00,2',
        '1,2019-01-01 09:00:00,3',
        '2,2019-01-01 10:30:00,4',
        '3,2019-01-01 12:30:00,1',
    ]

    maker_options = dict(
        target_entity='customer_id',
        time_index='time',
        labeling_function=total_spent_fn,
        window_size=10,
    )
    label_maker = LabelMaker(**maker_options)

    unlimited = float('inf')
    label_times = label_maker.search(
        transactions,
        num_examples_per_instance=unlimited,
        verbose=False,
    )
    actual = to_csv(label_times, index=False)

    assert actual == expected
示例#5
0
def test_search_offset_mix_3(transactions, total_spent_fn):
    """
    Search with an '8h' window size, a timestamp minimum_data, and a gap of 1.

    The example count is -1.
    NOTE(review): presumably -1 requests every available example - confirm.
    """
    expected = [
        'customer_id,time,total_spent',
        '0,2019-01-01 08:00:00,2',
        '0,2019-01-01 08:30:00,1',
        '1,2019-01-01 09:00:00,3',
        '1,2019-01-01 09:30:00,2',
        '1,2019-01-01 10:00:00,1',
        '2,2019-01-01 10:30:00,4',
        '2,2019-01-01 11:00:00,3',
        '2,2019-01-01 11:30:00,2',
        '2,2019-01-01 12:00:00,1',
        '3,2019-01-01 12:30:00,1',
    ]

    maker_options = dict(
        target_entity='customer_id',
        time_index='time',
        labeling_function=total_spent_fn,
        window_size='8h',
    )
    label_maker = LabelMaker(**maker_options)

    label_times = label_maker.search(
        transactions,
        num_examples_per_instance=-1,
        minimum_data='2019-01-01 08:00:00',
        gap=1,
    )
    actual = to_csv(label_times, index=False)

    assert actual == expected
示例#6
0
def test_column_based_windows(transactions, total_spent_fn):
    """Search using a column name ('session_id') as the window size.

    A session_id column is attached to the transactions first, and the
    resulting label times are compared against the expected CSV rows.
    """
    expected = [
        'customer_id,time,total_spent',
        '0,2019-01-01 08:00:00,1',
        '0,2019-01-01 08:30:00,1',
        '1,2019-01-01 09:00:00,2',
        '1,2019-01-01 10:00:00,1',
        '2,2019-01-01 10:30:00,3',
        '2,2019-01-01 12:00:00,1',
        '3,2019-01-01 12:30:00,1',
    ]

    # One session id per transaction row.
    sessions = [1, 2, 3, 3, 4, 5, 5, 5, 6, 7]
    with_sessions = transactions.assign(session_id=sessions)

    label_maker = LabelMaker(
        target_entity='customer_id',
        time_index='time',
        window_size='session_id',
        labeling_function=total_spent_fn,
    )

    label_times = label_maker.search(with_sessions, -1)
    actual = label_times.pipe(to_csv, index=False)

    assert actual == expected
示例#7
0
def test_label_type(transactions, total_spent_fn):
    """Check that a label type passed to search is reported by the result."""
    label_maker = LabelMaker(
        target_entity='customer_id',
        time_index='time',
        labeling_function=total_spent_fn,
    )

    search_options = dict(
        num_examples_per_instance=1,
        label_type='discrete',
        verbose=False,
    )
    label_times = label_maker.search(transactions, **search_options)

    assert label_times.label_type == 'discrete'
示例#8
0
def test_data_slice_overlap(transactions, total_spent_fn):
    """Check that no row of a data slice sits exactly on its slice_stop."""
    maker_options = dict(
        target_entity='customer_id',
        time_index='time',
        labeling_function=total_spent_fn,
        window_size='1h',
    )
    label_maker = LabelMaker(**maker_options)

    slices = label_maker.slice(transactions, num_examples_per_instance=2)
    for data_slice in slices:
        # Element-wise comparison of the slice index against the stop value.
        on_stop = data_slice.index == data_slice.context.slice_stop
        assert not on_stop.any()
示例#9
0
def test_slice_overlap(transactions, total_spent_fn):
    """Check that no row of a slice sits exactly on the end of its window."""
    maker_options = dict(
        target_entity='customer_id',
        time_index='time',
        labeling_function=total_spent_fn,
        window_size='1h',
    )
    label_maker = LabelMaker(**maker_options)

    slices = label_maker.slice(transactions, num_examples_per_instance=2)
    for data_slice in slices:
        # The window unpacks into exactly two values; only the end is needed.
        _, window_end = data_slice.context.window
        touches_end = data_slice.index == window_end
        assert not touches_end.any()
示例#10
0
def test_search_invalid_n_examples(transactions, total_spent_fn):
    """Expect a 'must specify gap' assertion from both slice and search.

    The label maker is built without a window size and two examples per
    instance are requested without passing a gap.
    """
    label_maker = LabelMaker(
        target_entity='customer_id',
        time_index='time',
        labeling_function=total_spent_fn,
    )
    expected_message = 'must specify gap'

    # slice() is consumed with next() so that the first slice is actually requested.
    with pytest.raises(AssertionError, match=expected_message):
        slices = label_maker.slice(transactions, num_examples_per_instance=2)
        next(slices)

    with pytest.raises(AssertionError, match=expected_message):
        label_maker.search(transactions, num_examples_per_instance=2)
示例#11
0
def test_slice_context(transactions, total_spent_fn):
    """Check the context attached to every generated data slice.

    Each context must name 'customer_id' as target entity, carry integer
    target instance and slice number values, and have equal window and gap.
    """
    maker_options = dict(
        target_entity='customer_id',
        time_index='time',
        labeling_function=total_spent_fn,
        window_size='1h',
    )
    label_maker = LabelMaker(**maker_options)

    slices = label_maker.slice(
        transactions,
        num_examples_per_instance=-1,
        verbose=True,
    )
    for data_slice in slices:
        assert data_slice.context.target_entity == 'customer_id'
        assert isinstance(data_slice.context.target_instance, int)
        assert isinstance(data_slice.context.slice_number, int)
        # No gap is passed to slice(); the test expects it to equal the window.
        assert data_slice.context.window == data_slice.context.gap
示例#12
0
def test_search_default(transactions, total_spent_fn):
    """Search for one example per instance without a window size or gap."""
    expected = [
        'customer_id,time,total_spent',
        '0,2019-01-01 08:00:00,2',
        '1,2019-01-01 09:00:00,3',
        '2,2019-01-01 10:30:00,4',
        '3,2019-01-01 12:30:00,1',
    ]

    label_maker = LabelMaker(
        target_entity='customer_id',
        time_index='time',
        labeling_function=total_spent_fn,
    )
    label_times = label_maker.search(transactions, num_examples_per_instance=1)
    actual = to_csv(label_times, index=False)

    assert actual == expected
示例#13
0
def test_invalid_threshold(transactions, total_spent_fn):
    """Expect a ValueError matching 'invalid threshold' for a blank minimum_data."""
    maker_options = dict(
        target_entity='customer_id',
        time_index='time',
        labeling_function=lambda: None,
        window_size=2,
    )
    label_maker = LabelMaker(**maker_options)

    search_options = dict(num_examples_per_instance=2, minimum_data=' ')

    with pytest.raises(ValueError, match='invalid threshold'):
        label_maker.search(transactions, **search_options)
示例#14
0
def test_search_on_empty_labels(transactions):
    """Check that the search result is empty when every label is None."""
    maker_options = dict(
        target_entity='customer_id',
        time_index='time',
        labeling_function=lambda ds: None,
        window_size=2,
    )
    label_maker = LabelMaker(**maker_options)

    search_options = dict(
        minimum_data=1,
        num_examples_per_instance=2,
        gap=1,
    )
    label_times = label_maker.search(transactions, **search_options)

    assert label_times.empty
示例#15
0
def test_search_offset_negative_1(transactions, total_spent_fn):
    """Expect an 'offset must be positive' assertion for '-1h' offsets.

    Both minimum_data and gap are passed as the negative string '-1h'.
    """
    maker_options = dict(
        target_entity='customer_id',
        time_index='time',
        labeling_function=lambda: None,
        window_size=2,
    )
    label_maker = LabelMaker(**maker_options)

    negative_offset = '-1h'
    search_options = dict(
        num_examples_per_instance=2,
        minimum_data=negative_offset,
        gap=negative_offset,
    )

    with pytest.raises(AssertionError, match='offset must be positive'):
        label_maker.search(transactions, **search_options)
示例#16
0
def test_search_offset_negative_0(transactions, total_spent_fn):
    """Expect a 'must be greater than zero' assertion for -1 offsets.

    Both minimum_data and gap are passed as the negative integer -1.
    """
    maker_options = dict(
        target_entity='customer_id',
        time_index='time',
        labeling_function=lambda: None,
        window_size=2,
    )
    label_maker = LabelMaker(**maker_options)

    negative_count = -1
    search_options = dict(
        num_examples_per_instance=2,
        minimum_data=negative_count,
        gap=negative_count,
        verbose=False,
    )

    with pytest.raises(AssertionError, match='must be greater than zero'):
        label_maker.search(transactions, **search_options)
示例#17
0
def test_search_on_empty_data_slices(transactions, total_spent_fn):
    """Check that the search result is empty when every time value is NaT."""
    maker_options = dict(
        target_entity='customer_id',
        time_index='time',
        labeling_function=lambda df: None,
        window_size=2,
    )
    label_maker = LabelMaker(**maker_options)

    # Overwrite the whole time column with missing timestamps.
    without_times = transactions.assign(time=pd.NaT)

    search_options = dict(
        minimum_data=1,
        num_examples_per_instance=2,
        gap=3,
    )
    label_times = label_maker.search(without_times, **search_options)

    assert label_times.empty
示例#18
0
def test_invalid_offset(transactions, total_spent_fn):
    """Expect an 'invalid offset' assertion when window_size is an empty dict."""
    maker_options = dict(
        target_entity='customer_id',
        time_index='time',
        labeling_function=lambda: None,
        window_size={},
    )

    # The failure is expected while the label maker is being constructed.
    with pytest.raises(AssertionError, match='invalid offset'):
        LabelMaker(**maker_options)
示例#19
0
def test_invalid_offset_alias(transactions, total_spent_fn):
    """Expect an 'offset must be a valid string' assertion for a bogus alias."""
    maker_options = dict(
        target_entity='customer_id',
        time_index='time',
        labeling_function=lambda: None,
        window_size='not an offset alias',
    )

    # The failure is expected while the label maker is being constructed.
    with pytest.raises(AssertionError, match='offset must be a valid string'):
        LabelMaker(**maker_options)
示例#20
0
def test_search_with_multiple_targets(
        transactions,
        total_spent_fn,
        unique_amounts_fn,
):
    """Search with two labeling functions, then select a single target.

    First the full result (both target columns) is compared against the
    expected CSV rows, then the result of selecting only 'unique_amounts'.
    """
    labeling_functions = {
        'total_spent': total_spent_fn,
        'unique_amounts': unique_amounts_fn,
    }
    label_maker = LabelMaker(
        target_entity='customer_id',
        time_index='time',
        window_size=2,
        labeling_function=labeling_functions,
    )

    expected_all = [
        'customer_id,time,total_spent,unique_amounts',
        '0,2019-01-01 08:00:00,2,1',
        '1,2019-01-01 09:00:00,2,1',
        '1,2019-01-01 10:00:00,1,1',
        '2,2019-01-01 10:30:00,2,1',
        '2,2019-01-01 11:30:00,2,1',
        '3,2019-01-01 12:30:00,1,1',
    ]

    label_times = label_maker.search(transactions, num_examples_per_instance=-1)
    actual_all = label_times.pipe(to_csv, index=False)
    assert actual_all == expected_all, 'unexpected calculated values'

    expected_selected = [
        'customer_id,time,unique_amounts',
        '0,2019-01-01 08:00:00,1',
        '1,2019-01-01 09:00:00,1',
        '1,2019-01-01 10:00:00,1',
        '2,2019-01-01 10:30:00,1',
        '2,2019-01-01 11:30:00,1',
        '3,2019-01-01 12:30:00,1',
    ]

    selected = label_times.select('unique_amounts')
    actual_selected = selected.pipe(to_csv, index=False)
    assert actual_selected == expected_selected, 'selected values differ from calculated values'
示例#21
0
def labels():
    """Build thresholded label times from the mock customer transactions.

    Loads the single-table mock customer data (random_seed=0), keeps the
    transaction time, customer id and amount columns, sorts the rows by
    transaction time and searches for labels per customer with a '1h'
    window.

    Returns:
        The label times produced by the search after applying
        ``threshold(1250)``.
    """
    df = ft.demo.load_mock_customer(return_single_table=True, random_seed=0)

    # Sort into a new frame rather than in place: sorting a column subset of
    # another frame in place can trigger pandas' SettingWithCopyWarning.
    columns = ['transaction_time', 'customer_id', 'amount']
    df = df[columns].sort_values('transaction_time')

    lm = LabelMaker(
        target_entity='customer_id',
        time_index='transaction_time',
        labeling_function=total_spent,
        window_size='1h',
    )

    lt = lm.search(
        df,
        minimum_data='10min',
        num_examples_per_instance=2,
        gap='30min',
        drop_empty=True,
        verbose=False,
    )

    return lt.threshold(1250)
示例#22
0
def test_search_with_invalid_index(transactions, total_spent_fn):
    """Expect assertion errors for unsorted and for null time values.

    A randomly sampled frame must be rejected with the chronological-order
    message, and a frame whose time column is all NaT with the null-values
    message.
    """
    maker_options = dict(
        target_entity='customer_id',
        time_index='time',
        labeling_function=lambda df: None,
        window_size=2,
    )
    label_maker = LabelMaker(**maker_options)

    # Case 1: rows drawn in random order (fixed seed).
    shuffled = transactions.sample(n=10, random_state=0)
    unsorted_message = "data frame must be sorted chronologically"
    with pytest.raises(AssertionError, match=unsorted_message):
        label_maker.search(shuffled, num_examples_per_instance=2)

    # Case 2: every time value replaced by a missing timestamp.
    null_times = transactions.assign(time=pd.NaT)
    null_message = "index contains null values"
    with pytest.raises(AssertionError, match=null_message):
        label_maker.search(null_times, num_examples_per_instance=2)