Exemplo n.º 1
0
def test_search_offset_mix_7(transactions, total_spent_fn):
    """
    Search with an integer window_size and an unbounded number of examples.

    Only window_size (10) is set explicitly; minimum_data and gap are not
    passed to search in this test. The result, rendered through to_csv
    without the index, must equal the expected rows below.
    """
    expected = [
        'customer_id,cutoff_time,total_spent',
        '0,2019-01-01 08:00:00,2',
        '1,2019-01-01 09:00:00,3',
        '2,2019-01-01 10:30:00,4',
        '3,2019-01-01 12:30:00,1',
    ]

    label_maker = LabelMaker(
        target_entity='customer_id',
        time_index='time',
        labeling_function=total_spent_fn,
        window_size=10,
    )

    # float('inf') requests every available example per customer.
    found = label_maker.search(
        transactions,
        num_examples_per_instance=float('inf'),
        verbose=False,
    )

    assert to_csv(found, index=False) == expected
Exemplo n.º 2
0
def test_search_offset_mix_2(transactions, total_spent_fn):
    """
    Search with window_size given as the string '30min' and minimum_data
    given as the integer 2, with num_examples_per_instance=2.

    No gap is passed to search in this test. The result, rendered through
    to_csv without the index, must equal the expected rows below.
    """
    expected = [
        'customer_id,cutoff_time,total_spent',
        '1,2019-01-01 10:00:00,1',
        '2,2019-01-01 11:30:00,1',
        '2,2019-01-01 12:00:00,1',
    ]

    label_maker = LabelMaker(
        target_entity='customer_id',
        time_index='time',
        labeling_function=total_spent_fn,
        window_size='30min',
    )

    found = label_maker.search(
        transactions,
        num_examples_per_instance=2,
        minimum_data=2,
        verbose=False,
    )

    assert to_csv(found, index=False) == expected
Exemplo n.º 3
0
def test_search_offset_mix_3(transactions, total_spent_fn):
    """
    Search with window_size as the string '8h', minimum_data as a timestamp
    string, and gap as the integer 1, with num_examples_per_instance=-1.

    The result, rendered through to_csv without the index, must equal the
    expected rows below.
    """
    expected = [
        'customer_id,time,total_spent',
        '0,2019-01-01 08:00:00,2',
        '0,2019-01-01 08:30:00,1',
        '1,2019-01-01 09:00:00,3',
        '1,2019-01-01 09:30:00,2',
        '1,2019-01-01 10:00:00,1',
        '2,2019-01-01 10:30:00,4',
        '2,2019-01-01 11:00:00,3',
        '2,2019-01-01 11:30:00,2',
        '2,2019-01-01 12:00:00,1',
        '3,2019-01-01 12:30:00,1',
    ]

    maker_options = {
        'target_entity': 'customer_id',
        'time_index': 'time',
        'labeling_function': total_spent_fn,
        'window_size': '8h',
    }
    search_options = {
        'num_examples_per_instance': -1,
        'minimum_data': '2019-01-01 08:00:00',
        'gap': 1,
    }

    found = LabelMaker(**maker_options).search(transactions, **search_options)

    assert to_csv(found, index=False) == expected
Exemplo n.º 4
0
def test_search_with_undefined_labels(transactions, total_spent_fn):
    """
    Search with a per-label example count given as a dict and gap=1.

    The labeling function wraps total_spent_fn and reduces its result
    modulo 3. The result, rendered through to_csv without the index, must
    equal the expected rows below.
    """
    expected = [
        'customer_id,time,total_spent',
        '0,2019-01-01 08:00:00,2',
        '0,2019-01-01 08:30:00,1',
        '1,2019-01-01 09:30:00,2',
        '1,2019-01-01 10:00:00,1',
        '2,2019-01-01 10:30:00,1',
        '2,2019-01-01 11:30:00,2',
        '3,2019-01-01 12:30:00,1',
    ]

    # NOTE(review): the expected header column is 'total_spent', matching this
    # function's name -- presumably the label column is named after the
    # labeling function, so the name is kept as-is. Confirm before renaming.
    def total_spent(ds):
        return total_spent_fn(ds) % 3

    label_maker = LabelMaker(
        target_entity='customer_id',
        time_index='time',
        labeling_function=total_spent,
    )

    found = label_maker.search(
        transactions,
        num_examples_per_instance={1: 1, 2: 1},
        gap=1,
    )

    assert to_csv(found, index=False) == expected
Exemplo n.º 5
0
def test_sample_frac_int(labels):
    """
    Sample the labels with frac=.25 and random_state=0, sort the sample by
    index, and compare its to_csv rendering (index included) to the
    expected rows.
    """
    expected = [
        'label_id,customer_id,cutoff_time,my_labeling_function',
        '2,2,2014-01-01 00:01:00,True',
    ]

    sampled = labels.sample(frac=.25, random_state=0).sort_index()

    assert to_csv(sampled, index=True) == expected
Exemplo n.º 6
0
def test_distribution_categorical(total_spent):
    """
    Check the `distribution` attribute of total_spent after calling
    bin(2, labels=range(2)), compared through its to_csv rendering.
    """
    expected = [
        'total_spent,count',
        '0,5',
        '1,5',
    ]

    binned = total_spent.bin(2, labels=range(2))

    assert to_csv(binned.distribution) == expected
Exemplo n.º 7
0
def test_count(total_spent):
    """
    Check the `count` attribute of total_spent, compared through its to_csv
    rendering with the index included.
    """
    expected = [
        'customer_id,count',
        '0,2',
        '1,3',
        '2,4',
        '3,1',
    ]

    assert to_csv(total_spent.count, index=True) == expected
Exemplo n.º 8
0
def test_sample_frac_per_label(labels):
    """
    Sample the labels with a per-label fraction mapping (True: 1., False: .5)
    and random_state=0, sort the sample by index, and compare its to_csv
    rendering (index included) to the expected rows.
    """
    expected = [
        'label_id,customer_id,cutoff_time,my_labeling_function',
        '0,1,2014-01-01 00:45:00,True',
        '2,2,2014-01-01 00:01:00,True',
        '3,2,2014-01-01 00:04:00,False',
    ]

    sampled = labels.sample(frac={True: 1., False: .5}, random_state=0)
    ordered = sampled.sort_index()

    assert to_csv(ordered, index=True) == expected
Exemplo n.º 9
0
def test_sample_n_per_label(labels):
    """
    Sample the labels with a per-label count mapping (True: 1, False: 2) and
    random_state=0, sort the sample by index, and compare its to_csv
    rendering (index included) to the expected rows.
    """
    expected = [
        'label_id,customer_id,cutoff_time,my_labeling_function',
        '1,1,2014-01-01 00:48:00,False',
        '2,2,2014-01-01 00:01:00,True',
        '3,2,2014-01-01 00:04:00,False',
    ]

    sampled = labels.sample(n={True: 1, False: 2}, random_state=0)
    ordered = sampled.sort_index()

    assert to_csv(ordered, index=True) == expected
Exemplo n.º 10
0
def test_search_default(transactions, total_spent_fn):
    """
    Search with num_examples_per_instance=1 and no other search options,
    using a LabelMaker built without an explicit window_size.

    The result, rendered through to_csv without the index, must equal the
    expected rows below.
    """
    expected = [
        'customer_id,time,total_spent',
        '0,2019-01-01 08:00:00,2',
        '1,2019-01-01 09:00:00,3',
        '2,2019-01-01 10:30:00,4',
        '3,2019-01-01 12:30:00,1',
    ]

    label_maker = LabelMaker(
        target_entity='customer_id',
        time_index='time',
        labeling_function=total_spent_fn,
    )
    found = label_maker.search(transactions, num_examples_per_instance=1)

    assert to_csv(found, index=False) == expected
Exemplo n.º 11
0
def test_count_by_time_categorical(total_spent):
    """
    Check the `count_by_time` attribute of total_spent after calling
    bin(2, labels=range(2)), compared through its to_csv rendering.
    """
    expected = [
        'time,0,1',
        '2019-01-01 08:00:00,0,1',
        '2019-01-01 08:30:00,0,2',
        '2019-01-01 09:00:00,0,3',
        '2019-01-01 09:30:00,0,4',
        '2019-01-01 10:00:00,0,5',
        '2019-01-01 10:30:00,1,5',
        '2019-01-01 11:00:00,2,5',
        '2019-01-01 11:30:00,3,5',
        '2019-01-01 12:00:00,4,5',
        '2019-01-01 12:30:00,5,5',
    ]

    binned = total_spent.bin(2, labels=range(2))
    rendered = to_csv(binned.count_by_time)

    assert rendered == expected
Exemplo n.º 12
0
def test_count_by_time_continuous(total_spent):
    """
    Check the `count_by_time` attribute of total_spent without binning,
    compared through its to_csv rendering with header and index included.
    """
    expected = [
        'time,total_spent',
        '2019-01-01 08:00:00,1',
        '2019-01-01 08:30:00,2',
        '2019-01-01 09:00:00,3',
        '2019-01-01 09:30:00,4',
        '2019-01-01 10:00:00,5',
        '2019-01-01 10:30:00,6',
        '2019-01-01 11:00:00,7',
        '2019-01-01 11:30:00,8',
        '2019-01-01 12:00:00,9',
        '2019-01-01 12:30:00,10',
    ]

    rendered = to_csv(total_spent.count_by_time, header=True, index=True)

    assert rendered == expected
Exemplo n.º 13
0
def test_count_by_time_categorical(total_spent):
    """
    Check the `count_by_time` attribute of total_spent after calling
    bin(2, labels=range(2)), compared through its to_csv rendering.

    The expected rows here are keyed by 'cutoff_time' and hold float counts.
    """
    expected = [
        'cutoff_time,0,1',
        '2019-01-01 08:00:00,0.0,1.0',
        '2019-01-01 08:30:00,0.0,2.0',
        '2019-01-01 09:00:00,0.0,3.0',
        '2019-01-01 09:30:00,0.0,4.0',
        '2019-01-01 10:00:00,0.0,5.0',
        '2019-01-01 10:30:00,1.0,5.0',
        '2019-01-01 11:00:00,2.0,5.0',
        '2019-01-01 11:30:00,3.0,5.0',
        '2019-01-01 12:00:00,4.0,5.0',
        '2019-01-01 12:30:00,5.0,5.0',
    ]

    binned = total_spent.bin(2, labels=range(2))

    assert to_csv(binned.count_by_time) == expected