def test_search_offset_mix_7(transactions, total_spent_fn):
    """Search with an integer ``window_size`` and no cap on examples per instance.

    NOTE(review): the previous summary described an offset mix of
    window_size, minimum_data and gap (all relative), but only
    ``window_size`` is passed here -- confirm the intended coverage.
    """
    label_maker = LabelMaker(
        target_entity='customer_id',
        time_index='time',
        labeling_function=total_spent_fn,
        window_size=10,
    )

    found = label_maker.search(
        transactions,
        num_examples_per_instance=float('inf'),
        verbose=False,
    )
    actual = to_csv(found, index=False)

    # Expected CSV: header line followed by one row per customer.
    header = 'customer_id,cutoff_time,total_spent'
    rows = [
        '0,2019-01-01 08:00:00,2',
        '1,2019-01-01 09:00:00,3',
        '2,2019-01-01 10:30:00,4',
        '3,2019-01-01 12:30:00,1',
    ]
    expected = [header] + rows

    assert actual == expected
def test_search_offset_mix_2(transactions, total_spent_fn):
    """Search with a ``'30min'`` string window and an integer ``minimum_data``.

    At most two examples are requested per instance.

    NOTE(review): the previous summary also mentioned an absolute gap, but
    no ``gap`` argument is passed here -- confirm the intended coverage.
    """
    label_maker = LabelMaker(
        target_entity='customer_id',
        time_index='time',
        labeling_function=total_spent_fn,
        window_size='30min',
    )

    search_options = {
        'num_examples_per_instance': 2,
        'minimum_data': 2,
        'verbose': False,
    }
    actual = to_csv(
        label_maker.search(transactions, **search_options),
        index=False,
    )

    expected = ['customer_id,cutoff_time,total_spent']
    expected.extend([
        '1,2019-01-01 10:00:00,1',
        '2,2019-01-01 11:30:00,1',
        '2,2019-01-01 12:00:00,1',
    ])

    assert actual == expected
def test_search_offset_mix_3(transactions, total_spent_fn):
    """Search with an ``'8h'`` window, a timestamp ``minimum_data`` and ``gap=1``.

    ``num_examples_per_instance`` is passed as ``-1``; the expected output
    lists ten rows spread over the four customers.
    """
    label_maker = LabelMaker(
        target_entity='customer_id',
        time_index='time',
        labeling_function=total_spent_fn,
        window_size='8h',
    )

    found = label_maker.search(
        transactions,
        num_examples_per_instance=-1,
        minimum_data='2019-01-01 08:00:00',
        gap=1,
    )
    actual = to_csv(found, index=False)

    header = 'customer_id,time,total_spent'
    rows = [
        '0,2019-01-01 08:00:00,2',
        '0,2019-01-01 08:30:00,1',
        '1,2019-01-01 09:00:00,3',
        '1,2019-01-01 09:30:00,2',
        '1,2019-01-01 10:00:00,1',
        '2,2019-01-01 10:30:00,4',
        '2,2019-01-01 11:00:00,3',
        '2,2019-01-01 11:30:00,2',
        '2,2019-01-01 12:00:00,1',
        '3,2019-01-01 12:30:00,1',
    ]
    expected = [header, *rows]

    assert actual == expected
def test_search_with_undefined_labels(transactions, total_spent_fn):
    """Search with a per-label example count given as a dict, using ``gap=1``.

    The labeling function wraps ``total_spent_fn`` and reduces its result
    modulo 3; the example counts are only specified for labels 1 and 2.
    """

    # The wrapper keeps the name ``total_spent``; the expected header below
    # contains a ``total_spent`` column (presumably derived from this name).
    def total_spent(ds):
        spent = total_spent_fn(ds)
        return spent % 3

    label_maker = LabelMaker(
        target_entity='customer_id',
        time_index='time',
        labeling_function=total_spent,
    )

    examples_per_label = {1: 1, 2: 1}
    found = label_maker.search(
        transactions,
        num_examples_per_instance=examples_per_label,
        gap=1,
    )
    actual = to_csv(found, index=False)

    header = 'customer_id,time,total_spent'
    rows = [
        '0,2019-01-01 08:00:00,2',
        '0,2019-01-01 08:30:00,1',
        '1,2019-01-01 09:30:00,2',
        '1,2019-01-01 10:00:00,1',
        '2,2019-01-01 10:30:00,1',
        '2,2019-01-01 11:30:00,2',
        '3,2019-01-01 12:30:00,1',
    ]
    expected = [header] + rows

    assert actual == expected
def test_sample_frac_int(labels):
    """Sample with ``frac=.25`` and a fixed seed, then compare the sorted CSV."""
    sampled = labels.sample(frac=.25, random_state=0).sort_index()
    actual = to_csv(sampled, index=True)

    expected = [
        'label_id,customer_id,cutoff_time,my_labeling_function',
        '2,2,2014-01-01 00:01:00,True',
    ]

    assert actual == expected
def test_distribution_categorical(total_spent):
    """Bin into two bins labeled 0 and 1 and check the ``distribution`` CSV."""
    bin_labels = range(2)
    binned = total_spent.bin(2, labels=bin_labels)
    actual = to_csv(binned.distribution)

    header = 'total_spent,count'
    rows = ['0,5', '1,5']
    expected = [header] + rows

    assert actual == expected
def test_count(total_spent):
    """Check the CSV form (index included) of the ``count`` attribute."""
    actual = to_csv(total_spent.count, index=True)

    header = 'customer_id,count'
    rows = ['0,2', '1,3', '2,4', '3,1']
    expected = [header] + rows

    assert actual == expected
def test_sample_frac_per_label(labels):
    """Sample with a per-label fraction mapping and a fixed seed.

    The fraction is 1.0 for ``True`` labels and 0.5 for ``False`` labels;
    the sample is sorted by index before being rendered to CSV.
    """
    fraction_by_label = {True: 1., False: .5}
    sampled = labels.sample(frac=fraction_by_label, random_state=0)
    actual = to_csv(sampled.sort_index(), index=True)

    header = 'label_id,customer_id,cutoff_time,my_labeling_function'
    rows = [
        '0,1,2014-01-01 00:45:00,True',
        '2,2,2014-01-01 00:01:00,True',
        '3,2,2014-01-01 00:04:00,False',
    ]
    expected = [header] + rows

    assert actual == expected
def test_sample_n_per_label(labels):
    """Sample with a per-label count mapping and a fixed seed.

    One ``True`` label and two ``False`` labels are requested; the sample
    is sorted by index before being rendered to CSV.
    """
    count_by_label = {True: 1, False: 2}
    sampled = labels.sample(n=count_by_label, random_state=0)
    actual = to_csv(sampled.sort_index(), index=True)

    header = 'label_id,customer_id,cutoff_time,my_labeling_function'
    rows = [
        '1,1,2014-01-01 00:48:00,False',
        '2,2,2014-01-01 00:01:00,True',
        '3,2,2014-01-01 00:04:00,False',
    ]
    expected = [header] + rows

    assert actual == expected
def test_search_default(transactions, total_spent_fn):
    """Search for one example per instance with no other search options."""
    label_maker = LabelMaker(
        target_entity='customer_id',
        time_index='time',
        labeling_function=total_spent_fn,
    )

    found = label_maker.search(transactions, num_examples_per_instance=1)
    actual = to_csv(found, index=False)

    # Expected CSV: header line followed by one row per customer.
    header = 'customer_id,time,total_spent'
    rows = [
        '0,2019-01-01 08:00:00,2',
        '1,2019-01-01 09:00:00,3',
        '2,2019-01-01 10:30:00,4',
        '3,2019-01-01 12:30:00,1',
    ]
    expected = [header] + rows

    assert actual == expected
def test_count_by_time_categorical(total_spent):
    """Bin into two bins labeled 0 and 1 and check the ``count_by_time`` CSV.

    NOTE(review): a later test in this source reuses this function name; if
    both live in the same module the later definition replaces this one --
    confirm which variant is meant to run.
    """
    binned = total_spent.bin(2, labels=range(2))
    counts = binned.count_by_time
    actual = to_csv(counts)

    header = 'time,0,1'
    rows = [
        '2019-01-01 08:00:00,0,1',
        '2019-01-01 08:30:00,0,2',
        '2019-01-01 09:00:00,0,3',
        '2019-01-01 09:30:00,0,4',
        '2019-01-01 10:00:00,0,5',
        '2019-01-01 10:30:00,1,5',
        '2019-01-01 11:00:00,2,5',
        '2019-01-01 11:30:00,3,5',
        '2019-01-01 12:00:00,4,5',
        '2019-01-01 12:30:00,5,5',
    ]
    expected = [header] + rows

    assert actual == expected
def test_count_by_time_continuous(total_spent):
    """Check the ``count_by_time`` CSV (header and index included), unbinned."""
    actual = to_csv(total_spent.count_by_time, header=True, index=True)

    header = 'time,total_spent'
    rows = [
        '2019-01-01 08:00:00,1',
        '2019-01-01 08:30:00,2',
        '2019-01-01 09:00:00,3',
        '2019-01-01 09:30:00,4',
        '2019-01-01 10:00:00,5',
        '2019-01-01 10:30:00,6',
        '2019-01-01 11:00:00,7',
        '2019-01-01 11:30:00,8',
        '2019-01-01 12:00:00,9',
        '2019-01-01 12:30:00,10',
    ]
    expected = [header] + rows

    assert actual == expected
def test_count_by_time_categorical(total_spent):
    """Bin into two bins labeled 0 and 1 and check the ``count_by_time`` CSV.

    NOTE(review): an earlier test in this source has the same function name;
    within a single module this definition replaces the earlier one --
    confirm which variant is meant to run.
    """
    bin_labels = range(2)
    binned = total_spent.bin(2, labels=bin_labels)
    actual = to_csv(binned.count_by_time)

    header = 'cutoff_time,0,1'
    rows = [
        '2019-01-01 08:00:00,0.0,1.0',
        '2019-01-01 08:30:00,0.0,2.0',
        '2019-01-01 09:00:00,0.0,3.0',
        '2019-01-01 09:30:00,0.0,4.0',
        '2019-01-01 10:00:00,0.0,5.0',
        '2019-01-01 10:30:00,1.0,5.0',
        '2019-01-01 11:00:00,2.0,5.0',
        '2019-01-01 11:30:00,3.0,5.0',
        '2019-01-01 12:00:00,4.0,5.0',
        '2019-01-01 12:30:00,5.0,5.0',
    ]
    expected = [header] + rows

    assert actual == expected