def test_leave_k_out_5(interactions_ds):
    """Test if error is thrown with an invalid value of max_concurrent_threads (negative).

    The test fails both when the error message differs from the expected one
    and when no error is raised at all.
    """
    expected_msg = 'The value of max_concurrent_threads (-1) must be > 0.'
    try:
        leave_k_out(interactions_ds, max_concurrent_threads=-1)
    except Exception as e:
        assert str(e) == expected_msg
    else:
        # Without this branch the test would pass vacuously when nothing is raised.
        raise AssertionError('Expected leave_k_out() to raise an error for max_concurrent_threads=-1, '
                             'but it returned normally.')
def test_leave_k_out_3(interactions_ds):
    """Test if error is thrown with an invalid value of k (ratio variant with k higher than 1).

    The test fails both when the error message differs from the expected one
    and when no error is raised at all.
    """
    expected_msg = ("The k parameter should be in the (0, 1) range when it's used as the percentage of "
                    "interactions to sample to the test set, per user. Current value: 1.5")
    try:
        leave_k_out(interactions_ds, 1.5)
    except Exception as e:
        assert str(e) == expected_msg
    else:
        # Without this branch the test would pass vacuously when nothing is raised.
        raise AssertionError('Expected leave_k_out() to raise an error for k=1.5, but it returned normally.')
def test_leave_k_out_7(interactions_ds):
    """Fixed k variant with k = 2: users whose number of items is <= k must be ignored by the split."""
    expected_train = [
        [1, 4, 5, 50, 1],
        [2, 2, 5, 100, 3],
        [2, 3, 2, 20, 4],
    ]
    expected_test = [
        [1, 2, 3, 100, 0],
        [1, 5, 2, 25, 2],
    ]

    train_split, test_split = leave_k_out(interactions_ds, k=2, seed=0)

    assert expected_train == train_split.values_list(to_list=True)
    assert expected_test == test_split.values_list(to_list=True)
def test_leave_k_out_6(interactions_ds):
    """Fixed k variant with k = 1: checks the exact train and test records produced with seed 0."""
    expected_train = [
        [1, 4, 5, 50, 1],
        [1, 5, 2, 25, 2],
        [2, 3, 2, 20, 4],
    ]
    expected_test = [
        [1, 2, 3, 100, 0],
        [2, 2, 5, 100, 3],
    ]

    train_split, test_split = leave_k_out(interactions_ds, k=1, seed=0)

    assert expected_train == train_split.values_list(to_list=True)
    assert expected_test == test_split.values_list(to_list=True)
def test_leave_k_out_14(interactions_ds):
    """Ratio k variant where k (0.3) is small enough that no record gets sampled into the test set."""
    expected_train = [
        [1, 2, 3, 100, 0],
        [1, 4, 5, 50, 1],
        [1, 5, 2, 25, 2],
        [2, 2, 5, 100, 3],
        [2, 3, 2, 20, 4],
    ]
    expected_test = []

    train_split, test_split = leave_k_out(interactions_ds, k=0.3, seed=0)

    # Every record is expected to stay in train; the test set must be empty.
    assert expected_train == train_split.values_list(to_list=True)
    assert expected_test == test_split.values_list(to_list=True)
def test_leave_k_out_15(interactions_ds):
    """Ratio k variant with min_user_interactions > 1.

    Users that do not have at least min_user_interactions records should be
    removed from both the train and the test sets.
    """
    expected_train = [
        [1, 4, 5, 50, 1],
        [1, 5, 2, 25, 2],
    ]
    expected_test = [
        [1, 2, 3, 100, 0],
    ]

    train_split, test_split = leave_k_out(interactions_ds, k=0.4, min_user_interactions=3, seed=0)

    assert expected_train == train_split.values_list(to_list=True)
    assert expected_test == test_split.values_list(to_list=True)
def test_leave_k_out_16(interactions_ds):
    """Ratio k variant (k = 0.5) with last_timestamps = True."""
    expected_train = [
        [1, 2, 3, 100, 0],
        [1, 4, 5, 50, 1],
        [2, 2, 5, 100, 3],
    ]
    expected_test = [
        [1, 5, 2, 25, 2],
        [2, 3, 2, 20, 4],
    ]

    train_split, test_split = leave_k_out(interactions_ds, k=0.5, last_timestamps=True, seed=0)

    assert expected_train == train_split.values_list(to_list=True)
    assert expected_test == test_split.values_list(to_list=True)
def interactions_ds():
    """Builds a seeded pseudo-random interaction table and returns its leave_k_out() split.

    Candidate pairs are (user, item) with user in [0, 50) and item in [0, 200).
    A pair is kept when a draw from randint(0, 4) equals 0, and a kept pair gets
    an interaction value drawn from randint(-1, 5). The generator is seeded with
    0, so the table is the same on every call.

    Returns:
        Whatever leave_k_out() returns for the generated dataset with k=5,
        min_user_interactions=0, last_timestamps=False and seed=10.
    """
    rng = random.Random(0)

    rows = []
    for user in range(50):
        for item in range(200):
            # The keep/skip draw must come before the interaction draw so the
            # random sequence is consumed in a fixed order.
            if rng.randint(0, 4) != 0:
                continue
            rows.append([user, item, rng.randint(-1, 5)])

    df = pd.DataFrame(rows, columns=['user', 'item', 'interaction'])
    print(df.values)

    dataset = InteractionDataset.read_df(df)
    return leave_k_out(dataset, k=5, min_user_interactions=0, last_timestamps=False, seed=10)
def test_leave_k_out_17(interactions_ds_timestamp_label):
    """Ratio k variant (k = 0.5) with last_timestamps = True and a custom timestamp label."""
    expected_train = [
        [1, 2, 3, 100, 0],
        [1, 4, 5, 50, 1],
        [2, 2, 5, 100, 3],
    ]
    expected_test = [
        [1, 5, 2, 25, 2],
        [2, 3, 2, 20, 4],
    ]

    train_split, test_split = leave_k_out(
        interactions_ds_timestamp_label,
        k=0.5,
        last_timestamps=True,
        timestamp_label='custom_timestamp_label',
        seed=0,
    )

    assert expected_train == train_split.values_list(to_list=True)
    assert expected_test == test_split.values_list(to_list=True)
def get_test_dataset(ds_name, force_out_of_memory=False, verbose=True, **kwds):
    """Loads the test split of a named dataset, generating it when none is provided.

    When the dataset declares a specific test file, that file is loaded. Otherwise
    (example: BX dataset), a previously generated '<ds_name>_test.gen' file is reused
    if it exists; if it does not, the full dataset is loaded and split with
    leave_k_out() from the Evaluation module, and both resulting splits are saved in
    the dataset folder for future calls. The split is deterministic (i.e. has a
    defined seed value).

    Might download the dataset if it hasn't been downloaded before.

    Args:
        ds_name: A string with the name of the requested dataset. This name should be
            present in the list returned by available_datasets(), otherwise an error
            will be thrown.
        force_out_of_memory: A boolean indicating whether to force dataset loading to
            out of memory. Default: False.
        verbose: A boolean indicating whether to log info messages or not. Default: True.
        **kwds: Extra keyword arguments forwarded to get_dataset().

    Returns:
        A InteractionDataset containing the test dataset.

    Raises:
        FileNotFoundError: If ds_name is not a key of DATASETS.
    """
    if ds_name not in DATASETS:
        supported = ", ".join(available_datasets())
        raise FileNotFoundError(f'"{ds_name}" is not a valid dataset. Supported datasets: {supported}.')

    def _dataset_file(file_name):
        # Resolves a file name inside the folder of the requested dataset.
        return os.path.join(get_dataset_path(ds_name), file_name)

    def _load(file_path, **extra):
        # Single place where the caller-provided loading options are forwarded.
        return get_dataset(ds_name, file_path, **extra, force_out_of_memory=force_out_of_memory,
                           verbose=verbose, **kwds)

    options = DATASETS[ds_name]

    # Datasets that have their own test file: just load it.
    if options.test_file is not None:
        return _load(_dataset_file(options.test_file))

    # The test split might have been generated already by a previous call.
    cached_test_path = _dataset_file(ds_name + '_test.gen')
    if os.path.exists(cached_test_path):
        return _load(cached_test_path, is_generated=True)

    # Otherwise generate it now from the full dataset.
    full_ds = _load(_dataset_file(options.full_file))
    train_split, test_split = leave_k_out(full_ds, k=10, min_user_interactions=10, seed=10)

    # Store both generated splits for future calls.
    train_split.save(_dataset_file(ds_name + '_train.gen'))
    test_split.save(_dataset_file(ds_name + '_test.gen'))

    return test_split
import time

from DRecPy.Recommender import CDAE
from DRecPy.Recommender.EarlyStopping import MaxValidationValueRule
from DRecPy.Dataset import get_train_dataset
from DRecPy.Dataset import get_test_dataset
from DRecPy.Evaluation.Processes import ranking_evaluation
from DRecPy.Evaluation.Splits import leave_k_out
from DRecPy.Evaluation.Metrics import NDCG
from DRecPy.Evaluation.Metrics import HitRatio
from DRecPy.Evaluation.Metrics import Precision

# Example script: train a CDAE model on ml-100k, scoring it on a validation split
# through the epoch callback.
ds_train = get_train_dataset('ml-100k')
ds_test = get_test_dataset('ml-100k')

# Split the train set again so that a validation set is available to the callback.
ds_train, ds_val = leave_k_out(ds_train, k=1, min_user_interactions=10, seed=0)


def epoch_callback_fn(model):
    """Evaluates `model` on the validation set and returns its metrics prefixed with 'val_'.

    Args:
        model: The model instance handed to the callback during fit().

    Returns:
        A dict mapping 'val_' + <metric name> to the value reported by
        ranking_evaluation() for the HitRatio and NDCG metrics.
    """
    val_scores = ranking_evaluation(
        model,
        ds_val,
        n_pos_interactions=1,
        n_neg_interactions=100,
        generate_negative_pairs=True,
        k=10,
        verbose=False,
        seed=10,
        metrics=[HitRatio(), NDCG()],
    )
    return {'val_' + name: score for name, score in val_scores.items()}


start_train = time.time()
cdae = CDAE(hidden_factors=50, corruption_level=0.2, loss='bce', seed=10)
cdae.fit(
    ds_train,
    learning_rate=0.001,
    reg_rate=0.001,
    epochs=100,
    batch_size=64,
    neg_ratio=5,
    epoch_callback_fn=epoch_callback_fn,
    epoch_callback_freq=10,
    # The rule name must match a key produced by epoch_callback_fn ('val_' + metric).
    early_stopping_rule=MaxValidationValueRule('val_HitRatio'),
    early_stopping_freq=10,
)
print("Training took", time.time() - start_train)
import time

from DRecPy.Recommender import DMF
from DRecPy.Dataset import get_full_dataset
from DRecPy.Evaluation.Splits import leave_k_out
from DRecPy.Evaluation.Processes import ranking_evaluation
from DRecPy.Evaluation.Metrics import NDCG
from DRecPy.Evaluation.Metrics import HitRatio

# Example script: train DMF on ml-100k twice, once with use_nce=True and once
# with use_nce=False.
ds = get_full_dataset('ml-100k')
ds_train, ds_test = leave_k_out(ds, k=1, last_timestamps=True, seed=10)

# Copies of both splits where every 'interaction' value is replaced by 1.
ds_train_bin = ds_train.copy()
ds_train_bin.apply('interaction', lambda x: 1)

ds_test_bin = ds_test.copy()
ds_test_bin.apply('interaction', lambda x: 1)

for nce in (True, False):
    print('NCE =', nce)

    start_train = time.time()
    dmf = DMF(use_nce=nce, user_factors=[128, 64], item_factors=[128, 64], seed=10)
    dmf.fit(
        # The original interaction values are used with NCE, the all-ones copy otherwise.
        ds_train if nce else ds_train_bin,
        epochs=50,
        batch_size=256,
        learning_rate=0.001,
        reg_rate=0.0001,
        neg_ratio=5,
    )
    print("Training took", time.time() - start_train)
import time

from DRecPy.Evaluation.Splits import leave_k_out
from DRecPy.Dataset import get_full_dataset

# Example script: split the full ml-100k dataset with leave_k_out() and time it.
dataset = get_full_dataset("ml-100k")
print('Full dataset', dataset)

# The split leaves k interactions of each user out of the train set.
# A user that does not have k interactions keeps all of them in the train set.
# However, a user with fewer than min_user_interactions interactions is removed
# from both sets.
start_t = time.time()
dataset_train, dataset_test = leave_k_out(dataset, k=10, min_user_interactions=20)
print('Splitting complete. Took: {}s'.format(time.time() - start_t))

print('Train dataset', dataset_train)
print('Test dataset', dataset_test)
def test_leave_k_out_2(interactions_ds):
    """Test if error is thrown with an invalid value of k (ratio variant with negative k).

    The test fails both when the error message differs from the expected one
    and when no error is raised at all.
    """
    expected_msg = 'The value of k (-0.5) must be > 0.'
    try:
        leave_k_out(interactions_ds, -0.5)
    except Exception as e:
        assert str(e) == expected_msg
    else:
        # Without this branch the test would pass vacuously when nothing is raised.
        raise AssertionError('Expected leave_k_out() to raise an error for k=-0.5, but it returned normally.')
def test_leave_k_out_1(interactions_ds):
    """Test if error is thrown with an invalid value of k (negative).

    The test fails both when the error message differs from the expected one
    and when no error is raised at all.
    """
    expected_msg = 'The value of k (-999) must be > 0.'
    try:
        leave_k_out(interactions_ds, -999)
    except Exception as e:
        assert str(e) == expected_msg
    else:
        # Without this branch the test would pass vacuously when nothing is raised.
        raise AssertionError('Expected leave_k_out() to raise an error for k=-999, but it returned normally.')
def test_leave_k_out_0(interactions_ds):
    """Test if error is thrown with an invalid value of k (zero).

    The test fails both when the error message differs from the expected one
    and when no error is raised at all.
    """
    expected_msg = 'The value of k (0) must be > 0.'
    try:
        leave_k_out(interactions_ds, 0)
    except Exception as e:
        assert str(e) == expected_msg
    else:
        # Without this branch the test would pass vacuously when nothing is raised.
        raise AssertionError('Expected leave_k_out() to raise an error for k=0, but it returned normally.')