def train_initial_model():
    dataset = get_movielens_dataset(variant='100K')
    train, test = random_train_test_split(dataset,
                                          random_state=np.random.RandomState(42))

    model = ImplicitFactorizationModel(loss='adaptive_hinge',
                                       embedding_dim=128,  # latent dimensionality
                                       n_iter=10,  # number of epochs of training
                                       batch_size=1024,  # minibatch size
                                       l2=1e-9,  # strength of L2 regularization
                                       learning_rate=1e-3,
                                       use_cuda=torch.cuda.is_available())

    print('Fitting the model')
    model.fit(train, verbose=True)
    print(type(model))

    with open('models/filmclub.model', 'wb') as model_file:
        pickle.dump(model, model_file)

    # Inflate the stored user count before saving, presumably to leave
    # room for users added after training.
    dataset.num_users = 1000000

    with open('data/dataset.pkl', 'wb') as dataset_file:
        pickle.dump(dataset, dataset_file)

    train_rmse = rmse_score(model, train)
    test_rmse = rmse_score(model, test)
    print('Train RMSE {:.3f}, test RMSE {:.3f}'.format(train_rmse, test_rmse))
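# A minimal sketch of loading the artifacts train_initial_model saves,
# assuming the same relative paths as above.
import pickle

with open('models/filmclub.model', 'rb') as model_file:
    model = pickle.load(model_file)

with open('data/dataset.pkl', 'rb') as dataset_file:
    dataset = pickle.load(dataset_file)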
def main(args):
    status = 'available' if CUDA else 'not available'
    print("CUDA is {}!".format(status))

    args = parse_args(args)

    # Fix random_state
    seed = 72
    set_seed(seed)
    random_state = np.random.RandomState(seed)

    max_sequence_length = 100
    min_sequence_length = 20
    step_size = max_sequence_length

    if args.dataset == 'amazon':
        max_sequence_length = 50
        min_sequence_length = 5
        step_size = max_sequence_length
        dataset = get_amazon_dataset()
    elif args.dataset == 'goodbooks':
        dataset = get_goodbooks_dataset()
    else:
        dataset = get_movielens_dataset(args.dataset.upper())
        args.variant = args.dataset

    train, rest = user_based_train_test_split(
        dataset,
        test_percentage=0.2,
        random_state=random_state)
    test, valid = user_based_train_test_split(
        rest,
        test_percentage=0.5,
        random_state=random_state)

    train = train.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)
    test = test.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)
    valid = valid.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)

    print('model: {}, data: {}'.format(args.model, train))

    fname = 'experiment_{}_{}.pickle'.format(args.model, args.dataset)

    objective = get_objective(train, valid, test, random_state)
    space = hyperparameter_space(args.model)

    for iteration in range(args.num_trials):
        print('Iteration {}'.format(iteration))
        trials = optimize(objective,
                          space,
                          trials_fname=fname,
                          max_evals=iteration + 1)

        summarize_trials(trials)
def generate_dataset_table():
    headers = ['Dataset', 'Users', 'Items', 'Density', '95th/50th']
    rows = []

    for name, dataset in (('Movielens 10M', get_movielens_dataset('10M')),
                          ('Amazon', get_amazon_dataset()),
                          ('Goodbooks', get_goodbooks_dataset())):
        item_counts = dataset.tocoo().getnnz(axis=0)
        print('Dataset {}, ratio: {:0,}'
              .format(name,
                      np.percentile(item_counts, 95) /
                      np.percentile(item_counts, 50)))
        row = [
            name,
            '{:0,}'.format(dataset.num_users),
            '{:0,}'.format(dataset.num_items),
            len(dataset) / dataset.num_users / dataset.num_items,
            '{0:.2f}'.format(np.percentile(item_counts, 95) /
                             np.percentile(item_counts, 50))
        ]
        rows.append(row)

    return _full_width_table(
        tabulate(rows,
                 headers=headers,
                 floatfmt='.4f',
                 tablefmt='latex_booktabs'))
def load_data(dataset, random_state):
    dataset = get_movielens_dataset(dataset)
    # np.random.shuffle(dataset.timestamps)
    # max_sequence_length = int(np.percentile(dataset.tocsr()
    #                                         .getnnz(axis=1),
    #                                         80))
    max_sequence_length = 100
    min_sequence_length = 50
    step_size = max_sequence_length

    train_nonsequence, rest = user_based_train_test_split(
        dataset,
        test_percentage=0.2,
        random_state=random_state)
    test, validation = user_based_train_test_split(
        rest,
        test_percentage=0.5,
        random_state=random_state)

    train = train_nonsequence.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)
    test = test.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)
    validation = validation.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)

    return train_nonsequence, train, validation, test
def test_bloom(compression_ratio, expected_rmse):
    interactions = movielens.get_movielens_dataset('100K')

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    user_embeddings = BloomEmbedding(interactions.num_users, 32,
                                     compression_ratio=compression_ratio,
                                     num_hash_functions=2)
    item_embeddings = BloomEmbedding(interactions.num_items, 32,
                                     compression_ratio=compression_ratio,
                                     num_hash_functions=2)
    network = BilinearNet(interactions.num_users,
                          interactions.num_items,
                          user_embedding_layer=user_embeddings,
                          item_embedding_layer=item_embeddings)

    model = ExplicitFactorizationModel(loss='regression',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-5,
                                       representation=network,
                                       use_cuda=CUDA)
    model.fit(train)
    print(model)

    rmse = rmse_score(model, test)
    print(rmse)

    assert rmse - EPSILON < expected_rmse
def data_implicit_sequence():
    max_sequence_length = 200
    min_sequence_length = 20
    step_size = 200

    interactions = movielens.get_movielens_dataset('100K')

    train, test = user_based_train_test_split(interactions,
                                              random_state=RANDOM_STATE)

    train = train.to_sequence(max_sequence_length=max_sequence_length,
                              min_sequence_length=min_sequence_length,
                              step_size=step_size)
    test = test.to_sequence(max_sequence_length=max_sequence_length,
                            min_sequence_length=min_sequence_length,
                            step_size=step_size)

    model = ImplicitSequenceModel(loss='adaptive_hinge',
                                  representation='lstm',
                                  batch_size=8,
                                  learning_rate=1e-2,
                                  l2=1e-3,
                                  n_iter=2,
                                  use_cuda=CUDA,
                                  random_state=RANDOM_STATE)
    model.fit(train, verbose=True)

    return train, test, model
def test_bpr_bloom(compression_ratio, expected_mrr):
    interactions = movielens.get_movielens_dataset('100K')

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    user_embeddings = BloomEmbedding(interactions.num_users, 32,
                                     compression_ratio=compression_ratio,
                                     num_hash_functions=2)
    item_embeddings = BloomEmbedding(interactions.num_items, 32,
                                     compression_ratio=compression_ratio,
                                     num_hash_functions=2)
    network = BilinearNet(interactions.num_users,
                          interactions.num_items,
                          user_embedding_layer=user_embeddings,
                          item_embedding_layer=item_embeddings)

    model = ImplicitFactorizationModel(loss='bpr',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-6,
                                       representation=network,
                                       use_cuda=CUDA)
    model.fit(train)
    print(model)

    mrr = mrr_score(model, test, train=train).mean()

    assert mrr > expected_mrr
def data():
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)
    return train, test
def get_sequence_data():
    dataset = get_movielens_dataset('1M')
    max_sequence_length = 200
    min_sequence_length = 20
    data = dataset.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=max_sequence_length)
    print(data.sequences.shape)
    return data
def load_data(dataset, random_state):
    dataset = get_movielens_dataset(dataset)
    train, rest = random_train_test_split(dataset,
                                          random_state=random_state)
    test, validation = random_train_test_split(rest,
                                               test_percentage=0.5,
                                               random_state=random_state)
    return train, validation, test
def test_user_based_split():
    interactions = movielens.get_movielens_dataset('100K')

    train, test = (cross_validation
                   .user_based_train_test_split(interactions,
                                                test_percentage=0.2,
                                                random_state=RANDOM_STATE))

    assert len(train) + len(test) == len(interactions)

    users_in_test = len(np.unique(test.user_ids))
    assert np.allclose(float(users_in_test) / interactions.num_users,
                       0.2,
                       atol=0.001)
def test_to_sequence_min_length():
    min_sequence_length = 10
    interactions = movielens.get_movielens_dataset('100K')

    # Check that with default arguments there are sequences
    # that are shorter than we want.
    sequences = interactions.to_sequence(max_sequence_length=20)
    assert np.any((sequences.sequences != 0).sum(axis=1) <
                  min_sequence_length)

    # But no such sequences after we specify min length.
    sequences = interactions.to_sequence(
        max_sequence_length=20,
        min_sequence_length=min_sequence_length)
    assert not np.any((sequences.sequences != 0).sum(axis=1) <
                      min_sequence_length)
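# The test above relies on 0 being the padding value in the sequence
# representation: counting non-zero entries per row recovers each
# sequence's true length. A small illustration on a toy array:
import numpy as np

toy = np.array([[0, 0, 3, 7, 2],
                [0, 1, 4, 4, 9]])
lengths = (toy != 0).sum(axis=1)  # -> array([3, 4])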
def test_to_sequence(max_sequence_length):
    interactions = movielens.get_movielens_dataset('100K')

    sequences = interactions.to_sequence(
        max_sequence_length=max_sequence_length)

    assert sequences.sequences.shape == (len(interactions),
                                         max_sequence_length)

    _test_just_padding(sequences.sequences)
    _test_final_column_no_padding(sequences.sequences)
    _test_shifted(sequences.sequences)
    _test_temporal_order(sequences.user_ids,
                         sequences.sequences,
                         interactions)
def test_precision_recall(data, k):
    (train, test, model) = data

    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    precision, recall = precision_recall_score(model, test, train, k=k)

    assert precision.shape == recall.shape

    if not isinstance(k, list):
        assert len(precision.shape) == 1
    else:
        assert precision.shape[1] == len(k)
def test_adaptive_hinge():
    interactions = movielens.get_movielens_dataset('100K')

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ImplicitFactorizationModel(loss='adaptive_hinge',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-6)
    model.fit(train)

    mrr = mrr_score(model, test, train=train).mean()

    assert mrr > 0.07
def data():
    interactions = movielens.get_movielens_dataset('100K')

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ImplicitFactorizationModel(loss='bpr',
                                       n_iter=1,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-6,
                                       random_state=RANDOM_STATE,
                                       use_cuda=CUDA)
    model.fit(train)

    return train, test, model
def test_poisson():
    interactions = movielens.get_movielens_dataset('100K')

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ExplicitFactorizationModel(loss='poisson',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-3,
                                       l2=1e-6)
    model.fit(train)

    rmse = rmse_score(model, test)

    assert rmse < 1.0
def test_predict_movielens(model_class):
    interactions = movielens.get_movielens_dataset('100K')

    model = model_class(n_iter=1, use_cuda=CUDA)
    model.fit(interactions)

    for user_id in np.random.randint(0, interactions.num_users, size=10):
        user_ids = np.repeat(user_id, interactions.num_items)
        item_ids = np.arange(interactions.num_items)

        uid_predictions = model.predict(user_id)
        iid_predictions = model.predict(user_id, item_ids)
        pair_predictions = model.predict(user_ids, item_ids)

        assert (uid_predictions == iid_predictions).all()
        assert (uid_predictions == pair_predictions).all()
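# As test_predict_movielens exercises above, model.predict(user_id) returns
# one score per item. A minimal sketch of turning those scores into a top-N
# recommendation list; the helper name here is ours, not part of the library.
import numpy as np

def top_n_items(model, user_id, n=10):
    # One score per item; highest-scoring items first.
    scores = model.predict(user_id)
    return np.argsort(-scores)[:n]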
def test_to_sequence(max_sequence_length, step_size):
    interactions = movielens.get_movielens_dataset('100K')
    _, interactions = random_train_test_split(interactions)

    sequences = interactions.to_sequence(
        max_sequence_length=max_sequence_length,
        step_size=step_size)

    if step_size == 1:
        assert sequences.sequences.shape == (len(interactions),
                                             max_sequence_length)
    else:
        assert sequences.sequences.shape[1] == max_sequence_length

    _test_just_padding(sequences.sequences)
    _test_final_column_no_padding(sequences.sequences)
    _test_shifted(sequences.user_ids,
                  sequences.sequences,
                  step_size)
    _test_temporal_order(sequences.user_ids,
                         sequences.sequences,
                         interactions)
def test_bpr():
    interactions = movielens.get_movielens_dataset('100K')

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ImplicitFactorizationModel(loss='bpr',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-6,
                                       use_cuda=CUDA)
    model.fit(train)

    mrr = mrr_score(model, test, train=train).mean()

    assert mrr + EPSILON > 0.07
def test_check_input():
    # Train for a single iteration.
    interactions = movielens.get_movielens_dataset('100K')

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ExplicitFactorizationModel(loss='regression',
                                       n_iter=1,
                                       batch_size=1024,
                                       learning_rate=1e-3,
                                       l2=1e-6)
    model.fit(train)

    # Modify the data to make it incompatible with the original model.
    train.user_ids[0] = train.user_ids.max() + 1

    with pytest.raises(ValueError):
        model.fit(train)
def load_data(dataset, random_state):
    if 'goodbooks' in dataset:
        dataset = get_goodbooks_dataset()
    elif 'amazon' in dataset:
        dataset = get_amazon_dataset()
    else:
        dataset = get_movielens_dataset(dataset)

    train, rest = random_train_test_split(dataset,
                                          test_percentage=0.05,
                                          random_state=random_state)
    test, validation = random_train_test_split(rest,
                                               test_percentage=0.5,
                                               random_state=random_state)

    return train, validation, test
def test_bpr_custom_optimizer():
    interactions = movielens.get_movielens_dataset('100K')

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    def adagrad_optimizer(model_params,
                          lr=1e-2,
                          weight_decay=1e-6):
        return torch.optim.Adagrad(model_params,
                                   lr=lr,
                                   weight_decay=weight_decay)

    model = ImplicitFactorizationModel(loss='bpr',
                                       n_iter=10,
                                       batch_size=1024,
                                       optimizer_func=adagrad_optimizer)
    model.fit(train)

    mrr = mrr_score(model, test, train=train).mean()

    assert mrr > 0.06
def test_logistic():
    interactions = movielens.get_movielens_dataset('100K')

    # Convert to binary.
    interactions.ratings = (interactions.ratings > 3).astype(np.float32)
    # Convert from (0, 1) to (-1, 1).
    interactions.ratings = interactions.ratings * 2 - 1

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ExplicitFactorizationModel(loss='logistic',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-3,
                                       l2=1e-6,
                                       use_cuda=CUDA)
    model.fit(train)

    rmse = rmse_score(model, test)

    assert rmse - EPSILON < 1.05
def load_data(dataset, random_state):
    max_sequence_length = 100
    min_sequence_length = 20
    step_size = max_sequence_length

    if 'goodbooks' in dataset:
        dataset = get_goodbooks_dataset()
    elif 'amazon' in dataset:
        dataset = get_amazon_dataset()
        # This is a dataset with shorter sequences.
        max_sequence_length = 50
        min_sequence_length = 5
        step_size = max_sequence_length
    else:
        dataset = get_movielens_dataset(dataset)

    train_nonsequence, rest = user_based_train_test_split(
        dataset,
        test_percentage=0.2,
        random_state=random_state)
    test, validation = user_based_train_test_split(
        rest,
        test_percentage=0.5,
        random_state=random_state)

    train = train_nonsequence.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)
    test = test.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)
    validation = validation.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)

    return train_nonsequence, train, validation, test
def generate_data(size_variant, **kwargs):
    dataset = get_movielens_dataset(variant=size_variant)
    return dataset
def get_data():
    dataset = get_movielens_dataset(variant='100K')
    return dataset
    print('Test MRR {} val MRR {}'.format(
        test_mrr.mean(), val_mrr.mean()))

    results.save(hyperparameters, test_mrr.mean(), val_mrr.mean())

    return results


if __name__ == '__main__':

    max_sequence_length = 200
    min_sequence_length = 20
    step_size = 200
    random_state = np.random.RandomState(100)

    dataset = get_movielens_dataset('1M')

    train, rest = user_based_train_test_split(dataset,
                                              random_state=random_state)
    test, validation = user_based_train_test_split(rest,
                                                   test_percentage=0.5,
                                                   random_state=random_state)

    train = train.to_sequence(max_sequence_length=max_sequence_length,
                              min_sequence_length=min_sequence_length,
                              step_size=step_size)
    test = test.to_sequence(max_sequence_length=max_sequence_length,
                            min_sequence_length=min_sequence_length,
                            step_size=step_size)
    validation = validation.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)
def get_factorization_data():
    dataset = get_movielens_dataset('1M')
    return dataset
from spotlight.cross_validation import random_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import rmse_score
from spotlight.factorization.explicit import ExplicitFactorizationModel

dataset = get_movielens_dataset(variant='100K')

train, test = random_train_test_split(dataset)

model = ExplicitFactorizationModel(n_iter=1)
model.fit(train)

rmse = rmse_score(model, test)
print(rmse)
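# A minimal sketch of the implicit counterpart to the quickstart above,
# assuming the module layout mirrors the explicit import
# (spotlight.factorization.implicit); ImplicitFactorizationModel and
# mrr_score both appear elsewhere in this codebase.
from spotlight.cross_validation import random_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import mrr_score
from spotlight.factorization.implicit import ImplicitFactorizationModel

dataset = get_movielens_dataset(variant='100K')

train, test = random_train_test_split(dataset)

model = ImplicitFactorizationModel(loss='bpr', n_iter=1)
model.fit(train)

# Mean reciprocal rank on the held-out interactions,
# excluding items seen during training.
print(mrr_score(model, test, train=train).mean())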