import pickle

import numpy as np
import torch
from spotlight.cross_validation import random_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import rmse_score
from spotlight.factorization.implicit import ImplicitFactorizationModel


def train_initial_model():
    dataset = get_movielens_dataset(variant='100K')
    train, test = random_train_test_split(dataset,
                                          random_state=np.random.RandomState(42))

    model = ImplicitFactorizationModel(loss='adaptive_hinge',
                                       embedding_dim=128,  # latent dimensionality
                                       n_iter=10,          # number of epochs of training
                                       batch_size=1024,    # minibatch size
                                       l2=1e-9,            # strength of L2 regularization
                                       learning_rate=1e-3,
                                       use_cuda=torch.cuda.is_available())

    print('Fitting the model')
    model.fit(train, verbose=True)
    print(type(model))

    with open('models/filmclub.model', 'wb') as model_file:
        pickle.dump(model, model_file)

    # Override the user count before saving, presumably to leave id headroom
    # for users added after training.
    dataset.num_users = 1000000
    with open('data/dataset.pkl', 'wb') as dataset_file:
        pickle.dump(dataset, dataset_file)

    train_rmse = rmse_score(model, train)
    test_rmse = rmse_score(model, test)
    print('Train RMSE {:.3f}, test RMSE {:.3f}'.format(train_rmse, test_rmse))
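# A companion sketch (not part of the original source): loading the pickled
# model back and ranking items for one user. The file path mirrors
# train_initial_model() above; Spotlight's model.predict(user_id) returns a
# score for every item when no item ids are passed.
import pickle

import numpy as np


def load_and_rank(user_id):
    with open('models/filmclub.model', 'rb') as model_file:
        model = pickle.load(model_file)
    scores = model.predict(user_id)  # one score per item id
    return np.argsort(-scores)       # item ids, highest-scored first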
def test_bpr_bloom(compression_ratio, expected_mrr):
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    user_embeddings = BloomEmbedding(interactions.num_users, 32,
                                     compression_ratio=compression_ratio,
                                     num_hash_functions=2)
    item_embeddings = BloomEmbedding(interactions.num_items, 32,
                                     compression_ratio=compression_ratio,
                                     num_hash_functions=2)
    network = BilinearNet(interactions.num_users,
                          interactions.num_items,
                          user_embedding_layer=user_embeddings,
                          item_embedding_layer=item_embeddings)

    model = ImplicitFactorizationModel(loss='bpr',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-6,
                                       representation=network,
                                       use_cuda=CUDA)
    model.fit(train)
    print(model)

    mrr = mrr_score(model, test, train=train).mean()

    assert mrr > expected_mrr
def model_implicit_factorization(self,
                                 train: Interactions,
                                 random_state: np.random.RandomState,
                                 hyperparameters: dict = None) -> ImplicitFactorizationModel:
    """Trains a Spotlight implicit matrix factorization model.

    Args:
        train (spotlight.interactions.Interactions): Training set as an
            interactions matrix.
        random_state (np.random.RandomState): Random state to use when fitting.
        hyperparameters (dict, optional): Hyperparameters for the model, either
            sampled by sample_implicit_hyperparameters or left unset to use the
            model defaults. Defaults can be found in the global variable
            DEFAULT_PARAMS.

    Returns:
        spotlight.factorization.implicit.ImplicitFactorizationModel: A Spotlight
            implicit matrix factorization model.
    """
    logger = logging.getLogger()

    if hyperparameters:
        logger.info("Beginning fitting implicit model... \n Hyperparameters: \n {0}".format(
            json.dumps({i: hyperparameters[i] for i in hyperparameters if i != 'use_cuda'})
        ))
        model = ImplicitFactorizationModel(
            loss=hyperparameters['loss'],
            learning_rate=hyperparameters['learning_rate'],
            batch_size=hyperparameters['batch_size'],
            embedding_dim=hyperparameters['embedding_dim'],
            n_iter=hyperparameters['n_iter'],
            l2=hyperparameters['l2'],
            use_cuda=True,
            random_state=random_state
        )
    else:
        logger.info("Beginning fitting implicit model with default hyperparameters...")
        model = ImplicitFactorizationModel(use_cuda=True)

    model.fit(train, verbose=True)

    return model
def run(self, filtering, loss, k):
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    self.filter = filtering
    self.loss = loss
    self.model_name = '_'.join((self.model_name, self.filter, self.loss))
    self.logger(self.model_name)
    logger = logging.getLogger()
    NUM_EPOCHS = 5

    logger.info("Training Spotlight Model, Loss: {}".format(self.loss))

    df_interactions, df_timestamps = self.df[['user_id', 'tag_id', 'count']], self.df['timestamp']
    interactions = self.build_interactions_object(df_interactions, df_timestamps)
    train, test = spotlight_random_train_test_split(interactions)

    logger.info(
        'The dataset has %s users and %s items with %s interactions in the test and '
        '%s interactions in the training set.'
        % (train.num_users, train.num_items, test.tocoo().getnnz(), train.tocoo().getnnz()))

    model = ImplicitFactorizationModel(
        n_iter=NUM_EPOCHS,
        loss=self.loss,
        random_state=RANDOM_STATE,
        use_cuda=True,
        embedding_dim=64,   # latent dimensionality
        batch_size=128,     # minibatch size
        l2=1e-9,            # strength of L2 regularization
        learning_rate=1e-3,
    )

    logger.info("Begin fitting {0} model for {1} epochs...".format(self.loss, NUM_EPOCHS))
    model.fit(train, verbose=True)

    precrec = precision_recall_score(model=model, train=train, test=test, k=k)
    mrr = mrr_score(model=model, train=train, test=test).mean()
    precision = np.mean(precrec[0])
    recall = np.mean(precrec[1])
    fmeasure = 2 * ((precision * recall) / (precision + recall))

    logger.info("Precision@{0}: {1}".format(k, precision))
    logger.info("Recall@{0}: {1}".format(k, recall))
    logger.info("F-Measure: {}".format(fmeasure))
    logger.info("MRR: {}".format(mrr))

    self.model_name = 'spot'
class EmbeddingFactorsRecommender(BaseFactorizationRecommender):

    default_model_params = dict(
        loss='adaptive_hinge',  # or 'bpr', 'hinge', 'pointwise'
        embedding_dim=32,
        n_iter=15,
        batch_size=1024,
        l2=0.0,
        learning_rate=1e-2,
        num_negative_samples=10)

    default_fit_params = dict(verbose=True)

    def _prep_for_fit(self, train_obs, **fit_params):
        # self.toggle_mkl_blas_1_thread(False)
        self._set_data(train_obs)
        self.set_params(**fit_params)
        self.model = ImplicitFactorizationModel(**self.model_params)
        self._set_spotlight_train_data(self.train_mat)

    def _set_spotlight_train_data(self, train_mat):
        self.spotlight_dataset = spotlight_interactions_from_sparse(train_mat)

    def fit(self, train_obs, **fit_params):
        self._prep_for_fit(train_obs, **fit_params)
        self.model.fit(self.spotlight_dataset,
                       verbose=self.fit_params.get('verbose', False))

    def fit_partial(self, train_obs, epochs=1):
        self._set_epochs(epochs)
        if self.model is None:
            self.fit(train_obs)
        else:
            self.model.fit(self.spotlight_dataset)
        return self

    def _set_epochs(self, epochs):
        self.set_params(n_iter=epochs)

    def _predict_on_inds(self, user_inds, item_inds):
        return self.model.predict(user_inds, item_inds)

    def _get_item_factors(self, mode=None):
        return self.model._net.item_biases.weight.data.numpy().ravel(), \
               self.model._net.item_embeddings.weight.data.numpy()

    def _get_user_factors(self, mode=None):
        return self.model._net.user_biases.weight.data.numpy().ravel(), \
               self.model._net.user_embeddings.weight.data.numpy()

    def _predict_rank(self, test_mat, train_mat=None):
        raise NotImplementedError()
def test_implicit_serialization(data):
    train, test = data

    model = ImplicitFactorizationModel(loss='bpr',
                                       n_iter=3,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-6,
                                       use_cuda=CUDA)
    model.fit(train)

    mrr_original = mrr_score(model, test, train=train).mean()
    mrr_recovered = mrr_score(_reload(model), test, train=train).mean()

    assert mrr_original == mrr_recovered
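# test_implicit_serialization() depends on a _reload helper that is not shown
# in this excerpt. A minimal sketch of one plausible implementation, assuming
# a plain pickle round-trip is enough to serialize a Spotlight model:
import pickle


def _reload(model):
    # Serialize to bytes and deserialize again; the recovered model should
    # score identically, which is exactly what the test asserts.
    return pickle.loads(pickle.dumps(model))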
def factorization(train, test, out_dir=None, data_name="empty",
                  repeats=1, verbose=False, **kwargs):
    """Run experiment for dot-product-based models (Factorization Module)."""
    precisions, recalls = [], []
    st = time.time()
    for _ in tqdm(range(repeats)):
        model = ImplicitFactorizationModel(**kwargs)
        model.fit(train, verbose=verbose)
        test_precision, test_recall = precision_recall_score(model, test, train, k=50)
        precisions.append(np.mean(test_precision))
        recalls.append(np.mean(test_recall))
    ts = time.time()

    print("*=" * 40)
    print("data: {} with {} repeats".format(data_name, repeats))
    print("Dot Product Model\n", kwargs)
    print("Average training time: {:.4f}".format((ts - st) / repeats))
    print('Test Precision@50 {:.4f}, Test Recall@50 {:.4f}'.format(
        np.mean(precisions), np.mean(recalls)))

    if out_dir is not None:
        with open(out_dir, "a") as f:
            f.write("*=" * 40 + "\n")
            f.write("data: {} with {} repeats".format(data_name, repeats) + "\n")
            f.write("Dot Product Model\n" + str(kwargs) + "\n")
            f.write("Average training time: {:.4f}".format((ts - st) / repeats) + "\n")
            f.write('Test Precision@50 {:.4f}, Test Recall@50 {:.4f}'.format(
                np.mean(precisions), np.mean(recalls)) + "\n")
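# Hypothetical invocation of factorization() above; the hyperparameter values
# are illustrative and are forwarded via **kwargs to ImplicitFactorizationModel:
#
#     factorization(train, test, out_dir='results.txt', data_name='ml-100k',
#                   repeats=3, loss='bpr', n_iter=10, embedding_dim=64)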
def data():
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ImplicitFactorizationModel(loss='bpr',
                                       n_iter=1,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-6,
                                       random_state=RANDOM_STATE,
                                       use_cuda=CUDA)
    model.fit(train)

    return train, test, model
def test_adaptive_hinge():
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ImplicitFactorizationModel(loss='adaptive_hinge',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-6)
    model.fit(train)

    mrr = mrr_score(model, test, train=train).mean()

    assert mrr > 0.07
def test_bpr():
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ImplicitFactorizationModel(loss='bpr',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-6,
                                       use_cuda=CUDA)
    model.fit(train)

    mrr = mrr_score(model, test, train=train).mean()

    assert mrr + EPSILON > 0.07
def test_bpr_custom_optimizer():
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    def adagrad_optimizer(model_params, lr=1e-2, weight_decay=1e-6):
        return torch.optim.Adagrad(model_params,
                                   lr=lr,
                                   weight_decay=weight_decay)

    model = ImplicitFactorizationModel(loss='bpr',
                                       n_iter=10,
                                       batch_size=1024,
                                       optimizer_func=adagrad_optimizer)
    model.fit(train)

    mrr = mrr_score(model, test, train=train).mean()

    assert mrr > 0.06
# (excerpt begins mid-call; the opening of this constructor is not shown)
                           num_items=num_items)

model = ImplicitFactorizationModel(embedding_dim=n_dimensions,
                                   n_iter=num_minor_iterations,
                                   loss='bpr',
                                   use_cuda=torch.cuda.is_available(),
                                   batch_size=batch_size,
                                   learning_rate=1e-3,
                                   l2=1e-5)

test_user_ids = data.userId.unique()  # keeps order of appearance

for i in tqdm(range(num_major_iterations)):
    print("doing it number {}".format(i))
    save_dir = sim_dir / str(i)
    if not save_dir.exists():
        save_dir.mkdir()

    model.fit(interactions, verbose=True)
    torch.save(model._net.state_dict(), save_dir / "model.pkl")

    with torch.no_grad():
        # Score every item for every test user, one user at a time.
        scores = np.empty((len(test_user_ids), num_items), dtype=np.float32)
        for e, user in enumerate(test_user_ids):
            rating = model.predict(user)
            scores[e] = rating
        scores = torch.as_tensor(scores)
        torch.save(scores, save_dir / "raw_rating_scores.pkl")

        # Standardize each user's scores (zero mean, unit variance per row).
        mean = scores.mean(dim=1, keepdim=True)
        std = scores.std(dim=1, keepdim=True)
        centered_scores = (scores - mean) / std
        torch.save(centered_scores, save_dir / "centered_scores.pkl")
# (excerpt begins inside an `if` branch whose condition is not shown)
    num_layers = int(args[1])
    factor_size = int(args[0])
    config["layers"] = [4 * factor_size] + [
        factor_size * (2 ** i) for i in range(num_layers - 1, -1, -1)
    ]
    config["latent_dim"] = 2 * factor_size
    writer.add_text('config', str(config), 0)
    rep = MLP(config)
else:
    rep = None

model = ImplicitFactorizationModel(
    n_iter=n_iters,
    loss=loss,
    notify_loss_completion=notify_loss_completion,
    notify_batch_eval_completion=notify_batch_eval_completion,
    notify_epoch_completion=notify_epoch_completion,
    log_loss_interval=log_loss_interval,
    log_eval_interval=log_eval_interval,
    betas=betas,
    learning_rate=lr,
    batch_size=batch_size,
    random_state=np.random.RandomState(2),
    num_negative_samples=num_negative_samples,
    l2=l2,
    use_cuda=use_cuda,
    representation=rep)

logger.info("Model is initialized, now fitting...")
model.fit(interactions)
# ExplicitFactorizationModel
emodel = ExplicitFactorizationModel(n_iter=10, embedding_dim=32, use_cuda=False)
emodel.fit(exp_train, verbose=True)
score_emodel = scoreAll(emodel)
print(calc_reciprank(exp_validation, score_emodel, train=exp_train).mean())

# ImplicitFactorizationModel
imodel = ImplicitFactorizationModel(n_iter=10, loss='bpr', embedding_dim=32, use_cuda=False)
imodel.fit(exp_train, verbose=True)
score_imodel_32_on_exp = scoreAll(imodel)
print(calc_reciprank(exp_validation, score_imodel_32_on_exp, train=exp_train).mean())

# ImplicitFactorizationModel is more effective;
# tune the number of latent factors.
imodel_64 = ImplicitFactorizationModel(n_iter=10, loss='bpr', embedding_dim=64, use_cuda=False)
imodel_64.fit(exp_train, verbose=True)
print(calc_reciprank(exp_validation, scoreAll(imodel_64), train=exp_train).mean())

# (excerpt ends mid-call in the original)
imodel_128 = ImplicitFactorizationModel(n_iter=10, loss='bpr', embedding_dim=128,
# (excerpt begins mid-call; the opening of this constructor is not shown)
                       timestamps=timeStamps)
if name == "test":
    dataset_test = dataset
elif name == "train":
    dataset_train = dataset

if model_mode.lower() == "ifm":
    model = ImplicitFactorizationModel(n_iter=n_iter)
if model_mode.lower() == "efm":
    model = ExplicitFactorizationModel(n_iter=n_iter)
if model_mode.lower() == "cnn":
    net = CNNNet(num_items=int(foods_items))
    model = ImplicitSequenceModel(n_iter=n_iter,
                                  use_cuda=torch.cuda.is_available(),
                                  representation=net)

model.fit(dataset_train)
with open(save_file, 'wb') as f:
    pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)

if model_mode.lower() == "cnn":
    mrr = sequence_mrr_score(model, dataset_test)
else:
    mrr = mrr_score(model, dataset_test)

# mrr is an array of per-interaction reciprocal ranks.
print("mrr = ", len(mrr))
print("mean mrr = ", sum(mrr) / len(mrr))
rank = 1 / (sum(mrr) / len(mrr))  # reciprocal of the mean reciprocal rank
print("average rank = ", rank)
def objective(hyper):
    print(hyper)

    start = time.perf_counter()  # time.clock() was removed in Python 3.8

    if hyper['model']['type'] == 'lsh':
        num_hashes = int(hyper['model']['num_hash_functions'])
        num_layers = int(hyper['model']['num_layers'])
        nonlinearity = hyper['model']['nonlinearity']
        residual = hyper['model']['residual']
        embed = hyper['model']['embed']
        gated = hyper['model']['gated']

        item_embeddings = LSHEmbedding(train.num_items,
                                       int(hyper['embedding_dim']),
                                       embed=embed,
                                       gated=gated,
                                       residual_connections=residual,
                                       nonlinearity=nonlinearity,
                                       num_layers=num_layers,
                                       num_hash_functions=num_hashes)
        item_embeddings.fit(train.tocsr().T)

        user_embeddings = LSHEmbedding(train.num_users,
                                       int(hyper['embedding_dim']),
                                       embed=embed,
                                       gated=gated,
                                       residual_connections=residual,
                                       nonlinearity=nonlinearity,
                                       num_layers=num_layers,
                                       num_hash_functions=num_hashes)
        user_embeddings.fit(train.tocsr())
    else:
        user_embeddings = ScaledEmbedding(train.num_users,
                                          int(hyper['embedding_dim']),
                                          padding_idx=0)
        item_embeddings = ScaledEmbedding(train.num_items,
                                          int(hyper['embedding_dim']),
                                          padding_idx=0)

    network = BilinearNet(train.num_users,
                          train.num_items,
                          user_embedding_layer=user_embeddings,
                          item_embedding_layer=item_embeddings)

    model = ImplicitFactorizationModel(
        loss=hyper['loss'],
        n_iter=int(hyper['n_iter']),
        batch_size=int(hyper['batch_size']),
        learning_rate=hyper['learning_rate'],
        embedding_dim=int(hyper['embedding_dim']),
        l2=hyper['l2'],
        representation=network,
        use_cuda=CUDA,
        random_state=random_state)

    model.fit(train, verbose=True)
    elapsed = time.perf_counter() - start
    print(model)

    validation_mrr = mrr_score(model, validation, train=train).mean()
    test_mrr = mrr_score(model, test,
                         train=train.tocsr() + validation.tocsr()).mean()

    print('MRR {} {}'.format(validation_mrr, test_mrr))

    return {
        'loss': -validation_mrr,
        'status': STATUS_OK,
        'validation_mrr': validation_mrr,
        'test_mrr': test_mrr,
        'elapsed': elapsed,
        'hyper': hyper,
    }
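# objective() returns a hyperopt-style result dict (note STATUS_OK), so it is
# presumably driven by hyperopt's fmin. A minimal sketch of such a driver;
# this search space is an illustrative assumption, not the original one:
from hyperopt import Trials, fmin, hp, tpe

space = {
    'model': {'type': 'bilinear'},  # anything but 'lsh' takes the else branch
    'loss': hp.choice('loss', ['bpr', 'adaptive_hinge']),
    'n_iter': hp.quniform('n_iter', 5, 50, 5),
    'batch_size': hp.quniform('batch_size', 256, 4096, 256),
    'learning_rate': hp.loguniform('learning_rate', -6, -1),
    'embedding_dim': hp.quniform('embedding_dim', 16, 128, 16),
    'l2': hp.loguniform('l2', -11, -3),
}

trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, max_evals=50, trials=trials)
print('Best hyperparameters:', best)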
no_components = 30
loss = 'pointwise'
batch_size = 64
learning_rate = 0.1
l2 = 1e-7
epochs = 8

model = ImplicitFactorizationModel(loss=loss,
                                   embedding_dim=no_components,
                                   learning_rate=learning_rate,
                                   batch_size=batch_size,
                                   n_iter=epochs,
                                   l2=l2)
model.fit(training_interactions, verbose=True)
print('[ %04ds ] Model fitted' % (time.time() - start_time))

testing_set: List[Review] = Review.load_from_file(testing_set_file)
seen_testing_set, unseen_testing_set = Review.extract_seen_reviews(
    testing_set, training_set)
print(len(seen_testing_set), len(unseen_testing_set))

normalized_seen_testing_set = Review.normalize_by_user(
    seen_testing_set, user_avg)
seen_pairs, ground_truth = Review.extract_sparse_testing_matrix_and_ground_truth(
    normalized_seen_testing_set)

testing_user_ids = []
testing_business_ids = []
for user, business in seen_pairs:
    testing_user_ids.append(user_id_map[user])
    testing_business_ids.append(business_id_map[business])
# (excerpt begins mid-dict; earlier hyperparameter entries are not shown)
    'learning_rate': 0.0048015875347904155,
    'loss': 'adaptive_hinge',
    'n_iter': 100.0,
    'num_components': 3.0,
    'type': 'mixture'
}

train, validation, test = load_data('100K', random_state)

representation = EmbeddingMixtureNet(train.num_users,
                                     train.num_items,
                                     num_components=int(hyper['num_components']),
                                     embedding_dim=int(hyper['embedding_dim']))
# representation = BilinearNet(train.num_users,
#                              train.num_items,
#                              embedding_dim=int(hyper['embedding_dim']))

model = ImplicitFactorizationModel(loss=hyper['loss'],
                                   batch_size=int(hyper['batch_size']),
                                   representation=representation,
                                   learning_rate=hyper['learning_rate'],
                                   n_iter=int(hyper['n_iter']),
                                   l2=hyper['l2'],
                                   use_cuda=CUDA,
                                   random_state=np.random.RandomState(42))

model.fit(train, verbose=True)
model._net.train(False)  # switch the underlying network to eval mode

test_mrr = _evaluate(model, test, train.tocsr() + validation.tocsr())
print('Test MRR {}'.format(test_mrr))
print(model)