def test_hdf5_interactions_dataloader_attributes(df_for_interactions, hdf5_pandas_df_path):
    interactions_dl = InteractionsDataLoader(users=df_for_interactions['user_id'],
                                             items=df_for_interactions['item_id'],
                                             num_negative_samples=5)
    hdf5_interactions_dl = HDF5InteractionsDataLoader(hdf5_path=hdf5_pandas_df_path,
                                                      user_col='user_id',
                                                      item_col='item_id',
                                                      num_negative_samples=5)

    assert hdf5_interactions_dl.num_users == interactions_dl.num_users
    assert hdf5_interactions_dl.num_items == interactions_dl.num_items
    assert hdf5_interactions_dl.num_negative_samples == interactions_dl.num_negative_samples
    assert hdf5_interactions_dl.num_interactions == interactions_dl.num_interactions

    with pytest.raises(AttributeError):
        hdf5_interactions_dl.mat
def __init__(self,
             train: interactions_like_input = None,
             val: interactions_like_input = None,
             lr: float = 1e-3,
             lr_scheduler_func: Optional[Callable] = None,
             weight_decay: float = 0.0,
             optimizer: Union[str, Callable] = 'adam',
             loss: Union[str, Callable] = 'hinge',
             metadata_for_loss: Optional[Dict[str, torch.tensor]] = None,
             metadata_for_loss_weights: Optional[Dict[str, float]] = None,
             load_model_path: Optional[str] = None,
             map_location: Optional[str] = None,
             **kwargs):
    if isinstance(train, Interactions):
        train = InteractionsDataLoader(interactions=train, shuffle=True)
    if isinstance(val, Interactions):
        val = InteractionsDataLoader(interactions=val, shuffle=False)

    super().__init__()

    # save datasets as class-level attributes and NOT ``hparams`` so model checkpointing /
    # saving can complete faster
    self.train_loader = train
    self.val_loader = val

    # a potential issue with PyTorch Lightning is that a function cannot be saved as a
    # hyperparameter, so we sidestep this by setting it as a class-level attribute
    # https://github.com/PyTorchLightning/pytorch-lightning/issues/2444
    self.lr_scheduler_func = lr_scheduler_func
    self.loss = loss
    self.optimizer = optimizer
    self.bias_optimizer = kwargs.get('bias_optimizer')

    if load_model_path is not None:
        # we are loading in a previously-saved model, not creating a new one
        self._load_model_init_helper(load_model_path=load_model_path,
                                     map_location=map_location,
                                     **kwargs)
    else:
        if self.train_loader is None:
            pass
        elif self.val_loader is not None:
            assert self.train_loader.num_users == self.val_loader.num_users, (
                'Training and val ``num_users`` must be equal: '
                f'{self.train_loader.num_users} != {self.val_loader.num_users}.'
            )
            assert self.train_loader.num_items == self.val_loader.num_items, (
                'Training and val ``num_items`` must be equal: '
                f'{self.train_loader.num_items} != {self.val_loader.num_items}.'
            )

            num_negative_samples_error = (
                'Training and val ``num_negative_samples`` must both equal ``1`` or both be'
                f' greater than ``1``, not: {self.train_loader.num_negative_samples} and'
                f' {self.val_loader.num_negative_samples}, respectively.'
            )

            if self.train_loader.num_negative_samples == 1:
                assert self.val_loader.num_negative_samples == 1, num_negative_samples_error
            elif self.train_loader.num_negative_samples > 1:
                assert self.val_loader.num_negative_samples > 1, num_negative_samples_error
            else:
                raise ValueError(
                    '``self.train_loader.num_negative_samples`` must be greater than ``0``, not'
                    f' {self.train_loader.num_negative_samples}.'
                )

        # saves all passed-in parameters
        init_args = get_init_arguments(
            exclude=['train', 'val', 'item_metadata', 'trained_model'],
            verbose=False,
        )
        self.save_hyperparameters(init_args, *kwargs.keys())

        self.hparams.num_users = self.train_loader.num_users
        self.hparams.num_items = self.train_loader.num_items
        self.hparams.n_epochs_completed_ = 0

        self._configure_loss()

        # check weight decay and sparsity
        if hasattr(self.hparams, 'sparse'):
            if self.hparams.sparse and self.hparams.weight_decay != 0:
                warnings.warn(
                    textwrap.dedent(
                        f'''
                        ``weight_decay`` value must be 0 when ``sparse`` is flagged, not
                        {self.hparams.weight_decay}. Setting to 0.
                        '''
                    ).replace('\n', ' ').strip()
                )
                self.hparams.weight_decay = 0.0

        # set up the actual model
        self._setup_model(**kwargs)
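# Illustrative usage sketch (not part of the constructor above): it shows the two equivalent ways
# the ``train``/``val`` arguments can be supplied, since the first lines of ``__init__`` wrap a
# bare ``Interactions`` dataset in an ``InteractionsDataLoader``. The variables
# ``train_interactions`` and ``val_interactions`` are hypothetical placeholders, and
# ``MatrixFactorizationModel`` stands in for any model subclass built on this ``__init__``.
model_from_datasets = MatrixFactorizationModel(train=train_interactions,
                                               val=val_interactions,
                                               loss='bpr')
model_from_loaders = MatrixFactorizationModel(
    train=InteractionsDataLoader(interactions=train_interactions, shuffle=True),
    val=InteractionsDataLoader(interactions=val_interactions, shuffle=False),
    loss='bpr',
)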
def test_all_data_loaders_output_equal(df_for_interactions, hdf5_pandas_df_path, tmpdir, capfd):
    common_data_loader_kwargs = {
        'num_negative_samples': 4,
        'batch_size': 5,
        'shuffle': False,
        'drop_last': False,
    }

    interactions_dl = InteractionsDataLoader(users=df_for_interactions['user_id'],
                                             items=df_for_interactions['item_id'],
                                             **common_data_loader_kwargs)
    approx_dl = ApproximateNegativeSamplingInteractionsDataLoader(
        users=df_for_interactions['user_id'],
        items=df_for_interactions['item_id'],
        **common_data_loader_kwargs,
    )
    hdf5_interactions_dl = HDF5InteractionsDataLoader(hdf5_path=hdf5_pandas_df_path,
                                                      user_col='user_id',
                                                      item_col='item_id',
                                                      **common_data_loader_kwargs)

    expected_repr = (
        '{} object with 12 interactions between 6 users and 10 items, returning 4 negative samples'
        ' per interaction in non-shuffled batches of size 5.'
    )

    assert str(interactions_dl) == expected_repr.format(str(type(interactions_dl).__name__))
    assert str(approx_dl) == expected_repr.format(str(type(approx_dl).__name__))
    assert (
        str(hdf5_interactions_dl) == expected_repr.format(str(type(hdf5_interactions_dl).__name__))
    )

    assert interactions_dl.num_users == approx_dl.num_users == hdf5_interactions_dl.num_users
    assert interactions_dl.num_items == approx_dl.num_items == hdf5_interactions_dl.num_items
    assert (
        interactions_dl.num_interactions
        == approx_dl.num_interactions
        == hdf5_interactions_dl.num_interactions
    )

    # get all batches from every DataLoader, add them to a list for comparison below
    def get_all_batches_from_DataLoader(dataloader, batch_size):
        all_batches = list()

        for idx, batch in enumerate(dataloader):
            assert len(batch[0][0]) == len(batch[0][1]) == len(batch[1])

            if idx < len(dataloader) - 1:
                assert len(batch[0][0]) == batch_size

            all_batches.append(batch)

        return all_batches

    interactions_batches = get_all_batches_from_DataLoader(interactions_dl,
                                                           batch_size=interactions_dl.batch_size)
    approximate_batches = get_all_batches_from_DataLoader(
        approx_dl,
        batch_size=approx_dl.approximate_negative_sampler.batch_size,
    )
    hdf5_batches = get_all_batches_from_DataLoader(
        hdf5_interactions_dl,
        batch_size=hdf5_interactions_dl.hdf5_sampler.batch_size,
    )

    for idx in range(len(interactions_batches)):
        assert (
            interactions_batches[idx][0][0].tolist()
            == approximate_batches[idx][0][0].tolist()
            == hdf5_batches[idx][0][0].tolist()
        )
        assert (
            interactions_batches[idx][0][1].tolist()
            == approximate_batches[idx][0][1].tolist()
            == hdf5_batches[idx][0][1].tolist()
        )

        # random negative samples will never be exactly equal
        assert (
            interactions_batches[idx][1].shape
            == approximate_batches[idx][1].shape
            == hdf5_batches[idx][1].shape
        )

    # test that our last batch is less than the specified batch size and that is okay for HDF5 data
    assert len(interactions_batches[-1][0][0]) < interactions_dl.batch_size
    assert len(approximate_batches[-1][0][0]) < approx_dl.approximate_negative_sampler.batch_size
    assert len(hdf5_batches[-1][0][0]) < hdf5_interactions_dl.hdf5_sampler.batch_size
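# Note on the indexing used in the test above (inferred from its assertions, not from library
# documentation): each batch appears to be a nested tuple of the form
# ``((user_ids, item_ids), negative_item_ids)``, so ``batch[0][0]`` holds the users,
# ``batch[0][1]`` the positive items, and ``batch[1]`` the sampled negative items with shape
# ``(batch_size, num_negative_samples)``. Inside the helper loop one could unpack a batch as:
#
#     (user_ids, item_ids), negative_item_ids = batch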
def models_trained_for_one_step(request,
                                train_val_implicit_data,
                                movielens_metadata_df,
                                movielens_implicit_df,
                                train_val_implicit_pandas_data,
                                gpu_count):
    train, val = train_val_implicit_data

    if request.param == 'mf_hdf5':
        # create, fit, and return the model all at once so we can close the HDF5 file
        train_pandas_df, val_pandas_df = train_val_implicit_pandas_data

        with tempfile.TemporaryDirectory() as temp_dir:
            pandas_df_to_hdf5(df=train_pandas_df,
                              out_path=os.path.join(temp_dir, 'train.h5'),
                              key='interactions')
            pandas_df_to_hdf5(df=val_pandas_df,
                              out_path=os.path.join(temp_dir, 'val.h5'),
                              key='interactions')

            train_loader = HDF5InteractionsDataLoader(hdf5_path=os.path.join(temp_dir, 'train.h5'),
                                                      user_col='user_id',
                                                      item_col='item_id',
                                                      num_users=train.num_users,
                                                      num_items=train.num_items,
                                                      batch_size=1024,
                                                      shuffle=True)
            val_loader = HDF5InteractionsDataLoader(hdf5_path=os.path.join(temp_dir, 'val.h5'),
                                                    user_col='user_id',
                                                    item_col='item_id',
                                                    num_users=val.num_users,
                                                    num_items=val.num_items,
                                                    batch_size=1024,
                                                    shuffle=False)

            model = MatrixFactorizationModel(train=train_loader,
                                             val=val_loader,
                                             embedding_dim=15,
                                             dropout_p=0.1,
                                             lr=1e-1,
                                             bias_lr=1e-2,
                                             optimizer='adam',
                                             bias_optimizer='sgd',
                                             weight_decay=1e-7,
                                             loss='bpr',
                                             sparse=False)
            model_trainer = CollieTrainer(model=model,
                                          gpus=gpu_count,
                                          max_steps=1,
                                          deterministic=True,
                                          logger=False,
                                          checkpoint_callback=False)
            model_trainer.fit(model)
            model.freeze()

            return model
    elif request.param == 'sparse_mf':
        model = MatrixFactorizationModel(train=train,
                                         val=val,
                                         embedding_dim=15,
                                         dropout_p=0.1,
                                         lr=1e-1,
                                         bias_lr=1e-2,
                                         optimizer='sparse_adam',
                                         bias_optimizer='sgd',
                                         weight_decay=0,
                                         loss='hinge',
                                         sparse=True)
    elif request.param == 'mf_no_val':
        model = MatrixFactorizationModel(train=train, val=None)
    elif request.param == 'mf_non_approximate' or request.param == 'mf_approximate':
        if request.param == 'mf_non_approximate':
            train_loader = InteractionsDataLoader(interactions=train, batch_size=1024, shuffle=True)
            val_loader = InteractionsDataLoader(interactions=val, batch_size=1024, shuffle=False)
        else:
            train_loader = ApproximateNegativeSamplingInteractionsDataLoader(interactions=train,
                                                                             batch_size=1024,
                                                                             shuffle=True)
            val_loader = ApproximateNegativeSamplingInteractionsDataLoader(interactions=val,
                                                                           batch_size=1024,
                                                                           shuffle=False)

        model = MatrixFactorizationModel(train=train_loader,
                                         val=val_loader,
                                         embedding_dim=15,
                                         dropout_p=0.1,
                                         lr=1e-1,
                                         bias_lr=1e-2,
                                         optimizer='adam',
                                         bias_optimizer='sgd',
                                         weight_decay=1e-7,
                                         loss='bpr',
                                         sparse=False)
    elif request.param == 'mf_with_y_range':
        model = MatrixFactorizationModel(train=train, val=val, y_range=(0, 4))
    elif request.param == 'nonlinear_mf':
        model = NonlinearMatrixFactorizationModel(train=train,
                                                  val=val,
                                                  user_embedding_dim=15,
                                                  item_embedding_dim=15,
                                                  user_dense_layers_dims=[15, 10],
                                                  item_dense_layers_dims=[15, 10],
                                                  embedding_dropout_p=0.05,
                                                  dense_dropout_p=0.1,
                                                  lr=1e-1,
                                                  bias_lr=1e-2,
                                                  optimizer='adam',
                                                  bias_optimizer='sgd',
                                                  weight_decay=1e-7,
                                                  loss='bpr')
    elif request.param == 'nonlinear_mf_with_y_range':
        model = NonlinearMatrixFactorizationModel(train=train, val=val, y_range=(0, 4))
    elif request.param == 'neucf':
        model = NeuralCollaborativeFiltering(train=train,
                                             val=val,
                                             embedding_dim=10,
                                             num_layers=1,
                                             dropout_p=0.1,
                                             lr=1e-3,
                                             weight_decay=0.,
                                             optimizer='adam',
                                             loss='adaptive')
    elif request.param == 'neucf_sigmoid':
        model = NeuralCollaborativeFiltering(train=train, val=val, final_layer='sigmoid')
    elif request.param == 'neucf_relu':
        model = NeuralCollaborativeFiltering(train=train, val=val, final_layer='relu')
    elif request.param == 'neucf_leaky_rulu':
        model = NeuralCollaborativeFiltering(train=train, val=val, final_layer='leaky_relu')
    elif request.param == 'neucf_custom':
        model = NeuralCollaborativeFiltering(train=train, val=val, final_layer=torch.tanh)
    elif (
        request.param == 'hybrid_pretrained'
        or request.param == 'hybrid_pretrained_metadata_layers'
    ):
        implicit_model = MatrixFactorizationModel(train=train,
                                                  val=val,
                                                  embedding_dim=10,
                                                  lr=1e-1,
                                                  optimizer='adam')
        implicit_model_trainer = CollieTrainer(model=implicit_model,
                                               gpus=gpu_count,
                                               max_steps=1,
                                               deterministic=True,
                                               logger=False,
                                               checkpoint_callback=False)
        implicit_model_trainer.fit(implicit_model)
        implicit_model.freeze()

        genres = (
            torch.tensor(movielens_metadata_df[
                [c for c in movielens_metadata_df.columns if 'genre' in c]
            ].values)
            .topk(1)
            .indices
            .view(-1)
        )

        if request.param == 'hybrid_pretrained_metadata_layers':
            metadata_layers_dims = [16, 12]
        else:
            metadata_layers_dims = None

        model_frozen = HybridPretrainedModel(train=train,
                                             val=val,
                                             item_metadata=movielens_metadata_df,
                                             trained_model=implicit_model,
                                             metadata_layers_dims=metadata_layers_dims,
                                             freeze_embeddings=True,
                                             dropout_p=0.15,
                                             loss='warp',
                                             lr=.01,
                                             optimizer=torch.optim.Adam,
                                             metadata_for_loss={'genre': genres},
                                             metadata_for_loss_weights={'genre': .4},
                                             weight_decay=0.0)
        model_frozen_trainer = CollieTrainer(model=model_frozen,
                                             gpus=gpu_count,
                                             max_steps=1,
                                             deterministic=True,
                                             logger=False,
                                             checkpoint_callback=False)
        model_frozen_trainer.fit(model_frozen)

        model = HybridPretrainedModel(train=train,
                                      val=val,
                                      item_metadata=movielens_metadata_df,
                                      trained_model=implicit_model,
                                      metadata_layers_dims=metadata_layers_dims,
                                      freeze_embeddings=False,
                                      dropout_p=0.15,
                                      loss='bpr',
                                      lr=1e-4,
                                      optimizer=torch.optim.Adam,
                                      metadata_for_loss={'genre': genres},
                                      metadata_for_loss_weights={'genre': .4},
                                      weight_decay=0.0)
        model.load_from_hybrid_model(model_frozen)

    model_trainer = CollieTrainer(model=model,
                                  gpus=gpu_count,
                                  max_steps=1,
                                  deterministic=True,
                                  logger=False,
                                  checkpoint_callback=False)

    if request.param == 'mf_no_val':
        with pytest.warns(UserWarning):
            model_trainer.fit(model)
    else:
        model_trainer.fit(model)

    model.freeze()

    return model
def test_all_data_loaders_output_equal(df_for_interactions, hdf5_pandas_df_path, tmpdir, capfd):
    common_data_loader_kwargs = {
        'num_negative_samples': 4,
        'batch_size': 5,
        'shuffle': False,
        'drop_last': False,
    }

    interactions_dl = InteractionsDataLoader(users=df_for_interactions['user_id'],
                                             items=df_for_interactions['item_id'],
                                             **common_data_loader_kwargs)
    approx_dl = ApproximateNegativeSamplingInteractionsDataLoader(
        users=df_for_interactions['user_id'],
        items=df_for_interactions['item_id'],
        **common_data_loader_kwargs,
    )
    hdf5_interactions_dl = HDF5InteractionsDataLoader(hdf5_path=hdf5_pandas_df_path,
                                                      user_col='user_id',
                                                      item_col='item_id',
                                                      **common_data_loader_kwargs)

    out, _ = capfd.readouterr()
    assert '``meta`` key not found - generating ``num_users`` and ``num_items``' in out

    assert interactions_dl.num_users == approx_dl.num_users == hdf5_interactions_dl.num_users
    assert interactions_dl.num_items == approx_dl.num_items == hdf5_interactions_dl.num_items
    assert (
        interactions_dl.num_interactions
        == approx_dl.num_interactions
        == hdf5_interactions_dl.num_interactions
    )

    interactions_dl_first_batch = next(iter(interactions_dl))
    approx_dl_first_batch = next(iter(approx_dl))
    hdf5_interactions_dl_first_batch = next(iter(hdf5_interactions_dl))

    # test the shapes of outputs are equal
    for idx in range(len(interactions_dl_first_batch[0])):
        assert (
            len(interactions_dl_first_batch[0][idx])
            == len(approx_dl_first_batch[0][idx])
            == len(hdf5_interactions_dl_first_batch[0][idx])
            == common_data_loader_kwargs['batch_size']
        )

    assert (
        interactions_dl_first_batch[1].shape
        == approx_dl_first_batch[1].shape
        == hdf5_interactions_dl_first_batch[1].shape
        == (common_data_loader_kwargs['batch_size'],
            common_data_loader_kwargs['num_negative_samples'])
    )

    # get all batches from every DataLoader, add them to a list for comparison below
    interactions_batches = list()
    for idx, batch in enumerate(interactions_dl):
        assert len(batch[0][0]) == len(batch[0][1]) == len(batch[1])

        if idx < len(interactions_dl) - 1:
            assert len(batch[0][0]) == interactions_dl.batch_size

        interactions_batches.append(batch)

    approximate_batches = list()
    for idx, batch in enumerate(approx_dl):
        assert len(batch[0][0]) == len(batch[0][1]) == len(batch[1])

        if idx < len(approx_dl) - 1:
            assert len(batch[0][0]) == approx_dl.approximate_negative_sampler.batch_size

        approximate_batches.append(batch)

    hdf5_batches = list()
    for idx, batch in enumerate(hdf5_interactions_dl):
        assert len(batch[0][0]) == len(batch[0][1]) == len(batch[1])

        if idx < len(hdf5_interactions_dl) - 1:
            assert len(batch[0][0]) == hdf5_interactions_dl.hdf5_sampler.batch_size

        hdf5_batches.append(batch)

    for idx in range(len(interactions_batches)):
        assert (
            interactions_batches[idx][0][0].tolist()
            == approximate_batches[idx][0][0].tolist()
            == hdf5_batches[idx][0][0].tolist()
        )
        assert (
            interactions_batches[idx][0][1].tolist()
            == approximate_batches[idx][0][1].tolist()
            == hdf5_batches[idx][0][1].tolist()
        )

        # random negative samples will never be exactly equal
        assert (
            interactions_batches[idx][1].shape
            == approximate_batches[idx][1].shape
            == hdf5_batches[idx][1].shape
        )

    interactions_last_batch = interactions_batches[-1]
    approximate_last_batch = approximate_batches[-1]
    hdf5_last_batch = hdf5_batches[-1]

    # test that our last batch is less than the specified batch size and that is okay for HDF5 data
    assert len(interactions_last_batch[0][0]) < interactions_dl.batch_size
    assert len(approximate_last_batch[0][0]) < approx_dl.approximate_negative_sampler.batch_size
    assert len(hdf5_last_batch[0][0]) < hdf5_interactions_dl.hdf5_sampler.batch_size
def run_movielens_example(epochs: int = 20, gpus: int = 0) -> None:
    """
    Retrieve and split data, train and evaluate a model, and save it.

    From the terminal, you can run this script with:

    .. code-block:: bash

        python collie_recs/movielens/run.py --epochs 20

    Parameters
    ----------
    epochs: int
        Number of epochs for model training
    gpus: int
        Number of gpus to train on

    """
    t = Timer()

    t.timecheck(' 1.0 - retrieving MovieLens 100K dataset')
    df = read_movielens_df(decrement_ids=True)
    t.timecheck(' 1.0 complete')

    t.timecheck(' 2.0 - splitting data')
    df_imp = convert_to_implicit(df)
    interactions = Interactions(users=df_imp['user_id'],
                                items=df_imp['item_id'],
                                allow_missing_ids=True)
    train, val, test = stratified_split(interactions, val_p=0.1, test_p=0.1)
    train_loader = InteractionsDataLoader(train, batch_size=1024, shuffle=True)
    val_loader = InteractionsDataLoader(val, batch_size=1024, shuffle=False)
    t.timecheck(' 2.0 complete')

    t.timecheck(' 3.0 - training the model')
    model = MatrixFactorizationModel(train=train_loader,
                                     val=val_loader,
                                     dropout_p=0.05,
                                     loss='adaptive',
                                     lr=5e-2,
                                     embedding_dim=10,
                                     optimizer='adam',
                                     weight_decay=1e-7)
    trainer = CollieTrainer(model=model,
                            gpus=gpus,
                            max_epochs=epochs,
                            deterministic=True,
                            logger=False,
                            checkpoint_callback=False,
                            callbacks=[EarlyStopping(monitor='val_loss_epoch', mode='min')],
                            weights_summary='full',
                            terminate_on_nan=True)
    trainer.fit(model)
    model.eval()
    t.timecheck('\n 3.0 complete')

    t.timecheck(' 4.0 - evaluating model')
    auc_score, mrr_score, mapk_score = evaluate_in_batches([auc, mrr, mapk], test, model, k=10)
    print(f'AUC: {auc_score}')
    print(f'MRR: {mrr_score}')
    print(f'MAP@10: {mapk_score}')
    t.timecheck(' 4.0 complete')

    t.timecheck(' 5.0 - saving model')
    absolute_data_path = DATA_PATH / 'fitted_model'
    model.save_model(absolute_data_path)
    t.timecheck(' 5.0 complete')
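# Hedged sketch of the command-line entry point implied by the docstring above
# (``python collie_recs/movielens/run.py --epochs 20``); the real ``run.py`` may parse its
# arguments differently, so treat this ``argparse`` wiring as an assumption rather than the
# library's actual CLI.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Train, evaluate, and save a MovieLens 100K model.')
    parser.add_argument('--epochs', type=int, default=20, help='number of epochs for model training')
    parser.add_argument('--gpus', type=int, default=0, help='number of gpus to train on')
    args = parser.parse_args()

    run_movielens_example(epochs=args.epochs, gpus=args.gpus)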