Example #1
# imports assumed from the collie_recs package layout
import pytest

from collie_recs.interactions import HDF5InteractionsDataLoader, InteractionsDataLoader


def test_hdf5_interactions_dataloader_attributes(df_for_interactions, hdf5_pandas_df_path):
    interactions_dl = InteractionsDataLoader(users=df_for_interactions['user_id'],
                                             items=df_for_interactions['item_id'],
                                             num_negative_samples=5)

    hdf5_interactions_dl = HDF5InteractionsDataLoader(hdf5_path=hdf5_pandas_df_path,
                                                      user_col='user_id',
                                                      item_col='item_id',
                                                      num_negative_samples=5)

    assert hdf5_interactions_dl.num_users == interactions_dl.num_users
    assert hdf5_interactions_dl.num_items == interactions_dl.num_items
    assert hdf5_interactions_dl.num_negative_samples == interactions_dl.num_negative_samples
    assert hdf5_interactions_dl.num_interactions == interactions_dl.num_interactions

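    # unlike the in-memory ``InteractionsDataLoader``, the HDF5 loader reads interactions
    # from disk and never materializes an in-memory ``mat`` attribute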
    with pytest.raises(AttributeError):
        hdf5_interactions_dl.mat
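
# A minimal sketch of producing the file behind the ``hdf5_pandas_df_path`` fixture,
# assuming ``pandas_df_to_hdf5`` from ``collie_recs.utils`` (called with this signature in
# Example #4); the toy DataFrame is hypothetical:
import pandas as pd

from collie_recs.utils import pandas_df_to_hdf5

toy_df = pd.DataFrame({'user_id': [0, 0, 1, 2], 'item_id': [1, 3, 2, 0]})
pandas_df_to_hdf5(df=toy_df, out_path='interactions.h5', key='interactions')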
Example #2
    def __init__(self,
                 train: interactions_like_input = None,
                 val: interactions_like_input = None,
                 lr: float = 1e-3,
                 lr_scheduler_func: Optional[Callable] = None,
                 weight_decay: float = 0.0,
                 optimizer: Union[str, Callable] = 'adam',
                 loss: Union[str, Callable] = 'hinge',
                 metadata_for_loss: Optional[Dict[str, torch.Tensor]] = None,
                 metadata_for_loss_weights: Optional[Dict[str, float]] = None,
                 load_model_path: Optional[str] = None,
                 map_location: Optional[str] = None,
                 **kwargs):
        if isinstance(train, Interactions):
            train = InteractionsDataLoader(interactions=train, shuffle=True)
        if isinstance(val, Interactions):
            val = InteractionsDataLoader(interactions=val, shuffle=False)

        super().__init__()

        # save dataloaders as instance attributes and NOT in ``hparams`` so model
        # checkpointing / saving can complete faster
        self.train_loader = train
        self.val_loader = val

        # a potential issue with PyTorch Lightning is that a function cannot be saved as a
        # hyperparameter, so we sidestep this by setting it as an instance attribute
        # https://github.com/PyTorchLightning/pytorch-lightning/issues/2444
        self.lr_scheduler_func = lr_scheduler_func
        self.loss = loss
        self.optimizer = optimizer
        self.bias_optimizer = kwargs.get('bias_optimizer')

        if load_model_path is not None:
            # we are loading in a previously-saved model, not creating a new one
            self._load_model_init_helper(load_model_path=load_model_path,
                                         map_location=map_location,
                                         **kwargs)
        else:
            if self.train_loader is None:
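                # nothing to validate when no training data was provided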
                pass
            elif self.val_loader is not None:
                assert self.train_loader.num_users == self.val_loader.num_users, (
                    'Training and val ``num_users`` must be equal: '
                    f'{self.train_loader.num_users} != {self.val_loader.num_users}.'
                )
                assert self.train_loader.num_items == self.val_loader.num_items, (
                    'Training and val ``num_items`` must be equal: '
                    f'{self.train_loader.num_items} != {self.val_loader.num_items}.'
                )

                num_negative_samples_error = (
                    'Training and val ``num_negative_samples`` property must both equal ``1``'
                    ' or both be greater than ``1``, not:'
                    f' {self.train_loader.num_negative_samples} and'
                    f' {self.val_loader.num_negative_samples}, respectively.')
                if self.train_loader.num_negative_samples == 1:
                    assert self.val_loader.num_negative_samples == 1, num_negative_samples_error
                elif self.train_loader.num_negative_samples > 1:
                    assert self.val_loader.num_negative_samples > 1, num_negative_samples_error
                else:
                    raise ValueError(
                        '``self.train_loader.num_negative_samples`` must be greater than ``0``, not'
                        f' {self.train_loader.num_negative_samples}.')

            # saves all passed-in parameters
            init_args = get_init_arguments(
                exclude=['train', 'val', 'item_metadata', 'trained_model'],
                verbose=False,
            )

            self.save_hyperparameters(init_args, *kwargs.keys())

            self.hparams.num_users = self.train_loader.num_users
            self.hparams.num_items = self.train_loader.num_items
            self.hparams.n_epochs_completed_ = 0

            self._configure_loss()

            # check weight decay and sparsity
            if hasattr(self.hparams, 'sparse'):
                if self.hparams.sparse and self.hparams.weight_decay != 0:
                    warnings.warn(
                        textwrap.dedent(f'''
                            ``weight_decay`` value must be 0 when ``sparse`` is flagged, not
                            {self.hparams.weight_decay}. Setting to 0.
                            ''').replace('\n', ' ').strip())
                    self.hparams.weight_decay = 0.0

            # set up the actual model
            self._setup_model(**kwargs)
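
# Usage sketch for the ``__init__`` above: models that accept these arguments (e.g.
# ``MatrixFactorizationModel`` in the other examples) wrap raw ``Interactions`` in an
# ``InteractionsDataLoader`` automatically -- shuffled for train, unshuffled for val.
# ``train_interactions`` / ``val_interactions`` are hypothetical ``Interactions`` objects:
from collie_recs.model import MatrixFactorizationModel

model = MatrixFactorizationModel(train=train_interactions, val=val_interactions)

# ...while ``load_model_path`` restores a previously-saved model instead of building a new
# one (path value assumed):
model = MatrixFactorizationModel(load_model_path='fitted_model', map_location='cpu')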
Example #3
# imports assumed from the collie_recs package layout
from collie_recs.interactions import (ApproximateNegativeSamplingInteractionsDataLoader,
                                      HDF5InteractionsDataLoader,
                                      InteractionsDataLoader)


def test_all_data_loaders_output_equal(df_for_interactions, hdf5_pandas_df_path, tmpdir, capfd):
    common_data_loader_kwargs = {
        'num_negative_samples': 4,
        'batch_size': 5,
        'shuffle': False,
        'drop_last': False,
    }

    interactions_dl = InteractionsDataLoader(users=df_for_interactions['user_id'],
                                             items=df_for_interactions['item_id'],
                                             **common_data_loader_kwargs)
    approx_dl = (
        ApproximateNegativeSamplingInteractionsDataLoader(users=df_for_interactions['user_id'],
                                                          items=df_for_interactions['item_id'],
                                                          **common_data_loader_kwargs)
    )
    hdf5_interactions_dl = HDF5InteractionsDataLoader(hdf5_path=hdf5_pandas_df_path,
                                                      user_col='user_id',
                                                      item_col='item_id',
                                                      **common_data_loader_kwargs)

    expected_repr = (
        '{} object with 12 interactions between 6 users and 10 items, returning 4 negative samples'
        ' per interaction in non-shuffled batches of size 5.'
    )

    assert str(interactions_dl) == expected_repr.format(str(type(interactions_dl).__name__))
    assert str(approx_dl) == expected_repr.format(str(type(approx_dl).__name__))
    assert (
        str(hdf5_interactions_dl) == expected_repr.format(str(type(hdf5_interactions_dl).__name__))
    )

    assert interactions_dl.num_users == approx_dl.num_users == hdf5_interactions_dl.num_users
    assert interactions_dl.num_items == approx_dl.num_items == hdf5_interactions_dl.num_items
    assert (
        interactions_dl.num_interactions
        == approx_dl.num_interactions
        == hdf5_interactions_dl.num_interactions
    )

    # get all batches from every DataLoader, add them to a list for comparison below
    def get_all_batches_from_DataLoader(dataloader, batch_size):
        all_batches = list()
        for idx, batch in enumerate(dataloader):
            assert len(batch[0][0]) == len(batch[0][1]) == len(batch[1])

            if idx < len(dataloader) - 1:
                assert len(batch[0][0]) == batch_size

            all_batches.append(batch)

        return all_batches

    interactions_batches = get_all_batches_from_DataLoader(interactions_dl,
                                                           batch_size=interactions_dl.batch_size)
    approximate_batches = get_all_batches_from_DataLoader(
        approx_dl,
        batch_size=approx_dl.approximate_negative_sampler.batch_size,
    )
    hdf5_batches = get_all_batches_from_DataLoader(
        hdf5_interactions_dl,
        batch_size=hdf5_interactions_dl.hdf5_sampler.batch_size,
    )

    for idx in range(len(interactions_batches)):
        assert (
            interactions_batches[idx][0][0].tolist()
            == approximate_batches[idx][0][0].tolist()
            == hdf5_batches[idx][0][0].tolist()
        )
        assert (
            interactions_batches[idx][0][1].tolist()
            == approximate_batches[idx][0][1].tolist()
            == hdf5_batches[idx][0][1].tolist()
        )
        # random negative samples will never be exactly equal
        assert (
            interactions_batches[idx][1].shape
            == approximate_batches[idx][1].shape
            == hdf5_batches[idx][1].shape
        )

    # test that the last batch is smaller than the specified batch size and that this is
    # handled correctly, even for HDF5 data
    assert len(interactions_batches[-1][0][0]) < interactions_dl.batch_size
    assert len(approximate_batches[-1][0][0]) < approx_dl.approximate_negative_sampler.batch_size
    assert len(hdf5_batches[-1][0][0]) < hdf5_interactions_dl.hdf5_sampler.batch_size
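
# Each batch above is a nested tuple: ``batch[0][0]`` holds user IDs, ``batch[0][1]`` the
# positive item IDs, and ``batch[1]`` the sampled negative item IDs (compared by shape
# above, since random negatives differ between loaders). A minimal unpacking sketch,
# reusing ``interactions_dl`` from the test:
(users, pos_items), neg_items = next(iter(interactions_dl))

assert len(users) == len(pos_items) == len(neg_items)
assert neg_items.shape == (len(users), interactions_dl.num_negative_samples)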
Example #4
# imports assumed from the collie_recs package layout
import os
import tempfile

import pytest
import torch

from collie_recs.interactions import (ApproximateNegativeSamplingInteractionsDataLoader,
                                      HDF5InteractionsDataLoader,
                                      InteractionsDataLoader)
from collie_recs.model import (CollieTrainer,
                               HybridPretrainedModel,
                               MatrixFactorizationModel,
                               NeuralCollaborativeFiltering,
                               NonlinearMatrixFactorizationModel)
from collie_recs.utils import pandas_df_to_hdf5


# fixture decorator reconstructed from the ``request.param`` branches below
@pytest.fixture(params=['mf_hdf5',
                        'sparse_mf',
                        'mf_no_val',
                        'mf_non_approximate',
                        'mf_approximate',
                        'mf_with_y_range',
                        'nonlinear_mf',
                        'nonlinear_mf_with_y_range',
                        'neucf',
                        'neucf_sigmoid',
                        'neucf_relu',
                        'neucf_leaky_relu',
                        'neucf_custom',
                        'hybrid_pretrained',
                        'hybrid_pretrained_metadata_layers'])
def models_trained_for_one_step(request,
                                train_val_implicit_data,
                                movielens_metadata_df,
                                movielens_implicit_df,
                                train_val_implicit_pandas_data,
                                gpu_count):
    train, val = train_val_implicit_data

    if request.param == 'mf_hdf5':
        # create, fit, and return the model all at once so we can close the HDF5 file
        train_pandas_df, val_pandas_df = train_val_implicit_pandas_data

        with tempfile.TemporaryDirectory() as temp_dir:
            pandas_df_to_hdf5(df=train_pandas_df,
                              out_path=os.path.join(temp_dir, 'train.h5'),
                              key='interactions')
            pandas_df_to_hdf5(df=val_pandas_df,
                              out_path=os.path.join(temp_dir, 'val.h5'),
                              key='interactions')

            train_loader = HDF5InteractionsDataLoader(hdf5_path=os.path.join(temp_dir, 'train.h5'),
                                                      user_col='user_id',
                                                      item_col='item_id',
                                                      num_users=train.num_users,
                                                      num_items=train.num_items,
                                                      batch_size=1024,
                                                      shuffle=True)
            val_loader = HDF5InteractionsDataLoader(hdf5_path=os.path.join(temp_dir, 'val.h5'),
                                                    user_col='user_id',
                                                    item_col='item_id',
                                                    num_users=val.num_users,
                                                    num_items=val.num_items,
                                                    batch_size=1024,
                                                    shuffle=False)

            model = MatrixFactorizationModel(train=train_loader,
                                             val=val_loader,
                                             embedding_dim=15,
                                             dropout_p=0.1,
                                             lr=1e-1,
                                             bias_lr=1e-2,
                                             optimizer='adam',
                                             bias_optimizer='sgd',
                                             weight_decay=1e-7,
                                             loss='bpr',
                                             sparse=False)

            model_trainer = CollieTrainer(model=model,
                                          gpus=gpu_count,
                                          max_steps=1,
                                          deterministic=True,
                                          logger=False,
                                          checkpoint_callback=False)

            model_trainer.fit(model)
            model.freeze()

            return model

    elif request.param == 'sparse_mf':
        model = MatrixFactorizationModel(train=train,
                                         val=val,
                                         embedding_dim=15,
                                         dropout_p=0.1,
                                         lr=1e-1,
                                         bias_lr=1e-2,
                                         optimizer='sparse_adam',
                                         bias_optimizer='sgd',
                                         weight_decay=0,
                                         loss='hinge',
                                         sparse=True)
    elif request.param == 'mf_no_val':
        model = MatrixFactorizationModel(train=train, val=None)
    elif request.param == 'mf_non_approximate' or request.param == 'mf_approximate':
        if request.param == 'mf_non_approximate':
            train_loader = InteractionsDataLoader(interactions=train, batch_size=1024, shuffle=True)
            val_loader = InteractionsDataLoader(interactions=val, batch_size=1024, shuffle=False)
        else:
            train_loader = ApproximateNegativeSamplingInteractionsDataLoader(interactions=train,
                                                                             batch_size=1024,
                                                                             shuffle=True)
            val_loader = ApproximateNegativeSamplingInteractionsDataLoader(interactions=val,
                                                                           batch_size=1024,
                                                                           shuffle=False)

        model = MatrixFactorizationModel(train=train_loader,
                                         val=val_loader,
                                         embedding_dim=15,
                                         dropout_p=0.1,
                                         lr=1e-1,
                                         bias_lr=1e-2,
                                         optimizer='adam',
                                         bias_optimizer='sgd',
                                         weight_decay=1e-7,
                                         loss='bpr',
                                         sparse=False)
    elif request.param == 'mf_with_y_range':
        model = MatrixFactorizationModel(train=train,
                                         val=val,
                                         y_range=(0, 4))
    elif request.param == 'nonlinear_mf':
        model = NonlinearMatrixFactorizationModel(train=train,
                                                  val=val,
                                                  user_embedding_dim=15,
                                                  item_embedding_dim=15,
                                                  user_dense_layers_dims=[15, 10],
                                                  item_dense_layers_dims=[15, 10],
                                                  embedding_dropout_p=0.05,
                                                  dense_dropout_p=0.1,
                                                  lr=1e-1,
                                                  bias_lr=1e-2,
                                                  optimizer='adam',
                                                  bias_optimizer='sgd',
                                                  weight_decay=1e-7,
                                                  loss='bpr')
    elif request.param == 'nonlinear_mf_with_y_range':
        model = NonlinearMatrixFactorizationModel(train=train,
                                                  val=val,
                                                  y_range=(0, 4))
    elif request.param == 'neucf':
        model = NeuralCollaborativeFiltering(train=train,
                                             val=val,
                                             embedding_dim=10,
                                             num_layers=1,
                                             dropout_p=0.1,
                                             lr=1e-3,
                                             weight_decay=0.,
                                             optimizer='adam',
                                             loss='adaptive')
    elif request.param == 'neucf_sigmoid':
        model = NeuralCollaborativeFiltering(train=train,
                                             val=val,
                                             final_layer='sigmoid')
    elif request.param == 'neucf_relu':
        model = NeuralCollaborativeFiltering(train=train,
                                             val=val,
                                             final_layer='relu')
    elif request.param == 'neucf_leaky_relu':
        model = NeuralCollaborativeFiltering(train=train,
                                             val=val,
                                             final_layer='leaky_relu')
    elif request.param == 'neucf_custom':
        model = NeuralCollaborativeFiltering(train=train,
                                             val=val,
                                             final_layer=torch.tanh)
    elif (
        request.param == 'hybrid_pretrained' or request.param == 'hybrid_pretrained_metadata_layers'
    ):
        implicit_model = MatrixFactorizationModel(train=train,
                                                  val=val,
                                                  embedding_dim=10,
                                                  lr=1e-1,
                                                  optimizer='adam')
        implicit_model_trainer = CollieTrainer(model=implicit_model,
                                               gpus=gpu_count,
                                               max_steps=1,
                                               deterministic=True,
                                               logger=False,
                                               checkpoint_callback=False)
        implicit_model_trainer.fit(implicit_model)
        implicit_model.freeze()

        genres = (
            torch.tensor(movielens_metadata_df[
                [c for c in movielens_metadata_df.columns if 'genre' in c]
            ].values)
            .topk(1)
            .indices
            .view(-1)
        )

        if request.param == 'hybrid_pretrained_metadata_layers':
            metadata_layers_dims = [16, 12]
        else:
            metadata_layers_dims = None

        model_frozen = HybridPretrainedModel(train=train,
                                             val=val,
                                             item_metadata=movielens_metadata_df,
                                             trained_model=implicit_model,
                                             metadata_layers_dims=metadata_layers_dims,
                                             freeze_embeddings=True,
                                             dropout_p=0.15,
                                             loss='warp',
                                             lr=.01,
                                             optimizer=torch.optim.Adam,
                                             metadata_for_loss={'genre': genres},
                                             metadata_for_loss_weights={'genre': .4},
                                             weight_decay=0.0)
        model_frozen_trainer = CollieTrainer(model=model_frozen,
                                             gpus=gpu_count,
                                             max_steps=1,
                                             deterministic=True,
                                             logger=False,
                                             checkpoint_callback=False)
        model_frozen_trainer.fit(model_frozen)

        model = HybridPretrainedModel(train=train,
                                      val=val,
                                      item_metadata=movielens_metadata_df,
                                      trained_model=implicit_model,
                                      metadata_layers_dims=metadata_layers_dims,
                                      freeze_embeddings=False,
                                      dropout_p=0.15,
                                      loss='bpr',
                                      lr=1e-4,
                                      optimizer=torch.optim.Adam,
                                      metadata_for_loss={'genre': genres},
                                      metadata_for_loss_weights={'genre': .4},
                                      weight_decay=0.0)
        model.load_from_hybrid_model(model_frozen)

    model_trainer = CollieTrainer(model=model,
                                  gpus=gpu_count,
                                  max_steps=1,
                                  deterministic=True,
                                  logger=False,
                                  checkpoint_callback=False)

    if request.param == 'mf_no_val':
        with pytest.warns(UserWarning):
            model_trainer.fit(model)
    else:
        model_trainer.fit(model)

    model.freeze()

    return model
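
# Usage sketch (hypothetical test body): pytest runs any test requesting this fixture once
# per param, and ``model.freeze()`` above leaves every parameter with gradients disabled:
def test_models_trained_for_one_step(models_trained_for_one_step):
    model = models_trained_for_one_step

    assert all(not parameter.requires_grad for parameter in model.parameters())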
Example #5
# imports assumed from the collie_recs package layout
from collie_recs.interactions import (ApproximateNegativeSamplingInteractionsDataLoader,
                                      HDF5InteractionsDataLoader,
                                      InteractionsDataLoader)


def test_all_data_loaders_output_equal(df_for_interactions, hdf5_pandas_df_path, tmpdir, capfd):
    common_data_loader_kwargs = {
        'num_negative_samples': 4,
        'batch_size': 5,
        'shuffle': False,
        'drop_last': False,
    }

    interactions_dl = InteractionsDataLoader(
        users=df_for_interactions['user_id'],
        items=df_for_interactions['item_id'],
        **common_data_loader_kwargs)
    approx_dl = ApproximateNegativeSamplingInteractionsDataLoader(
        users=df_for_interactions['user_id'],
        items=df_for_interactions['item_id'],
        **common_data_loader_kwargs)
    hdf5_interactions_dl = HDF5InteractionsDataLoader(
        hdf5_path=hdf5_pandas_df_path,
        user_col='user_id',
        item_col='item_id',
        **common_data_loader_kwargs)

    out, _ = capfd.readouterr()
    assert '``meta`` key not found - generating ``num_users`` and ``num_items``' in out

    assert interactions_dl.num_users == approx_dl.num_users == hdf5_interactions_dl.num_users
    assert interactions_dl.num_items == approx_dl.num_items == hdf5_interactions_dl.num_items
    assert (interactions_dl.num_interactions == approx_dl.num_interactions ==
            hdf5_interactions_dl.num_interactions)

    interactions_dl_first_batch = next(iter(interactions_dl))
    approx_dl_first_batch = next(iter(approx_dl))
    hdf5_interactions_dl_first_batch = next(iter(hdf5_interactions_dl))

    # test the shapes of outputs are equal
    for idx in range(len(interactions_dl_first_batch[0])):
        assert (
            len(interactions_dl_first_batch[0][idx])
            == len(approx_dl_first_batch[0][idx])
            == len(hdf5_interactions_dl_first_batch[0][idx])
            == common_data_loader_kwargs['batch_size']
        )

    assert (interactions_dl_first_batch[1].shape ==
            approx_dl_first_batch[1].shape ==
            hdf5_interactions_dl_first_batch[1].shape ==
            (common_data_loader_kwargs['batch_size'],
             common_data_loader_kwargs['num_negative_samples']))

    # get all batches from every DataLoader, add them to a list for comparison below
    interactions_batches = list()
    for idx, batch in enumerate(interactions_dl):
        assert len(batch[0][0]) == len(batch[0][1]) == len(batch[1])
        if idx < len(interactions_dl) - 1:
            assert len(batch[0][0]) == interactions_dl.batch_size

        interactions_batches.append(batch)

    approximate_batches = list()
    for idx, batch in enumerate(approx_dl):
        assert len(batch[0][0]) == len(batch[0][1]) == len(batch[1])
        if idx < len(approx_dl) - 1:
            assert len(batch[0][0]) == approx_dl.approximate_negative_sampler.batch_size

        approximate_batches.append(batch)

    hdf5_batches = list()
    for idx, batch in enumerate(hdf5_interactions_dl):
        assert len(batch[0][0]) == len(batch[0][1]) == len(batch[1])
        if idx < len(hdf5_interactions_dl) - 1:
            assert len(batch[0][0]) == hdf5_interactions_dl.hdf5_sampler.batch_size

        hdf5_batches.append(batch)

    for idx in range(len(interactions_batches)):
        assert (interactions_batches[idx][0][0].tolist() ==
                approximate_batches[idx][0][0].tolist() ==
                hdf5_batches[idx][0][0].tolist())
        assert (interactions_batches[idx][0][1].tolist() ==
                approximate_batches[idx][0][1].tolist() ==
                hdf5_batches[idx][0][1].tolist())
        # random negative samples will never be exactly equal
        assert (interactions_batches[idx][1].shape ==
                approximate_batches[idx][1].shape ==
                hdf5_batches[idx][1].shape)

    interactions_last_batch = interactions_batches[-1]
    approximate_last_batch = approximate_batches[-1]
    hdf5_last_batch = hdf5_batches[-1]

    # test that the last batch is smaller than the specified batch size and that this is
    # handled correctly, even for HDF5 data
    assert len(interactions_last_batch[0][0]) < interactions_dl.batch_size
    assert len(approximate_last_batch[0][0]) < approx_dl.approximate_negative_sampler.batch_size
    assert len(hdf5_last_batch[0][0]) < hdf5_interactions_dl.hdf5_sampler.batch_size
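
# The captured stdout check above shows the HDF5 loader falling back to scanning the data
# when the file has no ``meta`` key. A sketch of storing the counts up front -- the
# one-row-DataFrame layout under the ``meta`` key is an assumption about the expected file
# format, with counts taken from the 6-user / 10-item fixture described in Example #3:
import pandas as pd

pd.DataFrame({'num_users': [6], 'num_items': [10]}).to_hdf('interactions.h5', key='meta')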
Example #6
# imports assumed from the collie_recs package layout
from pytorch_lightning.callbacks import EarlyStopping

from collie_recs.config import DATA_PATH
from collie_recs.cross_validation import stratified_split
from collie_recs.interactions import Interactions, InteractionsDataLoader
from collie_recs.metrics import auc, evaluate_in_batches, mapk, mrr
from collie_recs.model import CollieTrainer, MatrixFactorizationModel
from collie_recs.movielens import read_movielens_df
from collie_recs.utils import convert_to_implicit, Timer


def run_movielens_example(epochs: int = 20, gpus: int = 0) -> None:
    """
    Retrieve and split data, train and evaluate a model, and save it.

    From the terminal, you can run this script with:

    .. code-block:: bash

        python collie_recs/movielens/run.py --epochs 20

    Parameters
    ----------
    epochs: int
        Number of epochs for model training
    gpus: int
        Number of GPUs to train on

    """
    t = Timer()

    t.timecheck('  1.0 - retrieving MovieLens 100K dataset')
    df = read_movielens_df(decrement_ids=True)
    t.timecheck('  1.0 complete')

    t.timecheck('  2.0 - splitting data')
    df_imp = convert_to_implicit(df)
    interactions = Interactions(users=df_imp['user_id'],
                                items=df_imp['item_id'],
                                allow_missing_ids=True)
    train, val, test = stratified_split(interactions, val_p=0.1, test_p=0.1)
    train_loader = InteractionsDataLoader(train, batch_size=1024, shuffle=True)
    val_loader = InteractionsDataLoader(val, batch_size=1024, shuffle=False)
    t.timecheck('  2.0 complete')

    t.timecheck('  3.0 - training the model')
    model = MatrixFactorizationModel(train=train_loader,
                                     val=val_loader,
                                     dropout_p=0.05,
                                     loss='adaptive',
                                     lr=5e-2,
                                     embedding_dim=10,
                                     optimizer='adam',
                                     weight_decay=1e-7)
    trainer = CollieTrainer(
        model=model,
        gpus=gpus,
        max_epochs=epochs,
        deterministic=True,
        logger=False,
        checkpoint_callback=False,
        callbacks=[EarlyStopping(monitor='val_loss_epoch', mode='min')],
        weights_summary='full',
        terminate_on_nan=True)
    trainer.fit(model)
    model.eval()
    t.timecheck('\n  3.0 complete')

    t.timecheck('  4.0 - evaluating model')
    auc_score, mrr_score, mapk_score = evaluate_in_batches([auc, mrr, mapk],
                                                           test,
                                                           model,
                                                           k=10)
    print(f'AUC:          {auc_score}')
    print(f'MRR:          {mrr_score}')
    print(f'MAP@10:       {mapk_score}')
    t.timecheck('  4.0 complete')

    t.timecheck('  5.0 - saving model')
    absolute_data_path = DATA_PATH / 'fitted_model'
    model.save_model(absolute_data_path)
    t.timecheck('  5.0 complete')
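
# Usage sketch: the saved model can be restored later through the ``load_model_path``
# argument shown in Example #2 (assumed to accept the same path saved above):
model = MatrixFactorizationModel(load_model_path=str(DATA_PATH / 'fitted_model'))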