Example #1
def load_data(dataset, random_state):

    dataset = get_movielens_dataset(dataset)

    train, rest = random_train_test_split(dataset, random_state=random_state)

    test, validation = random_train_test_split(rest,
                                               test_percentage=0.5,
                                               random_state=random_state)

    return train, validation, test
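A minimal usage sketch, assuming the imports this helper relies on (both exist in Spotlight):

import numpy as np
from spotlight.cross_validation import random_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset

train, validation, test = load_data('100K', np.random.RandomState(42))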
Example #2
    def obtener_interacciones_gui(self, ruta_ratings, sep_ratings, encoding_ratings):
        """
        Method obtener_interacciones_gui. Obtains the interactions needed to build the Spotlight models.

        This method is only used in the web interface.

        Parameters
        ----------

        ruta_ratings: str
            path of the file containing the ratings.
        sep_ratings: str
            separator used in the ratings file.
        encoding_ratings: str
            encoding used in the ratings file.
        """

        global train, test
        
        # Load the ratings dataframe
        ratings_df = Entrada.leer_csv(ruta_ratings, sep_ratings, encoding_ratings)
        ratings_df.sort_values([ratings_df.columns.values[0], ratings_df.columns.values[1]], inplace=True)

        # Build arrays with the user and item ids
        users_ids = np.asarray(ratings_df[ratings_df.columns.values[0]].tolist(), dtype=np.int32)         
        items_ids = np.asarray(ratings_df[ratings_df.columns.values[1]].tolist(), dtype=np.int32)
        
        # Turn the ratings dataframe into interactions the models can use
        if self.opcion_time == 1:
            timestamps = np.asarray(ratings_df[ratings_df.columns.values[3]].tolist(), dtype=np.int32)
            if self.opcion_modelo == 1:
                ratings = np.asarray(ratings_df[ratings_df.columns.values[2]].tolist(), dtype=np.float32)
                interacciones = Interactions(users_ids, items_ids, ratings=ratings, timestamps=timestamps)
                train, test = random_train_test_split(interacciones)
            else:
                interacciones = Interactions(users_ids, items_ids, timestamps=timestamps)
                train, test = random_train_test_split(interacciones)
                if self.opcion_modelo == 3:
                    train = train.to_sequence()
                    test = test.to_sequence()
        else:
            if self.opcion_modelo == 1:
                ratings = np.asarray(ratings_df[ratings_df.columns.values[2]].tolist(), dtype=np.float32)
                interacciones = Interactions(users_ids, items_ids, ratings=ratings)
            else:
                interacciones = Interactions(users_ids, items_ids)
            train, test = random_train_test_split(interacciones)
            
        # Save the train and test interactions
        print("Saving the train interactions")
        guardar_datos_pickle(train, 'the train interactions')
        print("Saving the test interactions")
        guardar_datos_pickle(test, 'the test interactions')
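guardar_datos_pickle is a project-specific helper that is not shown here; a plausible sketch, assuming it takes the object plus a human-readable description (the file name below is hypothetical):

import pickle

def guardar_datos_pickle(datos, descripcion):
    # Hypothetical sketch: persist the object under a fixed file name
    # and echo the description that was passed in.
    with open('interacciones.pkl', 'wb') as f:
        pickle.dump(datos, f)
    print('Saved ' + descripcion)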
Example #3
def dataSplit(train, numberDataSplits):
    arrayOfSplits = []
    split1, split2 = random_train_test_split(train, 1.0 / numberDataSplits)
    arrayOfSplits += [split2]
    splitLength = len(split2.ratings)

    while (splitLength < len(split1.ratings)):
        splitPercentage = splitLength / len(split1.ratings)
        split1, split2 = random_train_test_split(split1, splitPercentage)
        arrayOfSplits += [split2]

    arrayOfSplits += [split1]
    return arrayOfSplits
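A quick usage check of the resulting chunk sizes (assuming train is the Interactions training split from one of the surrounding examples):

splits = dataSplit(train, 4)
print([len(s.ratings) for s in splits])  # roughly equal chunk sizes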
Example #4
    def cross_validation(self, interactions: Interactions) -> tuple:
        """Randomly split interactions between training and testing.

        This function takes an interaction set and splits it into two disjoint sets,
        a training set and a test set.

        Args:
            interactions (spotlight.interactions.Interactions): Matrix of user-item interactions.

        Returns:
            tuple: (spotlight.interactions.Interactions, spotlight.interactions.Interactions),
                A tuple of (train data, test data).

        """

        def interactions_to_sequence(f_train: Interactions, f_test: Interactions):
            train, test = f_train.to_sequence(), f_test.to_sequence()
            return train, test

        logger = logging.getLogger()
        train, test = random_train_test_split(interactions)
        if self._models in ('S_POOL', 'S_CNN', 'S_LSTM'):
            train, test = interactions_to_sequence(train, test)

        logger.info('Split into \n {} and \n {}.'.format(train, test))
        return (
            train,
            test
        )
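A brief usage sketch (runner is a hypothetical instance of the class this method belongs to):

import logging
logging.basicConfig(level=logging.INFO)  # so the split sizes get logged

train, test = runner.cross_validation(interactions)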
Example #5
def get_train_test_set(interaction):
    """Split our interaction object (input data) into train and test sets.

    :param interaction: Our interaction object (input data)
    :return: The interaction object split into train and test sets.
    """
    return random_train_test_split(interaction)
Example #6
def build_mean_baseline_model(data):
    train_data, test_data = random_train_test_split(data)
    df = pd.DataFrame(train_data.tocoo().toarray())
    print(df.columns)
    avg_ratings, mean_avg = train(train_data)
    rmse = predict(test_data, avg_ratings, mean_avg)
    return rmse
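The train and predict helpers are not shown here; a minimal sketch of a per-item-mean baseline consistent with how they are called (this logic is an assumption, not the original implementation):

import numpy as np
import pandas as pd

def train(train_data):
    # Hypothetical: mean rating per item, with the global mean as a fallback.
    coo = train_data.tocoo()
    df = pd.DataFrame({'item': coo.col, 'rating': coo.data})
    return df.groupby('item')['rating'].mean(), df['rating'].mean()

def predict(test_data, avg_ratings, mean_avg):
    # Hypothetical: RMSE of the per-item means on the test interactions.
    coo = test_data.tocoo()
    preds = np.array([avg_ratings.get(item, mean_avg) for item in coo.col])
    return np.sqrt(np.mean((preds - coo.data) ** 2))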
Example #7
def test_bloom(compression_ratio, expected_rmse):

    interactions = movielens.get_movielens_dataset('100K')

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    user_embeddings = BloomEmbedding(interactions.num_users,
                                     32,
                                     compression_ratio=compression_ratio,
                                     num_hash_functions=2)
    item_embeddings = BloomEmbedding(interactions.num_items,
                                     32,
                                     compression_ratio=compression_ratio,
                                     num_hash_functions=2)
    network = BilinearNet(interactions.num_users,
                          interactions.num_items,
                          user_embedding_layer=user_embeddings,
                          item_embedding_layer=item_embeddings)

    model = ExplicitFactorizationModel(loss='regression',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-5,
                                       representation=network,
                                       use_cuda=CUDA)

    model.fit(train)
    print(model)

    rmse = rmse_score(model, test)
    print(rmse)

    assert rmse - EPSILON < expected_rmse
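A rough back-of-the-envelope on what the compression buys, assuming (as the argument name suggests) that the compressed embedding table has about num_embeddings * compression_ratio rows:

num_users, dim, ratio = 944, 32, 0.2  # MovieLens 100K user count, as above
full_params = num_users * dim
bloom_params = int(num_users * ratio) * dim
print(full_params, bloom_params)  # 30208 vs. 6016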
Example #8
def train_initial_model():
    dataset = get_movielens_dataset(variant='100K')

    train, test = random_train_test_split(dataset, random_state=np.random.RandomState(42))

    model = ImplicitFactorizationModel(loss='adaptive_hinge',
                                    embedding_dim=128,  # latent dimensionality
                                    n_iter=10,  # number of epochs of training
                                    batch_size=1024,  # minibatch size
                                    l2=1e-9,  # strength of L2 regularization
                                    learning_rate=1e-3,
                                    use_cuda=torch.cuda.is_available())

    print('Fitting the model')

    model.fit(train, verbose=True)
    print(type(model))

    with open('models/filmclub.model', 'wb') as model_file:
        pickle.dump(model, model_file)

    dataset.num_users = 1000000  # presumably headroom for ids of users added later

    with open('data/dataset.pkl', 'wb') as dataset_file:
        pickle.dump(dataset, dataset_file)

    train_rmse = rmse_score(model, train)
    test_rmse = rmse_score(model, test)

    print('Train RMSE {:.3f}, test RMSE {:.3f}'.format(train_rmse, test_rmse))
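The saved artifacts can be loaded back the same way; a usage sketch for scoring one user against every item (user id 0 is arbitrary):

import pickle
import numpy as np

with open('models/filmclub.model', 'rb') as f:
    model = pickle.load(f)

scores = model.predict(0)             # predicted scores for user 0 over all items
top_items = np.argsort(-scores)[:10]  # ten highest-scoring item ids
print(top_items)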
Example #9
def test_bpr_bloom(compression_ratio, expected_mrr):

    interactions = movielens.get_movielens_dataset('100K')

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    user_embeddings = BloomEmbedding(interactions.num_users,
                                     32,
                                     compression_ratio=compression_ratio,
                                     num_hash_functions=2)
    item_embeddings = BloomEmbedding(interactions.num_items,
                                     32,
                                     compression_ratio=compression_ratio,
                                     num_hash_functions=2)
    network = BilinearNet(interactions.num_users,
                          interactions.num_items,
                          user_embedding_layer=user_embeddings,
                          item_embedding_layer=item_embeddings)

    model = ImplicitFactorizationModel(loss='bpr',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-6,
                                       representation=network,
                                       use_cuda=CUDA)

    model.fit(train)
    print(model)

    mrr = mrr_score(model, test, train=train).mean()

    assert mrr > expected_mrr
Example #10
def data():

    interactions = movielens.get_movielens_dataset('100K')

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    return train, test
Example #11
def load_data(dataset, random_state):

    if 'goodbooks' in dataset:
        dataset = get_goodbooks_dataset()
    elif 'amazon' in dataset:
        dataset = get_amazon_dataset()
    else:
        dataset = get_movielens_dataset(dataset)

    train, rest = random_train_test_split(dataset,
                                          test_percentage=0.05,
                                          random_state=random_state)

    test, validation = random_train_test_split(rest,
                                               test_percentage=0.5,
                                               random_state=random_state)

    return train, validation, test
Example #12
    def obtener_interacciones(self):
        """
        Method obtener_interacciones. Obtains the interactions needed by the Spotlight models.

        This method is only used in the text interface.
        """
        
        global train, test
        
        # Load the ratings dataframe
        Entrada.obtener_datos()
        ratings_df = Entrada.ratings_df

        # Build arrays with the user and item ids
        users_ids = np.asarray(ratings_df[ratings_df.columns.values[0]].tolist(), dtype=np.int32)         
        items_ids = np.asarray(ratings_df[ratings_df.columns.values[1]].tolist(), dtype=np.int32)
        
        # Turn the ratings dataframe into interactions the models can use
        if self.opcion_time == 1:
            timestamps = np.asarray(ratings_df[ratings_df.columns.values[3]].tolist(), dtype=np.int32)
            if self.opcion_modelo == 1:
                ratings = np.asarray(ratings_df[ratings_df.columns.values[2]].tolist(), dtype=np.float32)
                interacciones = Interactions(users_ids, items_ids, ratings=ratings, timestamps=timestamps)
                train, test = random_train_test_split(interacciones)
            else:
                interacciones = Interactions(users_ids, items_ids, timestamps=timestamps)
                train, test = random_train_test_split(interacciones)
                if self.opcion_modelo == 3:
                    train = train.to_sequence()
                    test = test.to_sequence()
        else:
            if self.opcion_modelo == 1:
                ratings = np.asarray(ratings_df[ratings_df.columns.values[2]].tolist(), dtype=np.float32)
                interacciones = Interactions(users_ids, items_ids, ratings=ratings)
            else:
                interacciones = Interactions(users_ids, items_ids)
            train, test = random_train_test_split(interacciones)
            
        # Save the train and test interactions
        print("Saving the train interactions")
        guardar_datos_pickle(train, 'the train interactions')
        print("Saving the test interactions")
        guardar_datos_pickle(test, 'the test interactions')
Example #13
def best_params_spotlight(losses,
                          n_iters,
                          batch_sizes,
                          l2s,
                          learning_rates,
                          embedding_dims,
                          train_data,
                          t=Timer()):
    rmses = dict()
    params = dict()
    t.start()
    for loss in losses:
        params['loss'] = loss
        for n_iter in n_iters:
            params['n_iter'] = n_iter
            for batch_size in batch_sizes:
                params['batch_size'] = batch_size
                for l2 in l2s:
                    params['l2'] = l2
                    for learning_rate in learning_rates:
                        params['learning_rate'] = learning_rate
                        for embedding_dim in embedding_dims:
                            params['embedding_dim'] = embedding_dim
                            model = ExplicitFactorizationModel(
                                loss=loss,  # use the loss being swept
                                embedding_dim=embedding_dim,  # latent dimensionality
                                n_iter=n_iter,  # number of epochs of training
                                batch_size=batch_size,  # minibatch size
                                l2=l2,  # strength of L2 regularization
                                learning_rate=learning_rate,
                                use_cuda=torch.cuda.is_available())

                            params['model'] = model

                            train_tr_data, test_tr_data = random_train_test_split(
                                train_data,
                                random_state=np.random.RandomState(42))

                            model.fit(train_tr_data, verbose=True)

                            rmse = rmse_score(model, test_tr_data)

                            rmses[rmse] = dict(params)  # snapshot: params is mutated on every iteration
                            print(
                                "-----------Time: {}, Loss: {}, n_iter: {}, l2: {}, batch_size: {}, learning_rate: {}, embedding_dim: {}, rmse: {}-------------\n\n"
                                .format(t.stop(), loss, n_iter, l2, batch_size,
                                        learning_rate, embedding_dim, rmse))
                            # restart timer
                            t.start()
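The six nested loops can be flattened with itertools.product; a sketch of the same sweep that also returns the best configuration (the original fills rmses but never returns it):

from itertools import product

import numpy as np
import torch
from spotlight.cross_validation import random_train_test_split
from spotlight.evaluation import rmse_score
from spotlight.factorization.explicit import ExplicitFactorizationModel

def best_params_spotlight_flat(losses, n_iters, batch_sizes, l2s,
                               learning_rates, embedding_dims, train_data):
    results = {}
    for loss, n_iter, batch_size, l2, lr, dim in product(
            losses, n_iters, batch_sizes, l2s, learning_rates, embedding_dims):
        model = ExplicitFactorizationModel(loss=loss,
                                           embedding_dim=dim,
                                           n_iter=n_iter,
                                           batch_size=batch_size,
                                           l2=l2,
                                           learning_rate=lr,
                                           use_cuda=torch.cuda.is_available())
        tr, te = random_train_test_split(train_data,
                                         random_state=np.random.RandomState(42))
        model.fit(tr)
        # Key by RMSE; a fresh dict per setting avoids the aliasing pitfall above.
        results[rmse_score(model, te)] = dict(loss=loss, n_iter=n_iter,
                                              batch_size=batch_size, l2=l2,
                                              learning_rate=lr, embedding_dim=dim)
    return results[min(results)]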
Example #14
def build_model(data, loss, embedding_dim, n_iter, batch_size, l2,
                learning_rate, **kwargs):
    model = ExplicitFactorizationModel(
        loss=loss,
        embedding_dim=embedding_dim,  # latent dimensionality
        n_iter=n_iter,  # number of epochs of training
        batch_size=batch_size,  # minibatch size
        l2=l2,  # strength of L2 regularization
        learning_rate=learning_rate,
        use_cuda=torch.cuda.is_available())

    train, test = random_train_test_split(
        data, random_state=np.random.RandomState(42))
    model.fit(train, verbose=True)
    test_rmse = rmse_score(model, test)
    return test_rmse
Example #15
def test_precision_recall(data, k):

    (train, test, model) = data

    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    precision, recall = precision_recall_score(model, test, train, k=k)

    assert precision.shape == recall.shape

    if not isinstance(k, list):
        assert len(precision.shape) == 1
    else:
        assert precision.shape[1] == len(k)
Example #16
def test_poisson():

    interactions = movielens.get_movielens_dataset('100K')

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ExplicitFactorizationModel(loss='poisson',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-3,
                                       l2=1e-6)
    model.fit(train)

    rmse = rmse_score(model, test)

    assert rmse < 1.0
Example #18
def test_adaptive_hinge():

    interactions = movielens.get_movielens_dataset('100K')

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ImplicitFactorizationModel(loss='adaptive_hinge',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-6)
    model.fit(train)

    mrr = mrr_score(model, test, train=train).mean()

    assert mrr > 0.07
Example #20
def data():

    interactions = movielens.get_movielens_dataset('100K')

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ImplicitFactorizationModel(loss='bpr',
                                       n_iter=1,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-6,
                                       random_state=RANDOM_STATE,
                                       use_cuda=CUDA)
    model.fit(train)

    return train, test, model
Example #21
def test_bpr():

    interactions = movielens.get_movielens_dataset('100K')

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ImplicitFactorizationModel(loss='bpr',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-6,
                                       use_cuda=CUDA)
    model.fit(train)

    mrr = mrr_score(model, test, train=train).mean()

    assert mrr + EPSILON > 0.07
Example #22
def test_to_sequence(max_sequence_length, step_size):

    interactions = movielens.get_movielens_dataset('100K')
    _, interactions = random_train_test_split(interactions)

    sequences = interactions.to_sequence(
        max_sequence_length=max_sequence_length, step_size=step_size)

    if step_size == 1:
        assert sequences.sequences.shape == (len(interactions),
                                             max_sequence_length)
    else:
        assert sequences.sequences.shape[1] == max_sequence_length

    _test_just_padding(sequences.sequences)
    _test_final_column_no_padding(sequences.sequences)
    _test_shifted(sequences.user_ids, sequences.sequences, step_size)
    _test_temporal_order(sequences.user_ids, sequences.sequences, interactions)
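A toy illustration of the windows to_sequence produces (one user, five items in timestamp order; sequences are sliding windows over the user's items, padded with zeros where the history is short):

import numpy as np
from spotlight.interactions import Interactions

toy = Interactions(np.zeros(5, dtype=np.int32),
                   np.array([1, 2, 3, 4, 5], dtype=np.int32),
                   timestamps=np.arange(5, dtype=np.int32))
seqs = toy.to_sequence(max_sequence_length=3, step_size=1)
print(seqs.sequences)  # one row per window, e.g. [0, 0, 1], ..., [3, 4, 5]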
Example #23
def test_check_input():
    # Train for single iter.
    interactions = movielens.get_movielens_dataset('100K')

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ExplicitFactorizationModel(loss='regression',
                                       n_iter=1,
                                       batch_size=1024,
                                       learning_rate=1e-3,
                                       l2=1e-6)
    model.fit(train)

    # Modify data to make incompatible with original model.
    train.user_ids[0] = train.user_ids.max() + 1
    with pytest.raises(ValueError):
        model.fit(train)
Example #24
def test_bpr_custom_optimizer():

    interactions = movielens.get_movielens_dataset('100K')

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    def adagrad_optimizer(model_params, lr=1e-2, weight_decay=1e-6):

        return torch.optim.Adagrad(model_params,
                                   lr=lr,
                                   weight_decay=weight_decay)

    model = ImplicitFactorizationModel(loss='bpr',
                                       n_iter=10,
                                       batch_size=1024,
                                       optimizer_func=adagrad_optimizer)
    model.fit(train)

    mrr = mrr_score(model, test, train=train).mean()

    assert mrr > 0.06
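Any callable with the same shape works for optimizer_func; for instance, a sketch swapping in SGD with momentum:

def sgd_optimizer(model_params, lr=1e-2, momentum=0.9):
    return torch.optim.SGD(model_params, lr=lr, momentum=momentum)

model = ImplicitFactorizationModel(loss='bpr',
                                   n_iter=10,
                                   batch_size=1024,
                                   optimizer_func=sgd_optimizer)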
Example #25
def test_logistic():

    interactions = movielens.get_movielens_dataset('100K')

    # Convert to binary
    interactions.ratings = (interactions.ratings > 3).astype(np.float32)
    # Convert from (0, 1) to (-1, 1)
    interactions.ratings = interactions.ratings * 2 - 1

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ExplicitFactorizationModel(loss='logistic',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-3,
                                       l2=1e-6,
                                       use_cuda=CUDA)
    model.fit(train)

    rmse = rmse_score(model, test)

    assert rmse - EPSILON < 1.05
Example #26
def test_to_sequence(max_sequence_length, step_size):

    interactions = movielens.get_movielens_dataset('100K')
    _, interactions = random_train_test_split(interactions)

    sequences = interactions.to_sequence(
        max_sequence_length=max_sequence_length,
        step_size=step_size)

    if step_size == 1:
        assert sequences.sequences.shape == (len(interactions),
                                             max_sequence_length)
    else:
        assert sequences.sequences.shape[1] == max_sequence_length

    _test_just_padding(sequences.sequences)
    _test_final_column_no_padding(sequences.sequences)
    _test_shifted(sequences.user_ids,
                  sequences.sequences,
                  step_size)
    _test_temporal_order(sequences.user_ids,
                         sequences.sequences,
                         interactions)
Example #27
def test_bpr_custom_optimizer():

    interactions = movielens.get_movielens_dataset('100K')

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    def adagrad_optimizer(model_params,
                          lr=1e-2,
                          weight_decay=1e-6):

        return torch.optim.Adagrad(model_params,
                                   lr=lr,
                                   weight_decay=weight_decay)

    model = ImplicitFactorizationModel(loss='bpr',
                                       n_iter=10,
                                       batch_size=1024,
                                       optimizer_func=adagrad_optimizer)
    model.fit(train)

    mrr = mrr_score(model, test, train=train).mean()

    assert mrr > 0.06
Example #28
File: pred.py  Project: 3v1l91l/dss4
finder_decisions.rename(columns={
    'age': 'Sender_age',
    'gender': 'Sender_gender',
    'index': 'Sender_index'
},
                        inplace=True)
finder_decisions = finder_decisions.merge(users,
                                          how='left',
                                          left_on='Receiver_id',
                                          right_index=True)
finder_decisions.rename(columns={
    'age': 'Receiver_age',
    'gender': 'Receiver_gender',
    'index': 'Receiver_index'
},
                        inplace=True)

ratings = np.ones(len(finder_decisions))
ratings[finder_decisions['Decision'] == 'skip'] = -1
ratings = ratings.astype(np.float32)
dataset = Interactions(finder_decisions['Sender_index'].values,
                       finder_decisions['Receiver_index'].values, ratings)

from spotlight.cross_validation import random_train_test_split

train, test = random_train_test_split(dataset,
                                      random_state=np.random.RandomState(42))

spotlight_model = torch.load('spotlight.model')

predictions = spotlight_model.predict(test.user_ids, test.item_ids)
# Predictions are continuous scores; compare their sign against the -1/+1 labels.
print((np.sign(predictions) == test.ratings).mean())
Example #29
#!pip install  git+https://github.com/maciejkula/spotlight.git@master#egg=spotlight


# # movielense data
# - Download the 100k version from https://grouplens.org/datasets/movielens/
# - extract to folder './ml-100k/'

import numpy as np
from spotlight.interactions import Interactions
from spotlight.cross_validation import random_train_test_split
user_ids, item_ids, ratings, timestamps = zip(*[i.strip().split('\t') for i in open("./ml-100k/u.data").readlines()])
user_ids = np.array([int(u) for u in user_ids])
item_ids = np.array([int(i) for i in item_ids])
timestamps = np.array([int(s) for s in timestamps])
# ratings are deliberately left out: the interactions are treated as implicit feedback
interactions = Interactions(user_ids=user_ids, item_ids=item_ids, timestamps=timestamps)
train, test = random_train_test_split(interactions)


# Create random noise


import random
preserving_25_percent_items = []
preserving_50_percent_items = []
preserving_75_percent_items = []
vmin = train.item_ids.min()
vmax = train.item_ids.max()
for real_item_idx in train.item_ids:
    random_item_idx = random.randint(vmin, vmax)
    sampling_threshold = random.random()
    if sampling_threshold < .25:
Example #30
from spotlight.cross_validation import random_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import rmse_score
from spotlight.factorization.explicit import ExplicitFactorizationModel

dataset = get_movielens_dataset(variant='100K')

train, test = random_train_test_split(dataset)

model = ExplicitFactorizationModel(n_iter=1)
model.fit(train)

rmse = rmse_score(model, test)

print(rmse)
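The same quickstart works for ranking metrics; a sketch adding MRR on the held-out set (mrr_score lives in spotlight.evaluation alongside rmse_score):

from spotlight.evaluation import mrr_score

mrr = mrr_score(model, test, train=train).mean()
print(mrr)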
Example #31
def trainModelUntilOverfit(dataset, modelSteps, modelIterations,
                           numberDataSplits, embedding_dim, learning_rate):

    numUsers = dataset.num_users
    numMovies = dataset.num_items
    train, test = random_train_test_split(dataset, 0.2)

    print('Split into \n {} and \n {}.'.format(train, test))

    #add random seed
    seed = np.random.RandomState(seed=55555)
    model = ExplicitFactorizationModel(n_iter=modelIterations,
                                       embedding_dim=embedding_dim,
                                       learning_rate=learning_rate,
                                       random_state=seed)

    rmseResults = np.empty((modelSteps * numberDataSplits, 2))
    indexPreviousClosest = ["0"]

    if (numberDataSplits > 1):
        arraySplits = dataSplit(train, numberDataSplits)
        print("Data set split into", len(arraySplits), "*", (arraySplits[1]))
    # Each model step fits the entire dataset
    arrayOfSteps = []
    splitCounter = 0
    fullStepCounter = 0  # increases each time the entire data set has been visited
    currentStep = 0  # increases at every split of the data set (does not reset)
    for i in range(modelSteps * numberDataSplits):
        print("\nStarting step", fullStepCounter)
        print("Data split", splitCounter)
        if (numberDataSplits == 1):
            model.fit(train, verbose=True)
        elif (numberDataSplits > 1):
            print(arraySplits[splitCounter])
            model.fit(arraySplits[splitCounter], verbose=True)

        else:
            print("Invalid number of data splits")
            break

        #predictions for any user are made for all items, matrix has shape (944, 1683)
        modelPredict = np.empty((numUsers, numMovies))
        for userIndex in range(numUsers):
            modelPredict[userIndex, :] = model.predict(userIndex)

        # We take the transpose for tsne formatting (should be more rows than columns)
        modelPredict = modelPredict.T

        # Measure the model's effectiveness (how good predictions are):
        rmseTrain = rmse_score(model, train)
        rmseTest = rmse_score(model, test)
        print("RMSE TEST:", rmseTest, "\n")
        rmseResults[i, :] = [rmseTrain, rmseTest]
        arrayOfSteps += [i]

        if (stopTraining(rmseResults, arrayOfSteps)):
            rmseResults = rmseResults[:len(arrayOfSteps)]
            break

        if (numberDataSplits > 1):
            splitCounter += 1
            if (splitCounter >= len(arraySplits)):
                splitCounter = 0
                fullStepCounter += 1

        currentStep += 1

    return (model, rmseResults)
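stopTraining is not shown; a minimal early-stopping check consistent with how it is called above might look like this (the patience rule is an assumption):

import numpy as np

def stopTraining(rmseResults, arrayOfSteps, patience=3):
    # Hypothetical: stop when test RMSE (column 1) has not improved
    # over the last `patience` steps.
    testRmse = rmseResults[:len(arrayOfSteps), 1]
    if len(testRmse) <= patience:
        return False
    return bool(np.all(np.diff(testRmse[-(patience + 1):]) >= 0))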
Example #32
df.asin = pd.Categorical(df.asin)
df['ProdKey'] = df.asin.cat.codes

df.dtypes

users = df.UserKey
prods = df.ProdKey         
ratings = df.overall

users1 = users.to_numpy(dtype=int)
prods1 = prods.to_numpy(dtype=int)
ratings1 = ratings.to_numpy(dtype=float)

interaction = Interactions(users1, prods1, ratings1)

train, test = random_train_test_split(interaction, random_state=np.random.RandomState(42))

print('Split into \n {} and \n {}.'.format(train, test))

starttime = datetime.now()
model = ExplicitFactorizationModel(n_iter=1)
model.fit(train, verbose=True)

train_rmse = rmse_score(model, train)
test_rmse = rmse_score(model, test)

stoptime = datetime.now()
runtime = stoptime - starttime
print('Runtime:{}'.format(runtime))
print('Split into \n training dataset size: {} \n testing dataset size: {}.'.format(train, test))
print('Train RMSE {:.3f}, test RMSE {:.3f}'.format(train_rmse, test_rmse))
Example #33
        step_size = max_sequence_length

        train, rest = user_based_train_test_split(dataset,
                                                  test_percentage=0.05,
                                                  random_state=random_state)
        test, validation = user_based_train_test_split(
            rest, test_percentage=0.5, random_state=random_state)
        train = train.to_sequence(max_sequence_length=max_sequence_length,
                                  min_sequence_length=min_sequence_length,
                                  step_size=step_size)
        test = test.to_sequence(max_sequence_length=max_sequence_length,
                                min_sequence_length=min_sequence_length,
                                step_size=step_size)
        validation = validation.to_sequence(
            max_sequence_length=max_sequence_length,
            min_sequence_length=min_sequence_length,
            step_size=step_size)
        print('In test {}, in validation {}'.format(len(test.sequences),
                                                    len(validation.sequences)))
    elif args.model == 'factorization':
        train, rest = random_train_test_split(dataset,
                                              test_percentage=test_percentage,
                                              random_state=random_state)
        test, validation = random_train_test_split(rest,
                                                   test_percentage=0.5,
                                                   random_state=random_state)

    experiment_name = '{}_{}'.format(args.dataset, args.model)

    run(experiment_name, train, test, validation, random_state)
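Unlike random_train_test_split, user_based_train_test_split (also in spotlight.cross_validation) assigns each user's interactions wholly to one side of the split, which is what the sequence branch above needs:

from spotlight.cross_validation import user_based_train_test_split

train, rest = user_based_train_test_split(dataset, test_percentage=0.05)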
Example #34
    elif str(args.data).lower() == 'amazon':
        print('Amazon')
        dataset = get_amazon_dataset()
        split = 0.2
    else:
        print('GoodBook')
        dataset = get_goodbooks_dataset()
        split = 0.2
    rmses = []
    mrrs = []
    rs = np.random.RandomState(100)
    pdb.set_trace()
    for i in range(5):
        print('Split - {} , Run {}'.format(split, i))
        train, test = random_train_test_split(dataset,
                                              random_state=rs,
                                              test_percentage=split)
        if args.model == 'implicit':
            model = ImplicitFactorizationModel(n_iter=args.n_epoch,
                                               loss=args.loss,
                                               use_cuda=True,
                                               learning_rate=args.lr,
                                               representation=args.net)
        elif args.model == 'explicit':
            model = ExplicitFactorizationModel(n_iter=args.n_epoch,
                                               loss=args.loss,
                                               use_cuda=True,
                                               learning_rate=args.lr)
        model.fit(train, verbose=0)

        rmse = rmse_score(model, test)