def load_data(dataset, random_state):
    dataset = get_movielens_dataset(dataset)
    train, rest = random_train_test_split(dataset, random_state=random_state)
    test, validation = random_train_test_split(rest,
                                               test_percentage=0.5,
                                               random_state=random_state)
    return train, validation, test
def obtener_interacciones_gui(self, ruta_ratings, sep_ratings, encoding_ratings):
    """
    obtener_interacciones_gui method.

    Obtains the interactions needed to build the Spotlight models.
    This method is only used in the web interface.

    Parameters
    ----------
    ruta_ratings: str
        path of the file containing the ratings.
    sep_ratings: str
        separator used in the ratings file.
    encoding_ratings: str
        encoding used in the ratings file.
    """
    global train, test

    # Obtain the ratings dataframe
    ratings_df = Entrada.leer_csv(ruta_ratings, sep_ratings, encoding_ratings)
    ratings_df.sort_values([ratings_df.columns.values[0],
                            ratings_df.columns.values[1]], inplace=True)

    # Obtain arrays with the user and item ids
    users_ids = np.asarray(ratings_df[ratings_df.columns.values[0]].tolist(), dtype=np.int32)
    items_ids = np.asarray(ratings_df[ratings_df.columns.values[1]].tolist(), dtype=np.int32)

    # Transform the ratings dataframe into interactions the models can use
    if self.opcion_time == 1:
        timestamps = np.asarray(ratings_df[ratings_df.columns.values[3]].tolist(), dtype=np.int32)
        if self.opcion_modelo == 1:
            ratings = np.asarray(ratings_df[ratings_df.columns.values[2]].tolist(), dtype=np.float32)
            interacciones = Interactions(users_ids, items_ids,
                                         ratings=ratings, timestamps=timestamps)
            train, test = random_train_test_split(interacciones)
        else:
            interacciones = Interactions(users_ids, items_ids, timestamps=timestamps)
            train, test = random_train_test_split(interacciones)
            if self.opcion_modelo == 3:
                train = train.to_sequence()
                test = test.to_sequence()
    else:
        if self.opcion_modelo == 1:
            ratings = np.asarray(ratings_df[ratings_df.columns.values[2]].tolist(), dtype=np.float32)
            interacciones = Interactions(users_ids, items_ids, ratings=ratings)
        else:
            interacciones = Interactions(users_ids, items_ids)
        train, test = random_train_test_split(interacciones)

    # Save the training and test interactions
    print("Saving the train interactions")
    guardar_datos_pickle(train, 'las interacciones de entrenamiento')
    print("Saving the test interactions")
    guardar_datos_pickle(test, 'las interacciones de test')
def dataSplit(train, numberDataSplits):
    arrayOfSplits = []
    split1, split2 = random_train_test_split(train, 1.0 / numberDataSplits)
    arrayOfSplits += [split2]
    splitLength = len(split2.ratings)
    while splitLength < len(split1.ratings):
        splitPercentage = splitLength / len(split1.ratings)
        split1, split2 = random_train_test_split(split1, splitPercentage)
        arrayOfSplits += [split2]
    arrayOfSplits += [split1]
    return arrayOfSplits
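# A minimal usage sketch for dataSplit above, assuming spotlight's MovieLens
# helper and the 80/20 split convention used elsewhere in this file.
from spotlight.cross_validation import random_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset

dataset = get_movielens_dataset(variant='100K')
train, test = random_train_test_split(dataset, test_percentage=0.2)

# Split the training interactions into (roughly) four equal chunks; each
# chunk is itself an Interactions object.
for chunk in dataSplit(train, 4):
    print(len(chunk))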
def cross_validation(self, interactions: Interactions) -> tuple:
    """Randomly split interactions between training and testing.

    This function takes an interaction set and splits it into
    two disjoint sets, a training set and a test set.

    Args:
        interactions (spotlight.interactions.Interactions):
            Matrix of user-item interactions.

    Returns:
        tuple: (spotlight.interactions.Interactions,
                spotlight.interactions.Interactions),
            A tuple of (train data, test data).
    """
    def interactions_to_sequence(f_train: Interactions, f_test: Interactions):
        train, test = f_train.to_sequence(), f_test.to_sequence()
        return train, test

    logger = logging.getLogger()
    train, test = random_train_test_split(interactions)
    if self._models in ('S_POOL', 'S_CNN', 'S_LSTM'):
        train, test = interactions_to_sequence(train, test)
    logger.info('Split into \n {} and \n {}.'.format(train, test))
    return (train, test)
def get_train_test_set(interaction):
    """
    :param interaction: Our interaction object (input data)
    :return: Our interaction object (input data) split into train and test sets.
    """
    return random_train_test_split(interaction)
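# A minimal sketch of calling get_train_test_set; the toy user/item/rating
# arrays below are illustrative assumptions, not data from this project.
import numpy as np
from spotlight.interactions import Interactions
from spotlight.cross_validation import random_train_test_split

user_ids = np.array([0, 0, 1, 1, 2, 2, 3, 3], dtype=np.int32)
item_ids = np.array([0, 1, 1, 2, 0, 2, 1, 3], dtype=np.int32)
ratings = np.array([5, 3, 4, 2, 1, 5, 4, 3], dtype=np.float32)

interaction = Interactions(user_ids, item_ids, ratings=ratings)
train, test = get_train_test_set(interaction)
print(train, test)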
def build_mean_baseline_model(data):
    train_data, test_data = random_train_test_split(data)
    df = pd.DataFrame(train_data.tocoo().toarray())
    print(df.columns)
    avg_ratings, mean_avg = train(train_data)
    rmse = predict(test_data, avg_ratings, mean_avg)
    return rmse
def test_bloom(compression_ratio, expected_rmse):
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    user_embeddings = BloomEmbedding(interactions.num_users, 32,
                                     compression_ratio=compression_ratio,
                                     num_hash_functions=2)
    item_embeddings = BloomEmbedding(interactions.num_items, 32,
                                     compression_ratio=compression_ratio,
                                     num_hash_functions=2)
    network = BilinearNet(interactions.num_users,
                          interactions.num_items,
                          user_embedding_layer=user_embeddings,
                          item_embedding_layer=item_embeddings)

    model = ExplicitFactorizationModel(loss='regression',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-5,
                                       representation=network,
                                       use_cuda=CUDA)
    model.fit(train)
    print(model)

    rmse = rmse_score(model, test)
    print(rmse)

    assert rmse - EPSILON < expected_rmse
def train_initial_model():
    dataset = get_movielens_dataset(variant='100K')
    train, test = random_train_test_split(dataset,
                                          random_state=np.random.RandomState(42))

    model = ImplicitFactorizationModel(loss='adaptive_hinge',
                                       embedding_dim=128,  # latent dimensionality
                                       n_iter=10,          # number of epochs of training
                                       batch_size=1024,    # minibatch size
                                       l2=1e-9,            # strength of L2 regularization
                                       learning_rate=1e-3,
                                       use_cuda=torch.cuda.is_available())

    print('Fitting the model')
    model.fit(train, verbose=True)
    print(type(model))

    with open('models/filmclub.model', 'wb') as model_file:
        pickle.dump(model, model_file)

    dataset.num_users = 1000000
    with open('data/dataset.pkl', 'wb') as dataset_file:
        pickle.dump(dataset, dataset_file)

    train_rmse = rmse_score(model, train)
    test_rmse = rmse_score(model, test)
    print('Train RMSE {:.3f}, test RMSE {:.3f}'.format(train_rmse, test_rmse))
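# A small companion sketch: loading the model and dataset pickled by
# train_initial_model above and scoring one user; the paths mirror the ones
# used in that function.
import pickle

with open('models/filmclub.model', 'rb') as model_file:
    model = pickle.load(model_file)
with open('data/dataset.pkl', 'rb') as dataset_file:
    dataset = pickle.load(dataset_file)

# With item_ids omitted, predict() scores every item for the given user id.
scores = model.predict(0)
print(scores.shape, scores[:5])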
def test_bpr_bloom(compression_ratio, expected_mrr):
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    user_embeddings = BloomEmbedding(interactions.num_users, 32,
                                     compression_ratio=compression_ratio,
                                     num_hash_functions=2)
    item_embeddings = BloomEmbedding(interactions.num_items, 32,
                                     compression_ratio=compression_ratio,
                                     num_hash_functions=2)
    network = BilinearNet(interactions.num_users,
                          interactions.num_items,
                          user_embedding_layer=user_embeddings,
                          item_embedding_layer=item_embeddings)

    model = ImplicitFactorizationModel(loss='bpr',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-6,
                                       representation=network,
                                       use_cuda=CUDA)
    model.fit(train)
    print(model)

    mrr = mrr_score(model, test, train=train).mean()

    assert mrr > expected_mrr
def data():
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)
    return train, test
def load_data(dataset, random_state):
    if 'goodbooks' in dataset:
        dataset = get_goodbooks_dataset()
    elif 'amazon' in dataset:
        dataset = get_amazon_dataset()
    else:
        dataset = get_movielens_dataset(dataset)

    train, rest = random_train_test_split(dataset,
                                          test_percentage=0.05,
                                          random_state=random_state)
    test, validation = random_train_test_split(rest,
                                               test_percentage=0.5,
                                               random_state=random_state)
    return train, validation, test
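# A quick sketch of what this load_data returns for the MovieLens branch:
# a 95 / 2.5 / 2.5 percent split, since the 5% held-out chunk is halved again.
train, validation, test = load_data('100K', np.random.RandomState(42))
print(len(train), len(validation), len(test))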
def obtener_interacciones(self):
    """
    obtener_interacciones method.

    Obtains the interactions needed by the Spotlight models.
    This method is only used in the text interface.
    """
    global train, test

    # Obtain the ratings dataframe
    Entrada.obtener_datos()
    ratings_df = Entrada.ratings_df

    # Obtain arrays with the user and item ids
    users_ids = np.asarray(ratings_df[ratings_df.columns.values[0]].tolist(), dtype=np.int32)
    items_ids = np.asarray(ratings_df[ratings_df.columns.values[1]].tolist(), dtype=np.int32)

    # Transform the ratings dataframe into interactions the models can use
    if self.opcion_time == 1:
        timestamps = np.asarray(ratings_df[ratings_df.columns.values[3]].tolist(), dtype=np.int32)
        if self.opcion_modelo == 1:
            ratings = np.asarray(ratings_df[ratings_df.columns.values[2]].tolist(), dtype=np.float32)
            interacciones = Interactions(users_ids, items_ids,
                                         ratings=ratings, timestamps=timestamps)
            train, test = random_train_test_split(interacciones)
        else:
            interacciones = Interactions(users_ids, items_ids, timestamps=timestamps)
            train, test = random_train_test_split(interacciones)
            if self.opcion_modelo == 3:
                train = train.to_sequence()
                test = test.to_sequence()
    else:
        if self.opcion_modelo == 1:
            ratings = np.asarray(ratings_df[ratings_df.columns.values[2]].tolist(), dtype=np.float32)
            interacciones = Interactions(users_ids, items_ids, ratings=ratings)
        else:
            interacciones = Interactions(users_ids, items_ids)
        train, test = random_train_test_split(interacciones)

    # Save the training and test interactions
    print("Saving the train interactions")
    guardar_datos_pickle(train, 'las interacciones de entrenamiento')
    print("Saving the test interactions")
    guardar_datos_pickle(test, 'las interacciones de test')
def best_params_spotlight(losses, n_iters, batch_sizes, l2s, learning_rates,
                          embedding_dims, train_data, t=Timer()):
    rmses = dict()
    params = dict()
    t.start()
    for loss in losses:
        params['loss'] = loss
        for n_iter in n_iters:
            params['n_iter'] = n_iter
            for batch_size in batch_sizes:
                params['batch_size'] = batch_size
                for l2 in l2s:
                    params['l2'] = l2
                    for learning_rate in learning_rates:
                        params['learning_rate'] = learning_rate
                        for embedding_dim in embedding_dims:
                            params['embedding_dim'] = embedding_dim
                            model = ExplicitFactorizationModel(
                                loss=loss,  # use the grid value, not a hard-coded loss
                                embedding_dim=embedding_dim,  # latent dimensionality
                                n_iter=n_iter,                # number of epochs of training
                                batch_size=batch_size,        # minibatch size
                                l2=l2,                        # strength of L2 regularization
                                learning_rate=learning_rate,
                                use_cuda=torch.cuda.is_available())
                            params['model'] = model
                            train_tr_data, test_tr_data = random_train_test_split(
                                train_data,
                                random_state=np.random.RandomState(42))
                            model.fit(train_tr_data, verbose=True)
                            rmse = rmse_score(model, test_tr_data)
                            # Store a copy: params is mutated on every iteration,
                            # so storing the dict itself would alias all entries.
                            rmses[rmse] = dict(params)
                            print("-----------Time: {}, Loss: {}, n_iter: {}, l2: {}, "
                                  "batch_size: {}, learning_rate: {}, embedding_dim: {}, "
                                  "rmse: {}-------------\n\n"
                                  .format(t.stop(), loss, n_iter, l2, batch_size,
                                          learning_rate, embedding_dim, rmse))
                            # restart timer
                            t.start()
    return rmses
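# A hedged usage sketch for best_params_spotlight, relying on the `return
# rmses` added above; the grid values and the `train` interactions are
# illustrative, and Timer is assumed to expose start()/stop().
rmses = best_params_spotlight(losses=['regression'],
                              n_iters=[5, 10],
                              batch_sizes=[256, 1024],
                              l2s=[1e-6],
                              learning_rates=[1e-3, 1e-2],
                              embedding_dims=[32, 64],
                              train_data=train)
best_rmse = min(rmses)
print('Best RMSE {:.3f} with params {}'.format(best_rmse, rmses[best_rmse]))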
def build_model(data, loss, embedding_dim, n_iter, batch_size, l2,
                learning_rate, **kwargs):
    model = ExplicitFactorizationModel(
        loss=loss,
        embedding_dim=embedding_dim,  # latent dimensionality
        n_iter=n_iter,                # number of epochs of training
        batch_size=batch_size,        # minibatch size
        l2=l2,                        # strength of L2 regularization
        learning_rate=learning_rate,
        use_cuda=torch.cuda.is_available())
    train, test = random_train_test_split(
        data, random_state=np.random.RandomState(42))
    model.fit(train, verbose=True)
    test_rmse = rmse_score(model, test)
    return test_rmse
def test_precision_recall(data, k):
    (train, test, model) = data
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    precision, recall = precision_recall_score(model, test, train, k=k)

    assert precision.shape == recall.shape

    if not isinstance(k, list):
        assert len(precision.shape) == 1
    else:
        assert precision.shape[1] == len(k)
def test_poisson():
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ExplicitFactorizationModel(loss='poisson',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-3,
                                       l2=1e-6)
    model.fit(train)

    rmse = rmse_score(model, test)

    assert rmse < 1.0
def test_adaptive_hinge():
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ImplicitFactorizationModel(loss='adaptive_hinge',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-6)
    model.fit(train)

    mrr = mrr_score(model, test, train=train).mean()

    assert mrr > 0.07
def data():
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ImplicitFactorizationModel(loss='bpr',
                                       n_iter=1,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-6,
                                       random_state=RANDOM_STATE,
                                       use_cuda=CUDA)
    model.fit(train)

    return train, test, model
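# The data() functions above read like pytest fixtures from spotlight's test
# suite: tests such as test_precision_recall receive them by naming them as a
# parameter. A minimal sketch of that wiring; the @pytest.fixture decorator
# placement and the test body are illustrative assumptions.
import pytest

@pytest.fixture(scope='module')
def data():
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)
    model = ImplicitFactorizationModel(loss='bpr', n_iter=1,
                                       random_state=RANDOM_STATE)
    model.fit(train)
    return train, test, model

def test_mrr_positive(data):
    train, test, model = data
    assert mrr_score(model, test, train=train).mean() > 0.0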
def test_bpr():
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ImplicitFactorizationModel(loss='bpr',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-6,
                                       use_cuda=CUDA)
    model.fit(train)

    mrr = mrr_score(model, test, train=train).mean()

    assert mrr + EPSILON > 0.07
def test_to_sequence(max_sequence_length, step_size):
    interactions = movielens.get_movielens_dataset('100K')
    _, interactions = random_train_test_split(interactions)

    sequences = interactions.to_sequence(
        max_sequence_length=max_sequence_length,
        step_size=step_size)

    if step_size == 1:
        assert sequences.sequences.shape == (len(interactions),
                                             max_sequence_length)
    else:
        assert sequences.sequences.shape[1] == max_sequence_length

    _test_just_padding(sequences.sequences)
    _test_final_column_no_padding(sequences.sequences)
    _test_shifted(sequences.user_ids,
                  sequences.sequences,
                  step_size)
    _test_temporal_order(sequences.user_ids,
                         sequences.sequences,
                         interactions)
def test_check_input():
    # Train for a single iteration.
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ExplicitFactorizationModel(loss='regression',
                                       n_iter=1,
                                       batch_size=1024,
                                       learning_rate=1e-3,
                                       l2=1e-6)
    model.fit(train)

    # Modify data to make it incompatible with the original model.
    train.user_ids[0] = train.user_ids.max() + 1
    with pytest.raises(ValueError):
        model.fit(train)
def test_bpr_custom_optimizer():
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    def adagrad_optimizer(model_params, lr=1e-2, weight_decay=1e-6):
        return torch.optim.Adagrad(model_params,
                                   lr=lr,
                                   weight_decay=weight_decay)

    model = ImplicitFactorizationModel(loss='bpr',
                                       n_iter=10,
                                       batch_size=1024,
                                       optimizer_func=adagrad_optimizer)
    model.fit(train)

    mrr = mrr_score(model, test, train=train).mean()

    assert mrr > 0.06
def test_logistic():
    interactions = movielens.get_movielens_dataset('100K')

    # Convert to binary
    interactions.ratings = (interactions.ratings > 3).astype(np.float32)
    # Convert from (0, 1) to (-1, 1)
    interactions.ratings = interactions.ratings * 2 - 1

    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)

    model = ExplicitFactorizationModel(loss='logistic',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-3,
                                       l2=1e-6,
                                       use_cuda=CUDA)
    model.fit(train)

    rmse = rmse_score(model, test)

    assert rmse - EPSILON < 1.05
finder_decisions.rename(columns={
    'age': 'Sender_age',
    'gender': 'Sender_gender',
    'index': 'Sender_index'
}, inplace=True)
finder_decisions = finder_decisions.merge(users, how='left',
                                          left_on='Receiver_id',
                                          right_index=True)
finder_decisions.rename(columns={
    'age': 'Receiver_age',
    'gender': 'Receiver_gender',
    'index': 'Receiver_index'
}, inplace=True)

ratings = np.ones(len(finder_decisions))
ratings[finder_decisions['Decision'] == 'skip'] = -1
ratings = ratings.astype(np.float32)

dataset = Interactions(finder_decisions['Sender_index'].values,
                       finder_decisions['Receiver_index'].values,
                       ratings)

from spotlight.cross_validation import random_train_test_split
train, test = random_train_test_split(dataset,
                                      random_state=np.random.RandomState(42))

spotlight_model = torch.load('spotlight.model')
predictions = spotlight_model.predict(test.user_ids, test.item_ids)
# Predictions are continuous scores while the ratings are -1/1, so compare
# signs rather than exact values to get a meaningful accuracy.
print((np.sign(predictions) == test.ratings).sum() / len(predictions))
#!pip install git+https://github.com/maciejkula/spotlight.git@master#egg=spotlight
#
# movielens data
# - Download the 100k version from https://grouplens.org/datasets/movielens/
# - extract to folder './ml-100k/'
import numpy as np
from spotlight.interactions import Interactions
from spotlight.cross_validation import random_train_test_split

user_ids, item_ids, ratings, timestamps = zip(*[i.strip().split('\t')
                                                for i in open("./ml-100k/u.data").readlines()])
user_ids = np.array([int(u) for u in list(user_ids)])
item_ids = np.array([int(i) for i in list(item_ids)])
timestamps = np.array([int(s) for s in list(timestamps)])

interactions = Interactions(user_ids=user_ids,
                            item_ids=item_ids,
                            timestamps=timestamps)
train, test = random_train_test_split(interactions)

# Create random noise
import random

preserving_25_percent_items = []
preserving_50_percent_items = []
preserving_75_percent_items = []

vmin = train.item_ids.min()
vmax = train.item_ids.max()

for real_item_idx in train.item_ids:
    random_item_idx = random.randint(vmin, vmax)
    sampling_threshold = random.random()
    if sampling_threshold < .25:
from spotlight.cross_validation import random_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import rmse_score
from spotlight.factorization.explicit import ExplicitFactorizationModel

dataset = get_movielens_dataset(variant='100K')
train, test = random_train_test_split(dataset)

model = ExplicitFactorizationModel(n_iter=1)
model.fit(train)

rmse = rmse_score(model, test)
print(rmse)
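# A companion sketch to the quickstart above: the same split evaluated with
# spotlight's implicit-feedback model and MRR instead of RMSE.
from spotlight.cross_validation import random_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import mrr_score
from spotlight.factorization.implicit import ImplicitFactorizationModel

dataset = get_movielens_dataset(variant='100K')
train, test = random_train_test_split(dataset)

model = ImplicitFactorizationModel(loss='bpr', n_iter=1)
model.fit(train)

# Passing train= excludes training interactions from the ranking, mirroring
# the test functions above.
mrr = mrr_score(model, test, train=train).mean()
print(mrr)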
def trainModelUntilOverfit(dataset, modelSteps, modelIterations, numberDataSplits,
                           embedding_dim, learning_rate):
    numUsers = dataset.num_users
    numMovies = dataset.num_items
    train, test = random_train_test_split(dataset, 0.2)
    print('Split into \n {} and \n {}.'.format(train, test))

    # add random seed
    seed = np.random.RandomState(seed=55555)
    model = ExplicitFactorizationModel(n_iter=modelIterations,
                                       embedding_dim=embedding_dim,
                                       learning_rate=learning_rate,
                                       random_state=seed)
    rmseResults = np.empty((modelSteps * numberDataSplits, 2))
    indexPreviousClosest = ["0"]

    if numberDataSplits > 1:
        arraySplits = dataSplit(train, numberDataSplits)
        print("Data set split into", len(arraySplits), "*", (arraySplits[1]))

    # Each model step fits the entire dataset
    arrayOfSteps = []
    splitCounter = 0
    fullStepCounter = 0  # increases each time the entire data set has been visited
    currentStep = 0      # increases at every split of the data set (does not reset)
    for i in range(modelSteps * numberDataSplits):
        print("\nStarting step", fullStepCounter)
        print("Data split", splitCounter)
        if numberDataSplits == 1:
            model.fit(train, verbose=True)
        elif numberDataSplits > 1:
            print(arraySplits[splitCounter])
            model.fit(arraySplits[splitCounter], verbose=True)
        else:
            print("Invalid number of data splits")
            break

        # Predictions for any user are made for all items;
        # the matrix has shape (944, 1683)
        modelPredict = np.empty((numUsers, numMovies))
        for userIndex in range(numUsers):
            modelPredict[userIndex, :] = model.predict(userIndex)
        # We take the transpose for tsne formatting
        # (should be more rows than columns)
        modelPredict = modelPredict.T

        # Measure the model's effectiveness (how good predictions are):
        rmseTrain = rmse_score(model, train)
        rmseTest = rmse_score(model, test)
        print("RMSE TEST:", rmseTest, "\n")
        rmseResults[i, :] = [rmseTrain, rmseTest]
        arrayOfSteps += [i]
        if stopTraining(rmseResults, arrayOfSteps):
            rmseResults = rmseResults[:len(arrayOfSteps)]
            break

        if numberDataSplits > 1:
            splitCounter += 1
            if splitCounter >= len(arraySplits):
                splitCounter = 0
                fullStepCounter += 1
        currentStep += 1

    return (model, rmseResults)
df.asin = pd.Categorical(df.asin)
df['ProdKey'] = df.asin.cat.codes
df.dtypes

users = df.UserKey
prods = df.ProdKey
ratings = df.overall

users1 = users.to_numpy(dtype=int)
prods1 = prods.to_numpy(dtype=int)
ratings1 = ratings.to_numpy(dtype=float)

interaction = Interactions(users1, prods1, ratings1)
train, test = random_train_test_split(interaction,
                                      random_state=np.random.RandomState(42))
print('Split into \n {} and \n {}.'.format(train, test))

starttime = datetime.now()
model = ExplicitFactorizationModel(n_iter=1)
model.fit(train, verbose=True)
train_rmse = rmse_score(model, train)
test_rmse = rmse_score(model, test)
stoptime = datetime.now()
runtime = stoptime - starttime

print('Runtime:{}'.format(runtime))
print('Split into \n training dataset size: {} \n testing dataset size: {}.'.format(train, test))
print('Train RMSE {:.3f}, test RMSE {:.3f}'.format(train_rmse, test_rmse))
    step_size = max_sequence_length

    train, rest = user_based_train_test_split(dataset,
                                              test_percentage=0.05,
                                              random_state=random_state)
    test, validation = user_based_train_test_split(
        rest,
        test_percentage=0.5,
        random_state=random_state)
    train = train.to_sequence(max_sequence_length=max_sequence_length,
                              min_sequence_length=min_sequence_length,
                              step_size=step_size)
    test = test.to_sequence(max_sequence_length=max_sequence_length,
                            min_sequence_length=min_sequence_length,
                            step_size=step_size)
    validation = validation.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)

    print('In test {}, in validation {}'.format(len(test.sequences),
                                                len(validation.sequences)))
elif args.model == 'factorization':
    train, rest = random_train_test_split(dataset,
                                          test_percentage=test_percentage,
                                          random_state=random_state)
    test, validation = random_train_test_split(rest,
                                               test_percentage=0.5,
                                               random_state=random_state)

experiment_name = '{}_{}'.format(args.dataset, args.model)

run(experiment_name, train, test, validation, random_state)
elif str(args.data).lower() == 'amazon':
    print('Amazon')
    dataset = get_amazon_dataset()
    split = 0.2
else:
    print('GoodBook')
    dataset = get_goodbooks_dataset()
    split = 0.2

rmses = []
mrrs = []
rs = np.random.RandomState(100)

for i in range(5):
    print('Split - {} , Run {}'.format(split, i))
    train, test = random_train_test_split(dataset,
                                          random_state=rs,
                                          test_percentage=split)
    if args.model == 'implicit':
        model = ImplicitFactorizationModel(n_iter=args.n_epoch,
                                           loss=args.loss,
                                           use_cuda=True,
                                           learning_rate=args.lr,
                                           representation=args.net)
    elif args.model == 'explicit':
        model = ExplicitFactorizationModel(n_iter=args.n_epoch,
                                           loss=args.loss,
                                           use_cuda=True,
                                           learning_rate=args.lr)
    model.fit(train, verbose=0)
    rmse = rmse_score(model, test)