def load_data(dataset, random_state):
    """Load a MovieLens dataset and split it into sequence datasets.

    Parameters
    ----------
    dataset: str
        Variant name passed through to ``get_movielens_dataset``
        (e.g. '100K', '1M').
    random_state: numpy.random.RandomState
        Source of randomness for the user-based splits.

    Returns
    -------
    tuple
        ``(train_nonsequence, train, validation, test)``: the raw 80%
        training interactions plus the three splits converted to
        sequence datasets.
    """
    dataset = get_movielens_dataset(dataset)

    # Fixed windowing; step == max length gives non-overlapping windows.
    # (Dead commented-out experiments — timestamp shuffling and a
    # percentile-based window length — have been removed.)
    max_sequence_length = 100
    min_sequence_length = 50
    step_size = max_sequence_length

    # 80/10/10 user-based split: hold out 20%, then halve the holdout
    # into test and validation.
    train_nonsequence, rest = user_based_train_test_split(
        dataset, test_percentage=0.2, random_state=random_state)
    test, validation = user_based_train_test_split(
        rest, test_percentage=0.5, random_state=random_state)

    train = train_nonsequence.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)
    test = test.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)
    validation = validation.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)

    return train_nonsequence, train, validation, test
def main(args):
    """Entry point: load the requested dataset, build 80/10/10 sequence
    splits, and run an incremental hyperparameter search, summarizing
    the trials after every evaluation."""
    status = 'available' if CUDA else 'not available'
    print("CUDA is {}!".format(status))

    args = parse_args(args)

    # Fix random_state
    seed = 72
    set_seed(seed)
    random_state = np.random.RandomState(seed)

    # Default sequence windowing; step == max length (non-overlapping).
    max_sequence_length = 100
    min_sequence_length = 20
    step_size = max_sequence_length

    if args.dataset == 'amazon':
        # Amazon interactions form shorter sequences, so shrink the windows.
        max_sequence_length = 50
        min_sequence_length = 5
        step_size = max_sequence_length
        dataset = get_amazon_dataset()
    elif args.dataset == 'goodbooks':
        dataset = get_goodbooks_dataset()
    else:
        dataset = get_movielens_dataset(args.dataset.upper())
        # NOTE(review): in the collapsed source the placement of this
        # assignment is ambiguous — it may belong after the whole
        # if/elif/else rather than only in the MovieLens branch. Confirm.
        args.variant = args.dataset

    # 80/10/10 user-based split: hold out 20%, halve into test/valid.
    train, rest = user_based_train_test_split(
        dataset, test_percentage=0.2, random_state=random_state)
    test, valid = user_based_train_test_split(
        rest, test_percentage=0.5, random_state=random_state)

    train = train.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)
    test = test.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)
    valid = valid.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)

    print('model: {}, data: {}'.format(args.model, train))

    # One trials pickle per (model, dataset) pair. optimize() is re-run
    # with max_evals growing by one each iteration, so progress is
    # persisted to disk after every single evaluation.
    fname = 'experiment_{}_{}.pickle'.format(args.model, args.dataset)

    objective = get_objective(train, valid, test, random_state)
    space = hyperparameter_space(args.model)

    for iteration in range(args.num_trials):
        print('Iteration {}'.format(iteration))

        trials = optimize(objective,
                          space,
                          trials_fname=fname,
                          max_evals=iteration + 1)

        summarize_trials(trials)
def run(model_type=None):
    """Split the latest interaction data and launch hyperparameter tuning.

    Prompts interactively for the model type when none is supplied.
    """
    random_state = mrecsys.sequence.__random_state__

    if model_type is None:
        model_type = input('Enter model type (cnn / lstm / pooling): ')

    print('CUDA:', CUDA)

    interactions, time_code, _, _ = load_latest_interactions()

    # Two user-based splits (default percentages): carve off a holdout,
    # then split the holdout again into test and validation.
    train, holdout = user_based_train_test_split(interactions,
                                                 random_state=random_state)
    test, validation = user_based_train_test_split(holdout,
                                                   random_state=random_state)

    print(f'Split into \n {train} and \n {test} and \n {validation}.')

    tuning(train, test, validation, random_state, model_type, time_code)
def _get_synthetic_data(num_users=100,
                        num_items=100,
                        num_interactions=10000,
                        randomness=0.01,
                        order=2,
                        random_state=None,
                        max_sequence_length=10):
    """Generate a synthetic sequential dataset and split it into sequences.

    Parameters
    ----------
    num_users, num_items, num_interactions: int
        Size of the generated dataset.
    randomness: float
        Concentration parameter for the sequential generator.
    order: int
        Markov order of the generated sequences.
    random_state: numpy.random.RandomState, optional
        Randomness source for generation and splitting.
    max_sequence_length: int, optional
        Window length used when converting splits to sequences. Appended
        with the previously hard-coded default (10) so existing callers,
        positional or keyword, are unaffected.

    Returns
    -------
    tuple
        (train, test) sequence datasets.
    """
    interactions = synthetic.generate_sequential(
        num_users=num_users,
        num_items=num_items,
        num_interactions=num_interactions,
        concentration_parameter=randomness,
        order=order,
        random_state=random_state)

    # Sanity check: empirical probability of the most frequent item.
    print('Max prob {}'.format(
        (np.unique(interactions.item_ids, return_counts=True)[1] /
         num_interactions).max()))

    train, test = user_based_train_test_split(interactions,
                                              random_state=random_state)

    train = train.to_sequence(max_sequence_length=max_sequence_length)
    test = test.to_sequence(max_sequence_length=max_sequence_length)

    return train, test
def data_implicit_sequence():
    """Build 100K MovieLens sequence splits and a fitted LSTM sequence model.

    Returns
    -------
    tuple
        (train, test, model) where model has already been fit on train.
    """
    # Shared windowing: length-200 non-overlapping windows.
    seq_kwargs = dict(max_sequence_length=200,
                      min_sequence_length=20,
                      step_size=200)

    interactions = movielens.get_movielens_dataset('100K')

    train, test = user_based_train_test_split(interactions,
                                              random_state=RANDOM_STATE)
    train = train.to_sequence(**seq_kwargs)
    test = test.to_sequence(**seq_kwargs)

    model = ImplicitSequenceModel(loss='adaptive_hinge',
                                  representation='lstm',
                                  batch_size=8,
                                  learning_rate=1e-2,
                                  l2=1e-3,
                                  n_iter=2,
                                  use_cuda=CUDA,
                                  random_state=RANDOM_STATE)
    model.fit(train, verbose=True)

    return train, test, model
def _get_synthetic_data(num_users=100,
                        num_items=100,
                        num_interactions=10000,
                        randomness=0.01,
                        order=2,
                        max_sequence_length=10,
                        random_state=None):
    """Generate a synthetic sequential dataset and return train/test sequences."""
    data = synthetic.generate_sequential(num_users=num_users,
                                         num_items=num_items,
                                         num_interactions=num_interactions,
                                         concentration_parameter=randomness,
                                         order=order,
                                         random_state=random_state)

    # Sanity check: empirical probability of the most frequent item.
    _, counts = np.unique(data.item_ids, return_counts=True)
    print(f'Max prob {(counts / num_interactions).max()}')

    train, test = user_based_train_test_split(data,
                                              random_state=random_state)

    return (train.to_sequence(max_sequence_length=max_sequence_length,
                              step_size=None),
            test.to_sequence(max_sequence_length=max_sequence_length,
                             step_size=None))
def test_user_based_split():
    """The split must partition interactions exactly and hold out ~20% of users."""
    interactions = movielens.get_movielens_dataset('100K')

    train, test = cross_validation.user_based_train_test_split(
        interactions,
        test_percentage=0.2,
        random_state=RANDOM_STATE)

    # No interaction may be lost or duplicated by the split.
    assert len(train) + len(test) == len(interactions)

    # The fraction of held-out users should match the requested percentage.
    test_user_fraction = np.unique(test.user_ids).size / interactions.num_users
    assert np.allclose(test_user_fraction, 0.2, atol=0.001)
def load_data(dataset, random_state):
    """Resolve a dataset by name, split it by user, and build sequence splits.

    Returns
    -------
    tuple
        ``(train_nonsequence, train, validation, test)`` where the first
        entry is the raw (non-sequence) training interactions.
    """
    # Default windowing; the amazon branch overrides it below.
    max_sequence_length, min_sequence_length = 100, 20

    if 'goodbooks' in dataset:
        dataset = get_goodbooks_dataset()
    elif 'amazon' in dataset:
        # This is a dataset with shorter sequences
        max_sequence_length, min_sequence_length = 50, 5
        dataset = get_amazon_dataset()
    else:
        dataset = get_movielens_dataset(dataset)

    # Non-overlapping windows: step equals the window length.
    step_size = max_sequence_length

    # 80/10/10 user-based split.
    train_nonsequence, rest = user_based_train_test_split(
        dataset, test_percentage=0.2, random_state=random_state)
    test, validation = user_based_train_test_split(
        rest, test_percentage=0.5, random_state=random_state)

    def as_sequences(split):
        # Shared conversion so all three splits use identical windowing.
        return split.to_sequence(max_sequence_length=max_sequence_length,
                                 min_sequence_length=min_sequence_length,
                                 step_size=step_size)

    train = as_sequences(train_nonsequence)
    test = as_sequences(test)
    validation = as_sequences(validation)

    return train_nonsequence, train, validation, test
def preprocess_generated(num_users=100, num_items=1000, num_interactions=10000):
    """Generate a synthetic sequential dataset and preprocess it for training.

    Parameters
    ----------
    num_users, num_items, num_interactions: int
        Size of the generated dataset.

    Returns
    -------
    tuple
        ``(dat, dat_seq, ind2val)`` where ``dat`` maps 'train'/'test' to
        Interactions splits, ``dat_seq`` holds the same splits converted
        to sequences, and ``ind2val`` holds index-to-value lookup tables
        (currently only 'itemId').
    """
    from spotlight.datasets.synthetic import generate_sequential
    from spotlight.cross_validation import user_based_train_test_split

    dataset = generate_sequential(num_users=num_users,
                                  num_items=num_items,
                                  num_interactions=num_interactions,
                                  concentration_parameter=0.0001,
                                  order=3)

    dat = dict(zip(["train", "test"],
                   user_based_train_test_split(dataset)))
    dat_seq = {key: val.to_sequence() for key, val in dat.items()}

    # Item indices are the item ids themselves, so the lookup is the
    # identity map. Bug fix: the previous range(dataset.item_ids.max())
    # dropped the largest item id; use max() + 1 so every id is covered.
    ind2val = {
        'itemId': {idx: idx for idx in range(dataset.item_ids.max() + 1)}
    }

    return dat, dat_seq, ind2val
def train_model(df, hyperparams):
    """Fit an ImplicitSequenceModel on an interactions dataframe, report MRR.

    `df` is expected to carry 'user_id', 'item_id' and 'entry_at' columns;
    `hyperparams` is expanded into the model constructor.
    """
    # Fix random_state
    seed = 42
    set_seed(seed)
    random_state = np.random.RandomState(seed)

    # Short, heavily overlapping windows (step of 1).
    seq_kwargs = dict(max_sequence_length=15,
                      min_sequence_length=2,
                      step_size=1)

    # create dataset using interactions dataframe and timestamps
    dataset = Interactions(user_ids=np.array(df['user_id'], dtype='int32'),
                           item_ids=np.array(df['item_id'], dtype='int32'),
                           timestamps=df['entry_at'])

    # create training and test sets using a 80/20 split
    train, test = user_based_train_test_split(dataset,
                                              test_percentage=0.2,
                                              random_state=random_state)

    # convert to sequences
    train = train.to_sequence(**seq_kwargs)
    test = test.to_sequence(**seq_kwargs)

    print(f'data: {train}')

    # initialize and train model
    model = ImplicitSequenceModel(**hyperparams,
                                  use_cuda=CUDA,
                                  random_state=random_state)
    model.fit(train, verbose=True)

    # compute mrr score on test set
    test_mrr = sequence_mrr_score(model, test).mean()
    print(f'MRR score on test set: {test_mrr}')

    return model
    # Tail of a function whose definition starts above this chunk (not
    # visible here): attaches timestamp/step/recommendation columns to the
    # output frame and returns it.
    df_out['timestamp'] = df_data[['timestamp']]
    df_out['step'] = df_data[['step']]
    df_out['item_recommendations'] = item_recommendations
    return df_out


print("Load Data")
train_csv = abspath("../../../resources/train_small_no_header.csv")
test_csv = abspath("../../../resources/test.csv")
subm_csv = abspath("../../../resources/myoutput.csv")

print(f"Reading {train_csv} ...")
df_train = pd.read_csv(train_csv)

# NOTE(review): user_based_train_test_split expects an Interactions dataset,
# not a CSV path string — this call looks wrong, and `df_train` above is read
# but never used. Verify against the spotlight API.
train, test = user_based_train_test_split(train_csv)
train = train.to_sequence()
test = test.to_sequence()

#print(f"Reading {test_csv} ...")
#df_test = pd.read_csv(test_csv)

print("Build and Fit Implicit Sequence Model")
model = ImplicitSequenceModel(n_iter=3, representation='cnn', loss='bpr')
#model.fit(df_train)
model.fit(train)

print("Calculate MRR Score")
# NOTE(review): sequence_mrr_score expects the sequence test set (`test`
# above), not the CSV path `test_csv` — confirm and fix.
mrr = sequence_mrr_score(model, test_csv)
print("MRR Result: ", mrr)
import numpy as np

from spotlight.cross_validation import user_based_train_test_split
from spotlight.datasets.synthetic import generate_sequential
from spotlight.evaluation import sequence_mrr_score
from spotlight.sequence.implicit import ImplicitSequenceModel

# Demo: generate a synthetic third-order sequential dataset, fit a CNN
# sequence model, and evaluate it with sequence MRR.
dataset = generate_sequential(num_users=100,
                              num_items=1000,
                              num_interactions=10000,
                              concentration_parameter=0.01,
                              order=3)

# User-based split (default percentages), then sequence conversion.
train, test = user_based_train_test_split(dataset)
train = train.to_sequence()
test = test.to_sequence()

model = ImplicitSequenceModel(n_iter=3, representation='cnn', loss='bpr')
model.fit(train)

mrr = sequence_mrr_score(model, test)
    # Tail of a function whose definition lies above this chunk (not visible
    # here): persist the hyperparameters with their mean test/validation MRR.
    results.save(hyperparameters, test_mrr.mean(), val_mrr.mean())

    return results


if __name__ == '__main__':
    # Non-overlapping windows of length 200.
    max_sequence_length = 200
    min_sequence_length = 20
    step_size = 200
    random_state = np.random.RandomState(100)

    dataset = get_movielens_dataset('1M')

    # First split uses the function's default test percentage; the holdout
    # is then halved into test and validation.
    train, rest = user_based_train_test_split(dataset,
                                              random_state=random_state)
    test, validation = user_based_train_test_split(rest,
                                                   test_percentage=0.5,
                                                   random_state=random_state)

    train = train.to_sequence(max_sequence_length=max_sequence_length,
                              min_sequence_length=min_sequence_length,
                              step_size=step_size)
    test = test.to_sequence(max_sequence_length=max_sequence_length,
                            min_sequence_length=min_sequence_length,
                            step_size=step_size)
    validation = validation.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)

    # Mode of operation is supplied as the first CLI argument.
    mode = sys.argv[1]
def main(max_evals):
    """Load interaction data, build 80/10/10 sequence splits, and run a
    hyperparameter search for up to ``max_evals`` evaluations, printing a
    summary of the trials at the end."""
    cuda_status = 'available' if CUDA else 'not available'
    print(f"CUDA is {cuda_status}!")

    # Pin every source of randomness to a fixed seed.
    seed = 42
    set_seed(seed)
    random_state = np.random.RandomState(seed)

    # Short, heavily overlapping sequence windows.
    max_sequence_length = 15
    min_sequence_length = 2
    step_size = 1

    df = pd.read_csv(FILE_PATH)

    # Drop optional feature/bookkeeping columns when present.
    if 'time_of_day' in df.columns:
        df = df.drop(columns=['time_of_day', 'time_of_year',
                              'is_content_block'])
    if 'Unnamed: 0' in df.columns:
        df = df.drop(columns=['Unnamed: 0', 'js_key'])

    sub_col = 'subscriber_id'
    block_col = 'ddi_id'
    time_col = 'entry_at'

    # Sort chronologically and renumber the rows.
    df[time_col] = pd.to_datetime(df[time_col])
    df.sort_values(by=time_col, inplace=True)
    df.reset_index(inplace=True)
    df.drop(columns='index', inplace=True)

    # Dense id mappings compatible with spotlight; item ids start at 1.
    sub_mapping = {key: idx for idx, key in enumerate(df[sub_col].unique())}
    block_mapping = {key: idx
                     for idx, key in enumerate(df[block_col].unique(), 1)}
    df['user_id'] = df[sub_col].map(sub_mapping)
    df['item_id'] = df[block_col].map(block_mapping)

    # Wrap the frame in a spotlight Interactions dataset.
    dataset = Interactions(user_ids=np.array(df['user_id'], dtype='int32'),
                           item_ids=np.array(df['item_id'], dtype='int32'),
                           timestamps=df[time_col])

    # 80/10/10 user-based split into train/validation/test.
    train, holdout = user_based_train_test_split(
        dataset, test_percentage=0.2, random_state=random_state)
    test, valid = user_based_train_test_split(
        holdout, test_percentage=0.5, random_state=random_state)

    train = train.to_sequence(max_sequence_length=max_sequence_length,
                              min_sequence_length=min_sequence_length,
                              step_size=step_size)
    test = test.to_sequence(max_sequence_length=max_sequence_length,
                            min_sequence_length=min_sequence_length,
                            step_size=step_size)
    valid = valid.to_sequence(max_sequence_length=max_sequence_length,
                              min_sequence_length=min_sequence_length,
                              step_size=step_size)

    print(f'data: {train}')

    # Timestamped trials file so repeated runs never collide.
    dtime = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    fname = f'./experiment_{dtime}.pickle'

    objective = get_objective(train, valid, test, random_state)
    space = hyperparameter_space()

    trials = optimize(objective, space, trials_fname=fname,
                      max_evals=max_evals)

    summarize_trials(trials)

    return trials
    # Tail of a function whose definition lies above this chunk (not visible
    # here): persist the hyperparameters with their mean test/validation MRR.
    results.save(hyperparameters, test_mrr.mean(), val_mrr.mean())

    return results


if __name__ == '__main__':
    # Non-overlapping windows of length 200.
    max_sequence_length = 200
    min_sequence_length = 20
    step_size = 200
    random_state = np.random.RandomState(100)

    dataset = get_movielens_dataset('1M')

    # First split uses the function's default test percentage; the holdout
    # is then halved into test and validation.
    train, rest = user_based_train_test_split(dataset,
                                              random_state=random_state)
    test, validation = user_based_train_test_split(rest,
                                                   test_percentage=0.5,
                                                   random_state=random_state)

    train = train.to_sequence(max_sequence_length=max_sequence_length,
                              min_sequence_length=min_sequence_length,
                              step_size=step_size)
    test = test.to_sequence(max_sequence_length=max_sequence_length,
                            min_sequence_length=min_sequence_length,
                            step_size=step_size)
    validation = validation.to_sequence(max_sequence_length=max_sequence_length,
                                        min_sequence_length=min_sequence_length,
                                        step_size=step_size)

    # Mode of operation is supplied as the first CLI argument.
    mode = sys.argv[1]