def __init__(self, loss='regression', embedding_dim=32, n_iter=10, batch_size=256, l2=0.0, learning_rate=1e-2, optimizer_func=None, use_cuda=False, representation=None, sparse=False, random_state=None):
    """Store hyperparameters and seed the RNGs for an explicit sequence model.

    NOTE(review): the network/optimizer/loss attributes start as None and
    are presumably constructed later (e.g. in fit()) — confirm against the
    rest of the class body.
    """
    assert loss in {'regression', 'poisson', 'logistic'}

    # Training hyperparameters.
    self._loss = loss
    self._embedding_dim = embedding_dim
    self._n_iter = n_iter
    self._batch_size = batch_size
    self._learning_rate = learning_rate
    self._l2 = l2

    # Hardware / representation configuration.
    self._use_cuda = use_cuda
    self._representation = representation
    self._sparse = sparse
    self._optimizer_func = optimizer_func

    # Fall back to a fresh RandomState when none was supplied.
    self._random_state = random_state if random_state else np.random.RandomState()

    # Populated later; initialized to None here.
    self._num_users = None
    self._num_items = None
    self._net = None
    self._optimizer = None
    self._loss_func = None

    # Derive a seed from the model's own RNG for the global set_seed helper.
    derived_seed = self._random_state.randint(-10**8, 10**8)
    set_seed(derived_seed, cuda=self._use_cuda)
def main(args):
    """Run the hyperparameter search for one model/dataset pair.

    ``args`` arrives as a raw argument list and is replaced in place by the
    parsed namespace from parse_args().
    """
    print("CUDA is {}!".format('available' if CUDA else 'not available'))
    args = parse_args(args)

    # Fix random_state
    seed = 72
    set_seed(seed)
    random_state = np.random.RandomState(seed)

    # Dataset-specific sequence windows; amazon uses shorter sequences.
    if args.dataset == 'amazon':
        max_sequence_length = 50
        min_sequence_length = 5
        dataset = get_amazon_dataset()
    elif args.dataset == 'goodbooks':
        max_sequence_length = 100
        min_sequence_length = 20
        dataset = get_goodbooks_dataset()
    else:
        max_sequence_length = 100
        min_sequence_length = 20
        dataset = get_movielens_dataset(args.dataset.upper())
        # NOTE(review): variant is only recorded for movielens-style
        # datasets — confirm downstream consumers expect that.
        args.variant = args.dataset
    step_size = max_sequence_length

    # User-based 80/10/10 split into train/valid/test.
    train, rest = user_based_train_test_split(
        dataset, test_percentage=0.2, random_state=random_state)
    test, valid = user_based_train_test_split(
        rest, test_percentage=0.5, random_state=random_state)

    def to_seq(interactions):
        # Shared sequence conversion for all three splits.
        return interactions.to_sequence(
            max_sequence_length=max_sequence_length,
            min_sequence_length=min_sequence_length,
            step_size=step_size)

    train, test, valid = to_seq(train), to_seq(test), to_seq(valid)

    print('model: {}, data: {}'.format(args.model, train))

    fname = 'experiment_{}_{}.pickle'.format(args.model, args.dataset)
    objective = get_objective(train, valid, test, random_state)
    space = hyperparameter_space(args.model)

    for iteration in range(args.num_trials):
        print('Iteration {}'.format(iteration))
        trials = optimize(objective, space, trials_fname=fname, max_evals=iteration + 1)
        summarize_trials(trials)
def __init__(self, loss='pointwise', embedding_dim=32, n_iter=10, batch_size=256, l2=0.0, learning_rate=1e-2, optimizer_func=None, use_cuda=False, sparse=False, random_state=None):
    """Record hyperparameters for an implicit factorization-style model.

    The network and optimizer slots are initialized to None; presumably
    they are built later (e.g. in fit()) — confirm in the class body.
    """
    assert loss in ('pointwise', 'bpr', 'hinge', 'adaptive_hinge')

    # Use the caller's RandomState when given, otherwise create one.
    if random_state:
        rng = random_state
    else:
        rng = np.random.RandomState()

    self._loss = loss
    self._embedding_dim = embedding_dim
    self._n_iter = n_iter
    self._learning_rate = learning_rate
    self._batch_size = batch_size
    self._l2 = l2
    self._use_cuda = use_cuda
    self._sparse = sparse
    self._optimizer_func = optimizer_func
    self._random_state = rng

    # Lazily-initialized training state.
    self._num_users = None
    self._num_items = None
    self._net = None
    self._optimizer = None

    set_seed(rng.randint(-10**8, 10**8), cuda=use_cuda)
def build_sequence_model(hyperparameters, train, random_state):
    """Construct an ImplicitSequenceModel with an LSTM representation.

    NOTE(review): the ``random_state`` parameter is unused — the model is
    seeded with a fixed RandomState(42); confirm this is intentional.
    """
    params = hyperparameters
    set_seed(42, CUDA)

    embedding_dim = params['embedding_dim']

    # Bloom embeddings compress the item table when a ratio below 1 is given.
    if params['compression_ratio'] < 1.0:
        item_embeddings = BloomEmbedding(
            train.num_items,
            embedding_dim,
            compression_ratio=params['compression_ratio'],
            num_hash_functions=4,
            padding_idx=0)
    else:
        item_embeddings = ScaledEmbedding(
            train.num_items, embedding_dim, padding_idx=0)

    network = LSTMNet(
        train.num_items, embedding_dim, item_embedding_layer=item_embeddings)

    return ImplicitSequenceModel(
        loss=params['loss'],
        n_iter=params['n_iter'],
        batch_size=params['batch_size'],
        learning_rate=params['learning_rate'],
        embedding_dim=embedding_dim,
        l2=params['l2'],
        representation=network,
        use_cuda=CUDA,
        random_state=np.random.RandomState(42))
def __init__(self, loss='regression', embedding_dim=32, n_iter=10, batch_size=256, l2=0.0, learning_rate=1e-2, optimizer_func=None, use_cuda=False, sparse=False, random_state=None):
    """Store hyperparameters for an explicit-feedback model.

    Only 'regression' and 'poisson' losses are accepted by this class.
    """
    assert loss in ('regression', 'poisson')

    self._loss = loss
    self._embedding_dim = embedding_dim
    self._n_iter = n_iter
    self._learning_rate = learning_rate
    self._batch_size = batch_size
    self._l2 = l2
    self._use_cuda = use_cuda
    self._sparse = sparse
    self._optimizer_func = optimizer_func
    # Keep the caller-supplied RandomState, or make a fresh one.
    self._random_state = random_state if random_state else np.random.RandomState()

    # Populated later; initialized to None here.
    self._num_users = None
    self._num_items = None
    self._net = None
    self._optimizer = None

    derived_seed = self._random_state.randint(-10**8, 10**8)
    set_seed(derived_seed, cuda=self._use_cuda)
def __init__(self, loss='bce', embedding_dim=32, n_iter=10, batch_size=256, l2=0.0, learning_rate=1e-2, optimizer_func=None, use_cuda=False, representation=None, sparse=False, random_state=None, num_negative_samples=5, k_sample=-1, inputSample=-1):
    """Store hyperparameters and precompute within-batch index pairs.

    Besides recording the usual model/optimizer settings, this constructor
    builds ``self.combPos``: a float tensor of shape (C(batch_size, 2), 2)
    holding every unordered pair of in-batch indices (i, j) with i < j.
    NOTE(review): ``k_sample`` and ``inputSample`` are stored verbatim;
    their semantics are presumably defined in the sampling code — confirm
    against fit().
    """
    assert loss in ('pointwise', 'bpr', 'hinge', 'adaptive_hinge', 'bce')

    self._loss = loss
    self._embedding_dim = embedding_dim
    self._n_iter = n_iter
    self._learning_rate = learning_rate
    self._batch_size = batch_size
    self._l2 = l2
    self._use_cuda = use_cuda
    self._representation = representation
    self._sparse = sparse
    self._optimizer_func = optimizer_func
    self._random_state = random_state or np.random.RandomState()
    self._num_negative_samples = num_negative_samples

    # Populated later; initialized to None here.
    self._num_users = None
    self._num_items = None
    self._net = None
    self._optimizer = None
    self._loss_func = None

    self.k_sample = k_sample
    self.inputSample = inputSample

    # All index pairs (i, j), i < j, within a batch, in the same
    # lexicographic order the original nested Python loops produced, but
    # built in one vectorized call instead of O(batch_size^2) Python-level
    # appends followed by element-wise tensor writes. Cast to the default
    # dtype to match the original torch.zeros(...) float tensor.
    self.combPos = torch.combinations(
        torch.arange(self._batch_size), r=2).to(torch.get_default_dtype())

    set_seed(self._random_state.randint(-10**8, 10**8), cuda=self._use_cuda)
def __init__(self, loss='pointwise', embedding_dim=32, n_iter=10, batch_size=256, l2=0.0, learning_rate=1e-2, optimizer_func=None, use_cuda=False, representation=None, sparse=False, random_state=None, num_negative_samples=5, betas=(0.9, 0.999), log_loss_interval=500, log_eval_interval=5000, notify_loss_completion=None, notify_batch_eval_completion=None, notify_epoch_completion=None, amsgrad=False, adamw=None):
    """Store model, optimizer, and logging/callback configuration.

    The notify_* arguments are presumably callables invoked during
    training at loss/eval/epoch boundaries — confirm against fit().
    """
    assert loss in ('pointwise', 'bpr', 'hinge', 'adaptive_hinge', 'adaptive_bpr')

    # Core model hyperparameters.
    self._loss = loss
    self._embedding_dim = embedding_dim
    self._n_iter = n_iter
    self._learning_rate = learning_rate
    self._batch_size = batch_size
    self._l2 = l2
    self._num_negative_samples = num_negative_samples

    # Representation / backend options.
    self._use_cuda = use_cuda
    self._representation = representation
    self._sparse = sparse

    # Optimizer configuration (Adam-style knobs).
    self._optimizer_func = optimizer_func
    self._betas = betas
    self._amsgrad = amsgrad
    self._adamw = adamw

    # Logging / callback configuration.
    self._log_loss_interval = log_loss_interval
    self._log_eval_interval = log_eval_interval
    self._notify_loss_completion = notify_loss_completion
    self._notify_batch_eval_completion = notify_batch_eval_completion
    self._notify_epoch_completion = notify_epoch_completion

    # Populated later; initialized to None here.
    self._num_users = None
    self._num_items = None
    self._net = None
    self._optimizer = None
    self._loss_func = None

    self._random_state = random_state if random_state else np.random.RandomState()
    set_seed(self._random_state.randint(-10**8, 10**8), cuda=self._use_cuda)
def __init__(self, loss='pointwise', representation='pooling', embedding_dim=32, n_iter=10, batch_size=256, l2=0.0, learning_rate=1e-2, optimizer_func=None, use_cuda=False, sparse=False, random_state=None, num_negative_samples=5):
    """Configure an implicit sequence model.

    ``representation`` may be the name of a built-in architecture
    ('pooling', 'cnn', 'lstm') or, presumably, a ready-made module
    instance — non-string values are passed through unvalidated.
    """
    assert loss in {'pointwise', 'bpr', 'hinge', 'adaptive_hinge'}
    if isinstance(representation, str):
        # Only named built-in representations are validated here.
        assert representation in {'pooling', 'cnn', 'lstm'}

    self._loss = loss
    self._representation = representation
    self._embedding_dim = embedding_dim
    self._n_iter = n_iter
    self._learning_rate = learning_rate
    self._batch_size = batch_size
    self._l2 = l2
    self._use_cuda = use_cuda
    self._sparse = sparse
    self._optimizer_func = optimizer_func
    self._random_state = random_state if random_state else np.random.RandomState()
    self._num_negative_samples = num_negative_samples

    # Populated later; initialized to None here.
    self._num_items = None
    self._net = None
    self._optimizer = None
    self._loss_func = None

    set_seed(self._random_state.randint(-10**8, 10**8), cuda=self._use_cuda)
def __init__(self, loss='pointwise', embedding_dim=32, memory_dim=10, n_iter=10, batch_size=256, l2=0.0, learning_rate=1e-2, optimizer_func=None, use_cuda=False, representation=None, sparse=False, random_state=None, num_negative_samples=5, margin=None, cov_reg=None):
    """Configure a model with an additional memory component.

    ``margin`` and ``cov_reg`` are optional loss-shaping knobs (presumably
    a ranking margin and a covariance regularizer — confirm in the loss
    implementation).
    """
    allowed_losses = ('pointwise', 'bpr', 'hinge', 'adaptive_hinge', 'warp')
    assert loss in allowed_losses

    self._loss = loss
    self._embedding_dim = embedding_dim
    self._memory_dim = memory_dim
    self._n_iter = n_iter
    self._learning_rate = learning_rate
    self._batch_size = batch_size
    self._l2 = l2
    self._use_cuda = use_cuda
    self._representation = representation
    self._sparse = sparse
    self._optimizer_func = optimizer_func
    self._random_state = random_state if random_state else np.random.RandomState()
    self._num_negative_samples = num_negative_samples
    self._margin = margin
    self._cov_reg = cov_reg

    # Populated later; initialized to None here.
    self._num_users = None
    self._num_items = None
    self._net = None
    self._optimizer = None
    self._loss_func = None

    set_seed(self._random_state.randint(-10**8, 10**8), cuda=self._use_cuda)
def __init__(self, loss='pointwise', representation='pooling', embedding_dim=32, n_iter=10, batch_size=256, l2=0.0, learning_rate=1e-2, optimizer_func=None, use_cuda=False, sparse=False, random_state=None):
    """Configure a sequence model with a named or custom representation."""
    assert loss in ('pointwise', 'bpr', 'hinge', 'adaptive_hinge')
    if isinstance(representation, str):
        # String representations must name a built-in architecture.
        assert representation in ('pooling', 'cnn', 'lstm')

    rng = random_state if random_state else np.random.RandomState()

    self._loss = loss
    self._representation = representation
    self._embedding_dim = embedding_dim
    self._n_iter = n_iter
    self._learning_rate = learning_rate
    self._batch_size = batch_size
    self._l2 = l2
    self._use_cuda = use_cuda
    self._sparse = sparse
    self._optimizer_func = optimizer_func
    self._random_state = rng

    # Populated later; initialized to None here.
    self._num_items = None
    self._net = None
    self._optimizer = None

    set_seed(rng.randint(-10**8, 10**8), cuda=use_cuda)
def train_model(df, hyperparams):
    """Fit an ImplicitSequenceModel on an interactions dataframe.

    ``df`` must contain 'user_id', 'item_id' and 'entry_at' columns.
    Prints the test-set MRR and returns the fitted model.
    """
    # Fix random_state
    seed = 42
    set_seed(seed)
    random_state = np.random.RandomState(seed)

    max_sequence_length, min_sequence_length, step_size = 15, 2, 1

    # create dataset using interactions dataframe and timestamps
    dataset = Interactions(
        user_ids=np.array(df['user_id'], dtype='int32'),
        item_ids=np.array(df['item_id'], dtype='int32'),
        timestamps=df['entry_at'])

    # create training and test sets using a 80/20 split
    train, test = user_based_train_test_split(
        dataset, test_percentage=0.2, random_state=random_state)

    # convert both splits to sequences with identical windowing
    seq_kwargs = dict(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)
    train = train.to_sequence(**seq_kwargs)
    test = test.to_sequence(**seq_kwargs)

    print('data: {}'.format(train))

    # initialize and train model
    model = ImplicitSequenceModel(**hyperparams, use_cuda=CUDA, random_state=random_state)
    model.fit(train, verbose=True)

    # compute mrr score on test set
    test_mrr = sequence_mrr_score(model, test).mean()
    print('MRR score on test set: {}'.format(test_mrr))

    return model
def main(max_evals):
    """Run hyperparameter optimization over the interaction log at FILE_PATH.

    Loads the CSV, cleans and re-indexes it, maps raw subscriber/block ids
    to contiguous integer ids, builds train/valid/test sequence datasets,
    and runs up to ``max_evals`` hyperopt evaluations.

    Returns the hyperopt trials object.
    """
    status = 'available' if CUDA else 'not available'
    print("CUDA is {}!".format(status))

    # Fix random_state
    seed = 42
    set_seed(seed)
    random_state = np.random.RandomState(seed)

    max_sequence_length = 15
    min_sequence_length = 2
    step_size = 1

    df = pd.read_csv(FILE_PATH)
    # Drop export artifacts when present.
    # NOTE(review): the presence of 'time_of_day' / 'Unnamed: 0' is used as
    # a proxy for the presence of the sibling columns dropped alongside them
    # — confirm all three/both always co-occur in the export.
    if 'time_of_day' in df.columns:
        df = df.drop(columns=['time_of_day', 'time_of_year', 'is_content_block'])
    if 'Unnamed: 0' in df.columns:
        df = df.drop(columns=['Unnamed: 0', 'js_key'])

    sub_col = 'subscriber_id'
    block_col = 'ddi_id'
    time_col = 'entry_at'

    # preprocess dataframe: chronological order with a clean index.
    # drop=True discards the old index instead of materializing it as an
    # 'index' column that then had to be dropped separately.
    df[time_col] = pd.to_datetime(df[time_col])
    df.sort_values(by=time_col, inplace=True)
    df.reset_index(drop=True, inplace=True)

    # create idx mapping compatible with spotlight, map users and items
    # (item ids start at 1, leaving 0 free — presumably for padding)
    sub_mapping = {k: v for v, k in enumerate(df[sub_col].unique())}
    block_mapping = {k: v for v, k in enumerate(df[block_col].unique(), 1)}
    df['user_id'] = df[sub_col].map(sub_mapping)
    df['item_id'] = df[block_col].map(block_mapping)

    # create dataset using interactions and timestamps
    dataset = Interactions(
        user_ids=np.array(df['user_id'], dtype='int32'),
        item_ids=np.array(df['item_id'], dtype='int32'),
        timestamps=df[time_col])

    # create training, validation and test sets using a 80/10/10 split
    train, rest = user_based_train_test_split(
        dataset, test_percentage=0.2, random_state=random_state)
    test, valid = user_based_train_test_split(
        rest, test_percentage=0.5, random_state=random_state)

    # convert to sequences
    train = train.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)
    test = test.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)
    valid = valid.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)

    print('data: {}'.format(train))

    dtime = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    fname = './experiment_{}.pickle'.format(dtime)

    objective = get_objective(train, valid, test, random_state)
    space = hyperparameter_space()
    trials = optimize(objective, space, trials_fname=fname, max_evals=max_evals)
    summarize_trials(trials)

    return trials