import time
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from torch.autograd import Variable

# GRU, Optimizer, LossFunction, SessionDataLoader and the evaluation module E
# are this repository's own components (the docstring below refers to
# modules.layer.GRU); import them from wherever they live in the project.


class GRU4REC:
    def __init__(self, input_size, hidden_size, output_size, num_layers=1,
                 optimizer_type='Adagrad', lr=.01, weight_decay=0,
                 momentum=0, eps=1e-6, loss_type='TOP1',
                 clip_grad=-1, dropout_input=.0, dropout_hidden=.5,
                 batch_size=50, use_cuda=True, time_sort=False, pretrained=None):
        """ The GRU4REC model

        Args:
            input_size (int): dimension of the gru input variables
            hidden_size (int): dimension of the gru hidden units
            output_size (int): dimension of the gru output variables
            num_layers (int): the number of layers in the GRU
            optimizer_type (str): optimizer type for GRU weights
            lr (float): learning rate for the optimizer
            weight_decay (float): weight decay for the optimizer
            momentum (float): momentum for the optimizer
            eps (float): eps for the optimizer
            loss_type (str): type of the loss function to use
            clip_grad (float): clip the gradient norm at clip_grad. No clipping if clip_grad = -1
            dropout_input (float): dropout probability for the input layer
            dropout_hidden (float): dropout probability for the hidden layer
            batch_size (int): mini-batch size
            use_cuda (bool): whether you want to use cuda or not
            time_sort (bool): whether to ensure that the order of sessions is chronological (default: False)
            pretrained (modules.layer.GRU): pretrained GRU layer, if it exists (default: None)
        """
        # Initialize the GRU layer
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.use_cuda = use_cuda
        if pretrained is None:
            self.gru = GRU(input_size, hidden_size, output_size, num_layers,
                           dropout_input=dropout_input,
                           dropout_hidden=dropout_hidden,
                           use_cuda=use_cuda,
                           batch_size=batch_size)
        else:
            self.gru = pretrained

        # Initialize the optimizer
        self.optimizer_type = optimizer_type
        self.weight_decay = weight_decay
        self.momentum = momentum
        self.lr = lr
        self.eps = eps
        self.optimizer = Optimizer(self.gru.parameters(),
                                   optimizer_type=optimizer_type,
                                   lr=lr,
                                   weight_decay=weight_decay,
                                   momentum=momentum,
                                   eps=eps)

        # Initialize the loss function
        self.loss_type = loss_type
        self.loss_fn = LossFunction(loss_type, use_cuda)

        # gradient clipping (optional)
        self.clip_grad = clip_grad

        # etc
        self.time_sort = time_sort

    def train(self, n_epochs=10, save_dir='./models', model_name='GRU4REC'):
        """ Train the GRU4REC model on the dataframes registered via `init_data`,
        running several training epochs and storing the intermediate models in the
        user-specified directory.

        Args:
            n_epochs (int): the number of training epochs to run
            save_dir (str): the path to save the intermediate trained models
            model_name (str): name of the model
        """
        print(f'Model Name:{model_name}')
        # Time the training process
        start_time = time.time()
        for epoch in range(n_epochs):
            loss = self.run_epoch()
            end_time = time.time()
            wall_clock = (end_time - start_time) / 60
            print(f'Epoch:{epoch+1:2d}/Loss:{loss:0.3f}/TrainingTime:{wall_clock:0.3f}(min)')
            start_time = time.time()

            # Store the intermediate model
            save_dir = Path(save_dir)
            if not save_dir.exists():
                save_dir.mkdir()
            model_fname = f'{model_name}_{self.loss_type}_{self.optimizer_type}_{self.lr}_epoch{epoch+1:d}'
            torch.save(self.gru.state_dict(), save_dir / model_fname)

    def run_epoch(self):
        """ Run a single training epoch """
        # initialize
        mb_losses = []
        optimizer = self.optimizer
        hidden = self.gru.init_hidden().data

        # Start the training loop
        loader = SessionDataLoader(df=self.df_train,
                                   hidden=hidden,
                                   session_key=self.session_key,
                                   item_key=self.item_key,
                                   time_key=self.time_key,
                                   batch_size=self.batch_size,
                                   training=self.gru.training,
                                   time_sort=self.time_sort)

        for input, target, hidden in loader.generate_batch():
            if self.use_cuda:
                input = input.cuda()
                target = target.cuda()
            # Embed the input
            embedded = self.gru.emb(input)
            # Go through the GRU layer
            logit, hidden = self.gru(embedded, target, hidden)
            ######################## IMPORTANT #########################
            # update the hidden state for the dataloader from the outside
            #############################################################
            loader.update_hidden(hidden.data)
            # Calculate the mini-batch loss
            mb_loss = self.loss_fn(logit)
            mb_losses.append(mb_loss.data[0])
            # flush the gradient before backprop
            optimizer.zero_grad()
            # Backprop
            mb_loss.backward()
            # Gradient clipping (optional)
            if self.clip_grad != -1:
                for p in self.gru.parameters():
                    p.grad.data.clamp_(max=self.clip_grad)
            # Mini-batch GD
            optimizer.step()

        avg_epoch_loss = np.mean(mb_losses)

        return avg_epoch_loss

    def test(self, k=20, batch_size=50):
        """ Model evaluation

        Args:
            k (int): the length of the recommendation list
            batch_size (int): testing batch size

        Returns:
            avg_recall: mean of the Recall@K over the session-parallel mini-batches
            avg_mrr: mean of the MRR@K over the session-parallel mini-batches
        """
        # set the gru layer into inference mode
        if self.gru.training:
            self.gru.switch_mode()

        recalls = []
        mrrs = []
        hidden = self.gru.init_hidden().data

        # Start the testing loop
        loader = SessionDataLoader(df=self.df_test,
                                   hidden=hidden,
                                   session_key=self.session_key,
                                   item_key=self.item_key,
                                   time_key=self.time_key,
                                   batch_size=batch_size,
                                   training=self.gru.training,
                                   time_sort=self.time_sort)

        for input, target, hidden in loader.generate_batch():
            if self.use_cuda:
                input = input.cuda()
                target = target.cuda()
            # Embed the input
            embedded = self.gru.emb(input, volatile=True)
            # forward propagation
            logit, hidden = self.gru(embedded, target, hidden)
            # update the hidden state for the dataloader
            loader.update_hidden(hidden.data)
            # Evaluate the results
            recall, mrr = E.evaluate(logit, target, k)
            recalls.append(recall)
            mrrs.append(mrr)

        avg_recall = np.mean(recalls)
        avg_mrr = np.mean(mrrs)

        # reset the gru back to training mode
        self.gru.switch_mode()

        return avg_recall, avg_mrr

    def init_data(self, df_train, df_test, session_key, time_key, item_key):
        """ Initialize the training & test data. The training/test sets and the
        session/time/item keys are stored for later reuse.

        Args:
            df_train (pd.DataFrame): training set, required to retrieve the training item indices
            df_test (pd.DataFrame): test set
            session_key (str): session ID
            time_key (str): time ID
            item_key (str): item ID
        """
        # Specify the identifiers
        self.session_key = session_key
        self.time_key = time_key
        self.item_key = item_key

        # Initialize the dataframes into adequate forms
        self.df_train = self.init_df(df_train, session_key, time_key, item_key)
        self.df_test = self.init_df(df_test, session_key, time_key, item_key,
                                    item_ids=df_train[item_key].unique())

    @staticmethod
    def init_df(df, session_key, time_key, item_key, item_ids=None):
        """ Initialize the dataframe. Involves the following steps:
            1) Add new item indices to the dataframe
            2) Sort the dataframe

        Args:
            df (pd.DataFrame): dataframe to initialize
            session_key: session identifier
            time_key: timestamp
            item_key: item identifier
            item_ids: unique item ids. Should be `None` if the df is a training set, and should
                include the ids of the items in the training set if the df is a test set.
        """
        # add an item index column named "item_idx" to the df
        if item_ids is None:
            item_ids = df[item_key].unique()  # unique item ids
        item2idx = pd.Series(data=np.arange(len(item_ids)), index=item_ids)
        df = pd.merge(df,
                      pd.DataFrame({item_key: item_ids,
                                    'item_idx': item2idx[item_ids].values}),
                      on=item_key,
                      how='inner')
        # Sort the df by session ID first and by timestamp within each session, so that the
        # clicks of a session sit next to each other and are time-ordered.
        df.sort_values([session_key, time_key], inplace=True)

        return df
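
# ----------------------------------------------------------------------------
# A minimal usage sketch for the variant above (illustrative only, not part of
# the original module). The CSV paths, the column names 'SessionId',
# 'Timestamp', 'ItemId', and the layer sizes are assumptions; substitute the
# values that match your own dataset. Kept as comments so it does not clash
# with the second GRU4REC variant defined below.
#
#     df_train = pd.read_csv('./data/train.csv')
#     df_test = pd.read_csv('./data/test.csv')
#     n_items = df_train['ItemId'].nunique()
#
#     model = GRU4REC(input_size=n_items, hidden_size=100, output_size=n_items,
#                     loss_type='TOP1', optimizer_type='Adagrad', lr=.01,
#                     batch_size=50, use_cuda=True)
#     # this variant requires the data to be registered before training
#     model.init_data(df_train, df_test,
#                     session_key='SessionId', time_key='Timestamp', item_key='ItemId')
#     model.train(n_epochs=10, save_dir='./models', model_name='GRU4REC')
#     recall, mrr = model.test(k=20, batch_size=50)
# ----------------------------------------------------------------------------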

# ============================================================================
# A second variant of GRU4REC: instead of delegating the session-parallel
# mini-batching to SessionDataLoader, it builds the mini-batches inline in
# run_epoch() and test() from click offsets and ordered session indices.
# ============================================================================


class GRU4REC:
    def __init__(self, input_size, hidden_size, output_size, num_layers=1,
                 optimizer_type='Adagrad', lr=.05, weight_decay=0,
                 momentum=0, eps=1e-6, loss_type='TOP1',
                 clip_grad=-1, dropout_input=.0, dropout_hidden=.5,
                 batch_size=50, use_cuda=True, time_sort=False, pretrained=None):
        """ The GRU4REC model

        Args:
            input_size (int): dimension of the gru input variables
            hidden_size (int): dimension of the gru hidden units
            output_size (int): dimension of the gru output variables
            num_layers (int): the number of layers in the GRU
            optimizer_type (str): optimizer type for GRU weights
            lr (float): learning rate for the optimizer
            weight_decay (float): weight decay for the optimizer
            momentum (float): momentum for the optimizer
            eps (float): eps for the optimizer
            loss_type (str): type of the loss function to use
            clip_grad (float): clip the gradient norm at clip_grad. No clipping if clip_grad = -1
            dropout_input (float): dropout probability for the input layer
            dropout_hidden (float): dropout probability for the hidden layer
            batch_size (int): mini-batch size
            use_cuda (bool): whether you want to use cuda or not
            time_sort (bool): whether to ensure that the order of sessions is chronological (default: False)
            pretrained (modules.layer.GRU): pretrained GRU layer, if it exists (default: None)
        """
        # Initialize the GRU layer
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.use_cuda = use_cuda
        if pretrained is None:
            self.gru = GRU(input_size, hidden_size, output_size, num_layers,
                           dropout_input=dropout_input,
                           dropout_hidden=dropout_hidden,
                           use_cuda=use_cuda,
                           batch_size=batch_size)
        else:
            self.gru = pretrained

        # Initialize the optimizer
        self.optimizer_type = optimizer_type
        self.weight_decay = weight_decay
        self.momentum = momentum
        self.lr = lr
        self.eps = eps
        self.optimizer = Optimizer(self.gru.parameters(),
                                   optimizer_type=optimizer_type,
                                   lr=lr,
                                   weight_decay=weight_decay,
                                   momentum=momentum,
                                   eps=eps)

        # Initialize the loss function
        self.loss_type = loss_type
        self.loss_fn = LossFunction(loss_type, use_cuda)

        # gradient clipping (optional)
        self.clip_grad = clip_grad

        # etc
        self.time_sort = time_sort

    def train(self, df, session_key, time_key, item_key,
              n_epochs=10, save_dir='./models', model_name='GRU4REC'):
        """ Train the GRU4REC model on a pandas dataframe for several training epochs,
        and store the intermediate models to the user-specified directory.

        Args:
            df (pd.DataFrame): training dataset
            session_key (str): session ID
            time_key (str): time ID
            item_key (str): item ID
            n_epochs (int): the number of training epochs to run
            save_dir (str): the path to save the intermediate trained models
            model_name (str): name of the model
        """
        df, click_offsets, session_idx_arr = GRU4REC.init_data(df, session_key, time_key, item_key,
                                                               time_sort=self.time_sort)
        # Time the training process
        start_time = time.time()
        for epoch in range(n_epochs):
            loss = self.run_epoch(df, click_offsets, session_idx_arr)
            end_time = time.time()
            wall_clock = (end_time - start_time) / 60
            print(f'Epoch:{epoch+1:2d}/Loss:{loss:0.3f}/TrainingTime:{wall_clock:0.3f}(min)')
            start_time = time.time()

            # Store the intermediate model
            save_dir = Path(save_dir)
            if not save_dir.exists():
                save_dir.mkdir()
            model_fname = f'{model_name}_{self.loss_type}_{self.optimizer_type}_{self.lr}_epoch{epoch+1:d}'
            torch.save(self.gru.state_dict(), save_dir / model_fname)

    def run_epoch(self, df, click_offsets, session_idx_arr):
        """ Run a single training epoch """
        mb_losses = []
        # initializations
        iters = np.arange(self.batch_size)
        maxiter = iters.max()
        start = click_offsets[session_idx_arr[iters]]
        end = click_offsets[session_idx_arr[iters] + 1]
        # initialize the hidden state
        hidden = self.gru.init_hidden().data
        optimizer = self.optimizer

        # Start the training loop
        finished = False
        while not finished:
            minlen = (end - start).min()
            # Item indices (for embedding) of the first clicks of the sessions in the mini-batch
            idx_target = df.item_idx.values[start]
            for i in range(minlen - 1):
                # Build inputs, targets, and hidden states
                idx_input = idx_target
                idx_target = df.item_idx.values[start + i + 1]
                input = torch.LongTensor(idx_input)  # (B,) at first, input is a plain Tensor
                target = Variable(torch.LongTensor(idx_target))  # (B,)
                if self.use_cuda:
                    input = input.cuda()
                    target = target.cuda()
                # Now, convert into an embedded Variable
                embedded = self.gru.emb(input)
                hidden = Variable(hidden)
                # Go through the GRU layer
                logit, hidden = self.gru(embedded, target, hidden)
                # Calculate the mini-batch loss
                mb_loss = self.loss_fn(logit)
                mb_losses.append(mb_loss.data[0])
                # flush the gradient before backprop
                optimizer.zero_grad()
                # Backprop
                mb_loss.backward()
                # Gradient clipping (optional)
                if self.clip_grad != -1:
                    for p in self.gru.parameters():
                        p.grad.data.clamp_(max=self.clip_grad)
                # Mini-batch GD
                optimizer.step()
                # Detach the hidden state for later reuse
                hidden = hidden.data

            # advance to the click where the shortest session reaches its second-to-last element
            start = start + (minlen - 1)
            # figure out which sessions should terminate
            mask = np.arange(len(iters))[(end - start) <= 1]
            for idx in mask:
                maxiter += 1
                if maxiter >= len(click_offsets) - 1:
                    finished = True
                    break
                # update the next starting/ending point
                iters[idx] = maxiter
                start[idx] = click_offsets[session_idx_arr[maxiter]]
                end[idx] = click_offsets[session_idx_arr[maxiter] + 1]
            # reset the rnn hidden state to zero after a session transition
            if len(mask) != 0:
                hidden[:, mask, :] = 0

        avg_epoch_loss = np.mean(mb_losses)

        return avg_epoch_loss

    def predict(self, input, target, hidden):
        """ Forward propagation for testing

        Args:
            input (B,): torch.LongTensor storing the item indices, converted into embeddings inside
            target (B,): a Variable that stores the indices for the next items
            hidden: previous hidden state

        Returns:
            logits (B,C): logits for the next items
            hidden: next hidden state
        """
        # convert the item indices into embeddings
        embedded = self.gru.emb(input, volatile=True)
        hidden = Variable(hidden, volatile=True)
        # forward propagation
        logits, hidden = self.gru(embedded, target, hidden)

        return logits, hidden

    def test(self, df_train, df_test, session_key, time_key, item_key, k=20, batch_size=50):
        """ Model evaluation

        Args:
            df_train (pd.DataFrame): training set, required to retrieve the training item indices
            df_test (pd.DataFrame): test set
            session_key (str): session ID
            time_key (str): time ID
            item_key (str): item ID
            k (int): the length of the recommendation list
            batch_size (int): testing batch size

        Returns:
            avg_recall: mean of the Recall@K over the session-parallel mini-batches
            avg_mrr: mean of the MRR@K over the session-parallel mini-batches
        """
        # set the gru layer into inference mode
        if self.gru.training:
            self.gru.switch_mode()

        recalls = []
        mrrs = []

        # initializations
        # Build item2idx from the training data so that test items map to the training indices
        iids = df_train[item_key].unique()  # unique item ids
        item2idx = pd.Series(data=np.arange(len(iids)), index=iids)
        df_test = pd.merge(df_test,
                           pd.DataFrame({item_key: iids,
                                         'item_idx': item2idx[iids].values}),
                           on=item_key,
                           how='inner')
        # Sort the df by session ID first and by timestamp within each session
        df_test.sort_values([session_key, time_key], inplace=True)
        # Offsets of the first click of each session
        click_offsets = GRU4REC.get_click_offsets(df_test, session_key)
        session_idx_arr = GRU4REC.order_session_idx(df_test, session_key, time_key,
                                                    time_sort=self.time_sort)

        iters = np.arange(batch_size)
        maxiter = iters.max()
        start = click_offsets[session_idx_arr[iters]]
        end = click_offsets[session_idx_arr[iters] + 1]
        hidden = self.gru.init_hidden().data

        # Start the testing loop
        finished = False
        while not finished:
            minlen = (end - start).min()
            # Item indices (for embedding) of the first clicks of the sessions in the mini-batch
            idx_target = df_test.item_idx.values[start]
            for i in range(minlen - 1):
                # Build inputs, targets, and hidden states
                idx_input = idx_target
                idx_target = df_test.item_idx.values[start + i + 1]
                input = torch.LongTensor(idx_input)  # (B,) at first, input is a plain Tensor
                target = Variable(torch.LongTensor(idx_target), volatile=True)  # (B,)
                if self.use_cuda:
                    input = input.cuda()
                    target = target.cuda()
                logit, hidden = self.predict(input, target, hidden)
                recall, mrr = evaluate(logit, target, k)
                recalls.append(recall)
                mrrs.append(mrr)
                # Detach the hidden state for later reuse
                hidden = hidden.data

            # advance to the click where the shortest session reaches its second-to-last element
            start = start + (minlen - 1)
            # figure out which sessions should terminate
            mask = np.arange(len(iters))[(end - start) <= 1]
            for idx in mask:
                maxiter += 1
                if maxiter >= len(click_offsets) - 1:
                    finished = True
                    break
                # update the next starting/ending point
                iters[idx] = maxiter
                start[idx] = click_offsets[session_idx_arr[maxiter]]
                end[idx] = click_offsets[session_idx_arr[maxiter] + 1]
            # reset the rnn hidden state to zero after a session transition
            if len(mask) != 0:
                hidden[:, mask, :] = 0

        avg_recall = np.mean(recalls)
        avg_mrr = np.mean(mrrs)

        # reset the gru back to training mode
        self.gru.switch_mode()

        return avg_recall, avg_mrr

    @staticmethod
    def init_data(df, session_key, time_key, item_key, time_sort):
        """ Initialize the data. """
        # add item indices to the dataframe
        df = GRU4REC.add_item_indices(df, item_key)
        # Sort the df by session ID first and by timestamp within each session, so that the
        # clicks of a session sit next to each other and are time-ordered.
        df.sort_values([session_key, time_key], inplace=True)
        click_offsets = GRU4REC.get_click_offsets(df, session_key)
        session_idx_arr = GRU4REC.order_session_idx(df, session_key, time_key, time_sort=time_sort)

        return df, click_offsets, session_idx_arr

    @staticmethod
    def add_item_indices(df, item_key):
        """ Add an item index column named "item_idx" to the df.

        Args:
            df: pd.DataFrame to add the item indices to

        Returns:
            df: copy of the original df with item indices
        """
        iids = df[item_key].unique()  # unique item ids
        item2idx = pd.Series(data=np.arange(len(iids)), index=iids)
        df = pd.merge(df,
                      pd.DataFrame({item_key: iids,
                                    'item_idx': item2idx[iids].values}),
                      on=item_key,
                      how='inner')

        return df

    @staticmethod
    def get_click_offsets(df, session_key):
        """ Return the offset of the first click of each session ID,
        where the offset is calculated against the first click of the first session ID.
        """
        offsets = np.zeros(df[session_key].nunique() + 1, dtype=np.int32)
        # group the df by session_key and get the cumulative session sizes as offsets
        offsets[1:] = df.groupby(session_key).size().cumsum()

        return offsets

    @staticmethod
    def order_session_idx(df, session_key, time_key, time_sort=False):
        """ Order the session indices """
        if time_sort:
            # starting time of each session, sorted by session ID
            sessions_start_time = df.groupby(session_key)[time_key].min().values
            # order the session indices by session starting times
            session_idx_arr = np.argsort(sessions_start_time)
        else:
            session_idx_arr = np.arange(df[session_key].nunique())

        return session_idx_arr