import argparse
from pathlib import Path

import pandas as pd
import torch

# Project-local imports (module paths assumed from the repo layout)
from modules.layer import GRU
from modules.model import GRU4REC


def main():
    parser = argparse.ArgumentParser()
    # Model filename
    parser.add_argument('model_file', type=str)
    # Size of the recommendation list
    parser.add_argument('--k', default=20, type=int)
    # parse the nn arguments
    parser.add_argument('--hidden_size', default=100, type=int)
    parser.add_argument('--num_layers', default=1, type=int)
    parser.add_argument('--batch_size', default=50, type=int)
    parser.add_argument('--dropout_input', default=0, type=float)
    parser.add_argument('--dropout_hidden', default=.5, type=float)
    # parse the optimizer arguments
    parser.add_argument('--optimizer_type', default='Adagrad', type=str)
    parser.add_argument('--lr', default=.01, type=float)
    parser.add_argument('--weight_decay', default=0, type=float)
    parser.add_argument('--momentum', default=0, type=float)
    parser.add_argument('--eps', default=1e-6, type=float)
    # parse the loss type
    parser.add_argument('--loss_type', default='TOP1', type=str)
    # etc
    parser.add_argument('--n_epochs', default=10, type=int)
    # note: argparse's type=bool treats any non-empty string as True, so "--time_sort False" still enables it
    parser.add_argument('--time_sort', default=False, type=bool)
    parser.add_argument('--n_samples', default=-1, type=int)
    # Get the arguments
    args = parser.parse_args()

    PATH_DATA = Path('./data')
    PATH_MODEL = Path('./models')
    train = 'train.tsv'
    test = 'test.tsv'
    PATH_TRAIN = PATH_DATA / train
    PATH_TEST = PATH_DATA / test

    df_train = pd.read_csv(PATH_TRAIN, sep='\t', names=['SessionId', 'ItemId', 'TimeStamp'])
    df_test = pd.read_csv(PATH_TEST, sep='\t', names=['SessionId', 'ItemId', 'TimeStamp'])

    # sampling, if needed
    n_samples = args.n_samples
    if n_samples != -1:
        df_train = df_train[:n_samples]
        df_test = df_test[:n_samples]

    session_key = 'SessionId'
    item_key = 'ItemId'
    time_key = 'TimeStamp'

    use_cuda = True
    input_size = df_train[item_key].nunique()
    hidden_size = args.hidden_size
    num_layers = args.num_layers
    output_size = input_size
    batch_size = args.batch_size
    dropout_input = args.dropout_input
    dropout_hidden = args.dropout_hidden

    loss_type = args.loss_type

    optimizer_type = args.optimizer_type
    lr = args.lr
    weight_decay = args.weight_decay
    momentum = args.momentum
    eps = args.eps

    n_epochs = args.n_epochs
    time_sort = args.time_sort

    MODEL_FILE = PATH_MODEL / args.model_file

    gru = GRU(input_size, hidden_size, output_size,
              num_layers=num_layers,
              dropout_input=dropout_input,
              dropout_hidden=dropout_hidden,
              batch_size=batch_size,
              use_cuda=use_cuda)
    gru.load_state_dict(torch.load(MODEL_FILE))

    model = GRU4REC(input_size, hidden_size, output_size,
                    num_layers=num_layers,
                    dropout_input=dropout_input,
                    dropout_hidden=dropout_hidden,
                    batch_size=batch_size,
                    use_cuda=use_cuda,
                    loss_type=loss_type,
                    optimizer_type=optimizer_type,
                    lr=lr,
                    momentum=momentum,
                    time_sort=time_sort,
                    pretrained=gru)

    model.init_data(df_train, df_test,
                    session_key=session_key,
                    time_key=time_key,
                    item_key=item_key)

    k = args.k
    recall, mrr = model.test(k=k, batch_size=batch_size)
    result = f'Recall@{k}:{recall:.7f},MRR@{k}:{mrr:.7f}'
    print(result)
class GRU4REC:
    def __init__(self, input_size, hidden_size, output_size, num_layers=1,
                 optimizer_type='Adagrad', lr=.01, weight_decay=0,
                 momentum=0, eps=1e-6, loss_type='TOP1',
                 clip_grad=-1, p_dropout_input=.0, p_dropout_hidden=.5,
                 batch_size=50, use_cuda=True, time_sort=False, pretrained=None):
        """ The GRU4REC model

        Args:
            input_size (int): dimension of the gru input variables
            hidden_size (int): dimension of the gru hidden units
            output_size (int): dimension of the gru output variables
            num_layers (int): the number of layers in the GRU
            optimizer_type (str): optimizer type for GRU weights
            lr (float): learning rate for the optimizer
            weight_decay (float): weight decay for the optimizer
            momentum (float): momentum for the optimizer
            eps (float): eps for the optimizer
            loss_type (str): type of the loss function to use
            clip_grad (float): clip the gradient norm at clip_grad. No clipping if clip_grad = -1
            p_dropout_input (float): dropout probability for the input layer
            p_dropout_hidden (float): dropout probability for the hidden layer
            batch_size (int): mini-batch size
            use_cuda (bool): whether you want to use cuda or not
            time_sort (bool): whether to ensure that the order of sessions is chronological (default: False)
            pretrained (modules.layer.GRU): pretrained GRU layer, if it exists (default: None)
        """
        # Initialize the GRU Layer
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.use_cuda = use_cuda
        self.device = torch.device('cuda' if use_cuda else 'cpu')
        if pretrained is None:
            self.gru = GRU(input_size, hidden_size, output_size, num_layers,
                           p_dropout_input=p_dropout_input,
                           p_dropout_hidden=p_dropout_hidden,
                           batch_size=batch_size,
                           use_cuda=use_cuda)
        else:
            self.gru = pretrained

        # Initialize the optimizer
        self.optimizer_type = optimizer_type
        self.weight_decay = weight_decay
        self.momentum = momentum
        self.lr = lr
        self.eps = eps
        self.optimizer = Optimizer(self.gru.parameters(),
                                   optimizer_type=optimizer_type,
                                   lr=lr,
                                   weight_decay=weight_decay,
                                   momentum=momentum,
                                   eps=eps)

        # Initialize the loss function
        self.loss_type = loss_type
        self.loss_fn = LossFunction(loss_type)

        # gradient clipping (optional)
        self.clip_grad = clip_grad

        # etc
        self.time_sort = time_sort

    def run_epoch(self, dataset, k=20, training=True):
        """ Run a single training epoch """
        start_time = time.time()

        # initialize
        losses = []
        recalls = []
        mrrs = []
        optimizer = self.optimizer
        hidden = self.gru.init_hidden()
        if not training:
            self.gru.eval()
        device = self.device

        def reset_hidden(hidden, mask):
            """Helper function that resets hidden state when some sessions terminate"""
            if len(mask) != 0:
                hidden[:, mask, :] = 0
            return hidden

        # Start the training loop
        loader = SessionDataLoader(dataset, batch_size=self.batch_size)
        for input, target, mask in loader:
            input = input.to(device)
            target = target.to(device)
            # reset the hidden states if some sessions have just terminated
            hidden = reset_hidden(hidden, mask).detach()
            # Go through the GRU layer
            logit, hidden = self.gru(input, target, hidden)
            # Output sampling
            logit_sampled = logit[:, target.view(-1)]
            # Calculate the mini-batch loss
            loss = self.loss_fn(logit_sampled)
            with torch.no_grad():
                recall, mrr = E.evaluate(logit, target, k)
            losses.append(loss.item())
            recalls.append(recall)
            mrrs.append(mrr)
            # Mini-batch GD
            if training:
                # Backprop
                loss.backward()
                # Gradient Clipping (optional): clamp after backprop so the freshly
                # computed gradients are the ones being clipped
                if self.clip_grad != -1:
                    for p in self.gru.parameters():
                        p.grad.data.clamp_(max=self.clip_grad)
                optimizer.step()
                optimizer.zero_grad()  # flush the gradient after the optimization

        results = dict()
        results['loss'] = np.mean(losses)
        results['recall'] = np.mean(recalls)
        results['mrr'] = np.mean(mrrs)
        end_time = time.time()
        results['time'] = (end_time - start_time) / 60
        if not training:
            self.gru.train()

        return results

    def train(self, dataset, k=20, n_epochs=10, save_dir='./models', save=True, model_name='GRU4REC'):
        """
        Train the GRU4REC model on a pandas dataframe for several training epochs,
        and store the intermediate models to the user-specified directory.

        Args:
            n_epochs (int): the number of training epochs to run
            save_dir (str): the path to save the intermediate trained models
            model_name (str): name of the model
        """
        print(f'Training {model_name}...')
        # Run the first epoch and keep its recall as the early-stopping baseline.
        # (The comprehension uses `key`/`val` so the top-k parameter `k` is not shadowed.)
        results = self.run_epoch(dataset, k=k, training=True)
        results = [f'{key}:{val:.3f}' for key, val in results.items()]
        best_recall = float(results[1].split(':')[1])
        print(f'epoch:{1:2d}/{"/".join(results)}')
        epoch = -1
        for epoch in range(n_epochs - 1):
            results = self.run_epoch(dataset, k=k, training=True)
            results = [f'{key}:{val:.3f}' for key, val in results.items()]
            recall = float(results[1].split(':')[1])
            if recall > best_recall:
                best_recall = recall
                print(f'epoch:{epoch+2:2d}/{"/".join(results)}')
            else:
                print("The end of training~")
                break
        # Store the intermediate model
        if save:
            save_dir = Path(save_dir)
            if not save_dir.exists():
                save_dir.mkdir()
            model_fname = f'{model_name}_{self.loss_type}_{self.optimizer_type}_{self.lr}_epoch{epoch+2:d}'
            torch.save(self.gru.state_dict(), save_dir / model_fname)

    def test(self, dataset, k=20):
        """ Model evaluation

        Args:
            k (int): the length of the recommendation list

        Returns:
            avg_loss: mean of the losses over the session-parallel mini-batches
            avg_recall: mean of the Recall@K over the session-parallel mini-batches
            avg_mrr: mean of the MRR@K over the session-parallel mini-batches
            wall_clock: time taken for testing
        """
        results = self.run_epoch(dataset, k=k, training=False)
        results = [f'{key}:{val:.3f}' for key, val in results.items()]
        print(results)
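# --- Illustration (not part of the original code) ---------------------------
# A minimal sketch of the in-batch "output sampling" trick used by run_epoch above:
# `logit[:, target.view(-1)]` keeps only the score columns of the batch's own target
# items, so each row's diagonal entry is the positive item and the off-diagonal
# entries act as negative samples for the TOP1/BPR-style losses.
# Shapes and values below are made up for illustration.
import torch

B, C = 4, 10                                   # 4 parallel sessions, 10 items in the catalog
logit = torch.randn(B, C)                      # scores for every item, per session
target = torch.tensor([2, 7, 5, 1])            # next-item index for each session

logit_sampled = logit[:, target.view(-1)]      # (B, B): column j holds the scores of session j's target item
positives = logit_sampled.diag()               # each session's score for its own target
print(logit_sampled.shape, positives.shape)    # torch.Size([4, 4]) torch.Size([4])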
class GRU4REC:
    def __init__(self, input_size, hidden_size, output_size, num_layers=1,
                 optimizer_type='Adagrad', lr=.01, weight_decay=0,
                 momentum=0, eps=1e-6, loss_type='TOP1',
                 clip_grad=-1, dropout_input=.0, dropout_hidden=.5,
                 batch_size=50, use_cuda=True, time_sort=False, pretrained=None):
        """ The GRU4REC model

        Args:
            input_size (int): dimension of the gru input variables
            hidden_size (int): dimension of the gru hidden units
            output_size (int): dimension of the gru output variables
            num_layers (int): the number of layers in the GRU
            optimizer_type (str): optimizer type for GRU weights
            lr (float): learning rate for the optimizer
            weight_decay (float): weight decay for the optimizer
            momentum (float): momentum for the optimizer
            eps (float): eps for the optimizer
            loss_type (str): type of the loss function to use
            clip_grad (float): clip the gradient norm at clip_grad. No clipping if clip_grad = -1
            dropout_input (float): dropout probability for the input layer
            dropout_hidden (float): dropout probability for the hidden layer
            batch_size (int): mini-batch size
            use_cuda (bool): whether you want to use cuda or not
            time_sort (bool): whether to ensure that the order of sessions is chronological (default: False)
            pretrained (modules.layer.GRU): pretrained GRU layer, if it exists (default: None)
        """
        # Initialize the GRU Layer
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.use_cuda = use_cuda
        if pretrained is None:
            self.gru = GRU(input_size, hidden_size, output_size, num_layers,
                           dropout_input=dropout_input,
                           dropout_hidden=dropout_hidden,
                           use_cuda=use_cuda,
                           batch_size=batch_size)
        else:
            self.gru = pretrained

        # Initialize the optimizer
        self.optimizer_type = optimizer_type
        self.weight_decay = weight_decay
        self.momentum = momentum
        self.lr = lr
        self.eps = eps
        self.optimizer = Optimizer(self.gru.parameters(),
                                   optimizer_type=optimizer_type,
                                   lr=lr,
                                   weight_decay=weight_decay,
                                   momentum=momentum,
                                   eps=eps)

        # Initialize the loss function
        self.loss_type = loss_type
        self.loss_fn = LossFunction(loss_type, use_cuda)

        # gradient clipping (optional)
        self.clip_grad = clip_grad

        # etc
        self.time_sort = time_sort

    def train(self, n_epochs=10, save_dir='./models', model_name='GRU4REC'):
        """
        Train the GRU4REC model on a pandas dataframe for several training epochs,
        and store the intermediate models to the user-specified directory.

        Args:
            n_epochs (int): the number of training epochs to run
            save_dir (str): the path to save the intermediate trained models
            model_name (str): name of the model
        """
        print(f'Model Name:{model_name}')
        # Time the training process
        start_time = time.time()
        for epoch in range(n_epochs):
            loss = self.run_epoch()
            end_time = time.time()
            wall_clock = (end_time - start_time) / 60
            print(f'Epoch:{epoch+1:2d}/Loss:{loss:0.3f}/TrainingTime:{wall_clock:0.3f}(min)')
            start_time = time.time()

            # Store the intermediate model
            save_dir = Path(save_dir)
            if not save_dir.exists():
                save_dir.mkdir()
            model_fname = f'{model_name}_{self.loss_type}_{self.optimizer_type}_{self.lr}_epoch{epoch+1:d}'
            torch.save(self.gru.state_dict(), save_dir / model_fname)

    def run_epoch(self):
        """ Run a single training epoch """
        self.gru.train()

        # initialize
        mb_losses = []
        optimizer = self.optimizer
        hidden = self.gru.init_hidden().data

        # Start the training loop
        loader = SessionDataLoader(df=self.df_train,
                                   hidden=hidden,
                                   session_key=self.session_key,
                                   item_key=self.item_key,
                                   time_key=self.time_key,
                                   batch_size=self.batch_size,
                                   training=self.gru.training,
                                   time_sort=self.time_sort)
        for input, target, hidden in loader.generate_batch():
            if self.use_cuda:
                input = input.cuda()
                target = target.cuda()
            # Go through the GRU layer
            logit, hidden = self.gru(input, target, hidden)
            ######################## IMPORTANT #########################
            # update the hidden state for the dataloader from the outside
            #############################################################
            loader.update_hidden(hidden.data)
            # Calculate the mini-batch loss
            mb_loss = self.loss_fn(logit)
            mb_losses.append(mb_loss.item())  # .item() replaces the legacy .data[0] indexing
            # flush the gradient b/f backprop
            optimizer.zero_grad()
            # Backprop
            mb_loss.backward()
            # Gradient Clipping (optional)
            if self.clip_grad != -1:
                for p in self.gru.parameters():
                    p.grad.data.clamp_(max=self.clip_grad)
            # Mini-batch GD
            optimizer.step()

        avg_epoch_loss = np.mean(mb_losses)

        return avg_epoch_loss

    def test(self, k=20, batch_size=50):
        """ Model evaluation

        Args:
            k (int): the length of the recommendation list
            batch_size (int): testing batch_size

        Returns:
            avg_recall: mean of the Recall@K over the session-parallel mini-batches
            avg_mrr: mean of the MRR@K over the session-parallel mini-batches
        """
        # set the gru layer into inference mode
        self.gru.eval()

        recalls = []
        mrrs = []
        hidden = self.gru.init_hidden().data

        # Start the testing loop
        loader = SessionDataLoader(df=self.df_test,
                                   hidden=hidden,
                                   session_key=self.session_key,
                                   item_key=self.item_key,
                                   time_key=self.time_key,
                                   batch_size=batch_size,
                                   training=self.gru.training,
                                   time_sort=self.time_sort)
        for input, target, hidden in loader.generate_batch():
            if self.use_cuda:
                input = input.cuda()
                target = target.cuda()
            # forward propagation
            logit, hidden = self.gru(input, target, hidden)
            # update the hidden state for the dataloader
            loader.update_hidden(hidden.data)
            # Evaluate the results
            recall, mrr = E.evaluate(logit, target, k)
            recalls.append(recall)
            mrrs.append(mrr)

        avg_recall = np.mean(recalls)
        avg_mrr = np.mean(mrrs)

        # reset the gru to a training mode
        self.gru.train()

        return avg_recall, avg_mrr

    def init_data(self, df_train, df_test, session_key, time_key, item_key):
        """
        Initialize the training & test data.
        The training/test set and the session/time/item keys will be stored for later reuse.

        Args:
            df_train (pd.DataFrame): training set required to retrieve the training item indices
            df_test (pd.DataFrame): test set
            session_key (str): session ID
            time_key (str): time ID
            item_key (str): item ID
        """
        # Specify the identifiers
        self.session_key = session_key
        self.time_key = time_key
        self.item_key = item_key

        # Initialize the dataframes into adequate forms
        self.df_train = self.init_df(df_train, session_key, time_key, item_key)
        self.df_test = self.init_df(df_test, session_key, time_key, item_key,
                                    item_ids=df_train[item_key].unique())

    @staticmethod
    def init_df(df, session_key, time_key, item_key, item_ids=None):
        """
        Initialize the dataframe. Involves the following steps:
            1) Add new item indices to the dataframe
            2) Sort the df

        Args:
            session_key: session identifier
            time_key: timestamp
            item_key: item identifier
            item_ids: unique item ids. Should be `None` if the df is a training set, and
                should include the ids of the items in the training set if the df is a test set.
        """
        # add an item index column named "item_idx" to the df
        if item_ids is None:
            item_ids = df[item_key].unique()  # unique item ids
        item2idx = pd.Series(data=np.arange(len(item_ids)), index=item_ids)
        df = pd.merge(df,
                      pd.DataFrame({item_key: item_ids,
                                    'item_idx': item2idx[item_ids].values}),
                      on=item_key,
                      how='inner')
        """
        Sort the df by time, and then by session ID.
        That is, df is sorted by session ID and clicks within a session are next to
        each other, where the clicks within a session are time-ordered.
        """
        df.sort_values([session_key, time_key], inplace=True)

        return df
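# --- Illustration (not part of the original code) ---------------------------
# A toy sketch of what init_df does with the item2idx mapping: raw item IDs are remapped
# to contiguous indices, and the inner merge drops test clicks on items unseen during
# training. Column names follow the scripts above; the data is made up.
import numpy as np
import pandas as pd

df = pd.DataFrame({'SessionId': [1, 1, 2, 2, 2],
                   'ItemId':    [501, 802, 802, 999, 501],
                   'TimeStamp': [10, 11, 12, 13, 14]})
item_ids = df['ItemId'].unique()                               # e.g. [501, 802, 999]
item2idx = pd.Series(np.arange(len(item_ids)), index=item_ids)
df = pd.merge(df,
              pd.DataFrame({'ItemId': item_ids, 'item_idx': item2idx[item_ids].values}),
              on='ItemId', how='inner')
df.sort_values(['SessionId', 'TimeStamp'], inplace=True)
print(df[['SessionId', 'ItemId', 'item_idx']])
# item_idx is a contiguous 0..n_items-1 index suitable for embedding / one-hot lookups;
# for a test set, pass the training item_ids so unseen items are dropped.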
eps = 1e-6
loss_type = 'CrossEntropy'
lr = 0.01
dropout_hidden = 0
dropout_input = 0
batch_size = 50
momentum = 0

gru = GRU(input_size, if_embedding, embedding_size, hidden_size, output_size,
          num_layers=num_layers,
          dropout_input=dropout_input,
          dropout_hidden=dropout_hidden,
          batch_size=batch_size,
          use_cuda=use_cuda,
          cuda_id=cuda_id)

print(loss_type + ':')
for i in range(1, 21):
    model_name = 'GRU4REC_CrossEntropy_Adagrad_0.01_epoch%d' % i
    print(model_name)
    model_file = r'./models/' + model_name
    gru.load_state_dict(torch.load(model_file))
    model = GRU4REC(input_size, if_embedding,
class GRU4REC:
    def __init__(self, input_size, hidden_size, output_size, num_layers=1,
                 optimizer_type='Adagrad', lr=.01, weight_decay=0,
                 momentum=0, eps=1e-6, loss_type='TOP1',
                 clip_grad=-1, dropout_input=.0, dropout_hidden=.5,
                 batch_size=50, use_cuda=True, time_sort=False, pretrained=None,
                 n_sample=2048, sample_alpha=0.75, sample_store=10000000, bpreg=1.0):
        """ The GRU4REC model

        Args:
            input_size (int): dimension of the gru input variables
            hidden_size (int): dimension of the gru hidden units
            output_size (int): dimension of the gru output variables
            num_layers (int): the number of layers in the GRU
            optimizer_type (str): optimizer type for GRU weights
            lr (float): learning rate for the optimizer
            weight_decay (float): weight decay for the optimizer
            momentum (float): momentum for the optimizer
            eps (float): eps for the optimizer
            loss_type (str): type of the loss function to use
            clip_grad (float): clip the gradient norm at clip_grad. No clipping if clip_grad = -1
            dropout_input (float): dropout probability for the input layer
            dropout_hidden (float): dropout probability for the hidden layer
            batch_size (int): mini-batch size
            use_cuda (bool): whether you want to use cuda or not
            time_sort (bool): whether to ensure that the order of sessions is chronological (default: False)
            pretrained (modules.layer.GRU): pretrained GRU layer, if it exists (default: None)
            n_sample (int): number of additional negative samples per mini-batch
            sample_alpha (float): exponent on item popularity for negative sampling
                (1 = popularity-based sampling, 0 = uniform sampling)
            sample_store (int): size of the precomputed negative-sample cache
            bpreg (float): regularization coefficient (used by the BPR-max loss)
        """
        # Initialize the GRU Layer
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.use_cuda = use_cuda
        self.device = torch.device('cuda' if use_cuda else 'cpu')

        ### modified: extra negative-sampling hyperparameters ###
        self.n_sample = n_sample
        self.sample_alpha = sample_alpha
        self.sample_store = sample_store
        self.bpreg = bpreg
        ### end modified ###

        if pretrained is None:
            self.gru = GRU(input_size, hidden_size, output_size, num_layers,
                           dropout_input=dropout_input,
                           dropout_hidden=dropout_hidden,
                           batch_size=batch_size,
                           use_cuda=use_cuda)
        else:
            self.gru = pretrained

        # Initialize the optimizer
        self.optimizer_type = optimizer_type
        self.weight_decay = weight_decay
        self.momentum = momentum
        self.lr = lr
        self.eps = eps
        self.optimizer = Optimizer(self.gru.parameters(),
                                   optimizer_type=optimizer_type,
                                   lr=lr,
                                   weight_decay=weight_decay,
                                   momentum=momentum,
                                   eps=eps)

        # Initialize the loss function
        self.loss_type = loss_type
        self.loss_fn = LossFunction(loss_type, use_cuda)

        # gradient clipping (optional)
        self.clip_grad = clip_grad

        # etc
        self.time_sort = time_sort

    def generate_neg_samples(self, n_items, pop, length):
        if self.sample_alpha:
            sample = np.searchsorted(pop, np.random.rand(self.n_sample * length))
        else:
            # draw a uniform random sample of size n_sample * length from np.arange(n_items)
            sample = np.random.choice(n_items, size=self.n_sample * length)
        if length > 1:
            sample = sample.reshape((length, self.n_sample))

        return sample

    def run_epoch(self, dataset, k=20, training=True):
        """ Run a single training epoch """
        start_time = time.time()

        # initialize
        losses = []
        recalls = []
        mrrs = []
        ## added ##
        zippers = []
        ## end added ##
        optimizer = self.optimizer
        hidden = self.gru.init_hidden()
        if not training:
            self.gru.eval()
        device = self.device

        def reset_hidden(hidden, mask):
            """Helper function that resets hidden state when some sessions terminate"""
            if len(mask) != 0:
                hidden[:, mask, :] = 0
            return hidden

        # Start the training loop.
        # The loader iterates batch by batch; each iteration yields an input tensor,
        # e.g. tensor([31, 26, 27, 29, 24]), and a target tensor, e.g. tensor([31, 26, 28, 17, 24]).
        loader = SessionDataLoader(dataset, batch_size=self.batch_size)

        n_items = len(dataset.items)
        # sampling: add extra negative samples on top of the in-batch ones
        if self.n_sample > 0:
            pop = dataset.df.groupby('ItemId').size()  # item popularity (supp), e.g.:
            # ItemId
            # 214507331    1
            # 214507365    1
            # sample_alpha = 1 gives popularity-based sampling; sample_alpha = 0 gives uniform sampling
            pop = pop[dataset.itemmap[dataset.item_key].values].values ** self.sample_alpha
            # an item is chosen as a sample with probability proportional to supp ** sample_alpha
            pop = pop.cumsum() / pop.sum()
            pop[-1] = 1
            if self.sample_store:
                generate_length = self.sample_store // self.n_sample
                if generate_length <= 1:
                    sample_store = 0
                    print('No example store was used')
                else:
                    neg_samples = self.generate_neg_samples(n_items, pop, generate_length)
                    sample_pointer = 0
            else:
                print('No example store was used')

        for input, target, mask in loader:
            input = input.to(device)
            target = target.to(device)
            # print(input)
            # print(target)

            # extra output sampling
            if self.n_sample > 0 and training:
                if self.sample_store:
                    if sample_pointer == generate_length:
                        neg_samples = self.generate_neg_samples(n_items, pop, generate_length)
                        sample_pointer = 0
                    sample = neg_samples[sample_pointer]
                    sample_pointer += 1
                else:
                    sample = self.generate_neg_samples(n_items, pop, 1)  # n_items was missing from the original call
                # move target to CPU before concatenating it with the NumPy sample
                y = torch.LongTensor(np.hstack([target.cpu().numpy(), sample]))
            else:
                y = target  # no extra sampling

            # reset the hidden states if some sessions have just terminated
            hidden = reset_hidden(hidden, mask).detach()
            # Go through the GRU layer
            logit, hidden = self.gru(input, target, hidden)
            # Output sampling: keep the score columns of the in-batch targets plus the extra negatives
            y = y.to(device)
            logit_sampled = logit[:, y]
            # Calculate the mini-batch loss
            loss = self.loss_fn(logit_sampled)
            with torch.no_grad():
                recall, mrr = E.evaluate(logit, target, k)
            losses.append(loss.item())
            recalls.append(recall)
            mrrs.append(mrr)
            # Mini-batch GD
            if training:
                # Backprop
                loss.backward()
                # Gradient Clipping (optional): clamp after backprop so the freshly
                # computed gradients are the ones being clipped
                if self.clip_grad != -1:
                    for p in self.gru.parameters():
                        p.grad.data.clamp_(max=self.clip_grad)
                optimizer.step()
                optimizer.zero_grad()  # flush the gradient after the optimization

        results = dict()
        results['loss'] = np.mean(losses)
        results['recall'] = np.mean(recalls)
        results['mrr'] = np.mean(mrrs)
        end_time = time.time()
        results['time'] = (end_time - start_time) / 60
        if not training:
            self.gru.train()

        return results

    def train(self, dataset, k=20, n_epochs=10, save_dir='./models', save=True, model_name='GRU4REC'):
        """
        Train the GRU4REC model on a pandas dataframe for several training epochs,
        and store the intermediate models to the user-specified directory.

        Args:
            n_epochs (int): the number of training epochs to run
            save_dir (str): the path to save the intermediate trained models
            model_name (str): name of the model
        """
        print(f'Training {model_name}...')
        for epoch in range(n_epochs):
            results = self.run_epoch(dataset, k=k, training=True)
            # use `key`/`val` so the top-k parameter `k` is not shadowed
            results = [f'{key}:{val:.3f}' for key, val in results.items()]
            print(f'epoch:{epoch+1:2d}/{"/".join(results)}')

            # Store the intermediate model
            if save:
                save_dir = Path(save_dir)
                if not save_dir.exists():
                    save_dir.mkdir()
                model_fname = f'{model_name}_{self.loss_type}_{self.optimizer_type}_{self.lr}_epoch{epoch+1:d}'
                torch.save(self.gru.state_dict(), save_dir / model_fname)

    def test(self, dataset, k=20):
        """ Model evaluation

        Args:
            k (int): the length of the recommendation list

        Returns:
            avg_loss: mean of the losses over the session-parallel mini-batches
            avg_recall: mean of the Recall@K over the session-parallel mini-batches
            avg_mrr: mean of the MRR@K over the session-parallel mini-batches
            wall_clock: time taken for testing
        """
        results = self.run_epoch(dataset, k=k, training=False)
        results = [f'{key}:{val:.3f}' for key, val in results.items()]
        print(f'Test result: {"/".join(results)}')
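# --- Illustration (not part of the original code) ---------------------------
# A toy sketch of the popularity-based sampling in generate_neg_samples: items are drawn
# with probability proportional to supp ** sample_alpha by inverse-CDF sampling over the
# cumulative distribution with np.searchsorted. The counts below are made up.
import numpy as np

supp = np.array([100, 10, 5, 1], dtype=np.float64)       # click counts for 4 items
sample_alpha = 0.75

pop = supp ** sample_alpha
pop = pop.cumsum() / pop.sum()                            # cumulative distribution over item indices
pop[-1] = 1                                               # guard against floating-point round-off

n_sample = 8
sample = np.searchsorted(pop, np.random.rand(n_sample))   # inverse-CDF sampling -> item indices
print(sample)                                             # mostly 0 (the most popular item)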
class GRU4REC:
    # NOTE: this version uses the legacy pre-0.4 PyTorch API (`Variable`, `volatile`, `.data[0]`).
    def __init__(self, input_size, hidden_size, output_size, num_layers=1,
                 optimizer_type='Adagrad', lr=.05, weight_decay=0,
                 momentum=0, eps=1e-6, loss_type='TOP1',
                 clip_grad=-1, dropout_input=.0, dropout_hidden=.5,
                 batch_size=50, use_cuda=True, time_sort=False, pretrained=None):
        """ The GRU4REC model

        Args:
            input_size (int): dimension of the gru input variables
            hidden_size (int): dimension of the gru hidden units
            output_size (int): dimension of the gru output variables
            num_layers (int): the number of layers in the GRU
            optimizer_type (str): optimizer type for GRU weights
            lr (float): learning rate for the optimizer
            weight_decay (float): weight decay for the optimizer
            momentum (float): momentum for the optimizer
            eps (float): eps for the optimizer
            loss_type (str): type of the loss function to use
            clip_grad (float): clip the gradient norm at clip_grad. No clipping if clip_grad = -1
            dropout_input (float): dropout probability for the input layer
            dropout_hidden (float): dropout probability for the hidden layer
            batch_size (int): mini-batch size
            use_cuda (bool): whether you want to use cuda or not
            time_sort (bool): whether to ensure that the order of sessions is chronological (default: False)
            pretrained (modules.layer.GRU): pretrained GRU layer, if it exists (default: None)
        """
        # Initialize the GRU Layer
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.use_cuda = use_cuda
        if pretrained is None:
            self.gru = GRU(input_size, hidden_size, output_size, num_layers,
                           dropout_input=dropout_input,
                           dropout_hidden=dropout_hidden,
                           use_cuda=use_cuda,
                           batch_size=batch_size)
        else:
            self.gru = pretrained

        # Initialize the optimizer
        self.optimizer_type = optimizer_type
        self.weight_decay = weight_decay
        self.momentum = momentum
        self.lr = lr
        self.eps = eps
        self.optimizer = Optimizer(self.gru.parameters(),
                                   optimizer_type=optimizer_type,
                                   lr=lr,
                                   weight_decay=weight_decay,
                                   momentum=momentum,
                                   eps=eps)

        # Initialize the loss function
        self.loss_type = loss_type
        self.loss_fn = LossFunction(loss_type, use_cuda)

        # gradient clipping (optional)
        self.clip_grad = clip_grad

        # etc
        self.time_sort = time_sort

    def train(self, df, session_key, time_key, item_key, n_epochs=10,
              save_dir='./models', model_name='GRU4REC'):
        """
        Train the GRU4REC model on a pandas dataframe for several training epochs,
        and store the intermediate models to the user-specified directory.

        Args:
            df (pd.DataFrame): training dataset
            session_key (str): session ID
            time_key (str): time ID
            item_key (str): item ID
            n_epochs (int): the number of training epochs to run
            save_dir (str): the path to save the intermediate trained models
            model_name (str): name of the model
        """
        df, click_offsets, session_idx_arr = GRU4REC.init_data(df, session_key, time_key, item_key,
                                                               time_sort=self.time_sort)
        # Time the training process
        start_time = time.time()
        for epoch in range(n_epochs):
            loss = self.run_epoch(df, click_offsets, session_idx_arr)
            end_time = time.time()
            wall_clock = (end_time - start_time) / 60
            print(f'Epoch:{epoch+1:2d}/Loss:{loss:0.3f}/TrainingTime:{wall_clock:0.3f}(min)')
            start_time = time.time()

            # Store the intermediate model
            save_dir = Path(save_dir)
            if not save_dir.exists():
                save_dir.mkdir()
            model_fname = f'{model_name}_{self.loss_type}_{self.optimizer_type}_{self.lr}_epoch{epoch+1:d}'
            torch.save(self.gru.state_dict(), save_dir / model_fname)

    def run_epoch(self, df, click_offsets, session_idx_arr):
        """ Runs a single training epoch """
        mb_losses = []
        # initializations
        iters = np.arange(self.batch_size)
        maxiter = iters.max()
        start = click_offsets[session_idx_arr[iters]]
        end = click_offsets[session_idx_arr[iters] + 1]
        # initialize the hidden state
        hidden = self.gru.init_hidden().data
        optimizer = self.optimizer

        # Start the training loop
        finished = False
        while not finished:
            minlen = (end - start).min()
            # Item indices (for embedding) for clicks where the first sessions start
            idx_target = df.item_idx.values[start]
            for i in range(minlen - 1):
                # Build inputs, targets, and hidden states
                idx_input = idx_target
                idx_target = df.item_idx.values[start + i + 1]
                input = torch.LongTensor(idx_input)  # (B) At first, input is a Tensor
                target = Variable(torch.LongTensor(idx_target))  # (B)
                if self.use_cuda:
                    input = input.cuda()
                    target = target.cuda()
                # Now, convert into an embedded Variable
                embedded = self.gru.emb(input)
                hidden = Variable(hidden)
                # Go through the GRU layer
                logit, hidden = self.gru(embedded, target, hidden)
                # Calculate the mini-batch loss
                mb_loss = self.loss_fn(logit)
                mb_losses.append(mb_loss.data[0])
                # flush the gradient b/f backprop
                optimizer.zero_grad()
                # Backprop
                mb_loss.backward()
                # Gradient Clipping (optional)
                if self.clip_grad != -1:
                    for p in self.gru.parameters():
                        p.grad.data.clamp_(max=self.clip_grad)
                # Mini-batch GD
                optimizer.step()
                # Detach the hidden state for later reuse
                hidden = hidden.data

            # click indices where a particular session meets its second-to-last element
            start = start + (minlen - 1)
            # figure out how many sessions should terminate
            mask = np.arange(len(iters))[(end - start) <= 1]
            for idx in mask:
                maxiter += 1
                if maxiter >= len(click_offsets) - 1:
                    finished = True
                    break
                # update the next starting/ending point
                iters[idx] = maxiter
                start[idx] = click_offsets[session_idx_arr[maxiter]]
                end[idx] = click_offsets[session_idx_arr[maxiter] + 1]
            # reset the rnn hidden state to zero after the transition
            if len(mask) != 0:
                hidden[:, mask, :] = 0

        avg_epoch_loss = np.mean(mb_losses)

        return avg_epoch_loss

    def predict(self, input, target, hidden):
        """ Forward propagation for testing

        Args:
            input (B,C): torch.LongTensor. The one-hot embedding for the item indices
            target (B,): a Variable that stores the indices for the next items
            hidden: previous hidden state

        Returns:
            logits (B,C): logits for the next items
            hidden: next hidden state
        """
        # convert the item indices into embeddings
        embedded = self.gru.emb(input, volatile=True)
        hidden = Variable(hidden, volatile=True)
        # forward propagation
        logits, hidden = self.gru(embedded, target, hidden)

        return logits, hidden

    def test(self, df_train, df_test, session_key, time_key, item_key, k=20, batch_size=50):
        """ Model evaluation

        Args:
            df_train (pd.DataFrame): training set required to retrieve the training item indices
            df_test (pd.DataFrame): test set
            session_key (str): session ID
            time_key (str): time ID
            item_key (str): item ID
            k (int): the length of the recommendation list
            batch_size (int): testing batch_size

        Returns:
            avg_recall: mean of the Recall@K over the session-parallel mini-batches
            avg_mrr: mean of the MRR@K over the session-parallel mini-batches
        """
        # set the gru layer into inference mode
        if self.gru.training:
            self.gru.switch_mode()

        recalls = []
        mrrs = []

        # initializations
        # Build item2idx from the training data
        iids = df_train[item_key].unique()  # unique item ids
        item2idx = pd.Series(data=np.arange(len(iids)), index=iids)
        df_test = pd.merge(df_test,
                           pd.DataFrame({item_key: iids,
                                         'item_idx': item2idx[iids].values}),
                           on=item_key,
                           how='inner')
        # Sort the df by time, and then by session ID
        df_test.sort_values([session_key, time_key], inplace=True)
        # Return the offsets of the beginning clicks of each session ID
        click_offsets = GRU4REC.get_click_offsets(df_test, session_key)
        session_idx_arr = GRU4REC.order_session_idx(df_test, session_key, time_key,
                                                    time_sort=self.time_sort)

        iters = np.arange(batch_size)
        maxiter = iters.max()
        start = click_offsets[session_idx_arr[iters]]
        end = click_offsets[session_idx_arr[iters] + 1]
        hidden = self.gru.init_hidden().data

        # Start the testing loop
        finished = False
        while not finished:
            minlen = (end - start).min()
            # Item indices (for embedding) for clicks where the first sessions start
            idx_target = df_test.item_idx.values[start]
            for i in range(minlen - 1):
                # Build inputs, targets, and hidden states
                idx_input = idx_target
                idx_target = df_test.item_idx.values[start + i + 1]
                input = torch.LongTensor(idx_input)  # (B) At first, input is a Tensor
                target = Variable(torch.LongTensor(idx_target), volatile=True)  # (B)
                if self.use_cuda:
                    input = input.cuda()
                    target = target.cuda()

                logit, hidden = self.predict(input, target, hidden)
                recall, mrr = evaluate(logit, target, k)
                recalls.append(recall)
                mrrs.append(mrr)
                # Detach the hidden state for later reuse
                hidden = hidden.data

            # click indices where a particular session meets its second-to-last element
            start = start + (minlen - 1)
            # figure out how many sessions should terminate
            mask = np.arange(len(iters))[(end - start) <= 1]
            for idx in mask:
                maxiter += 1
                if maxiter >= len(click_offsets) - 1:
                    finished = True
                    break
                # update the next starting/ending point
                iters[idx] = maxiter
                start[idx] = click_offsets[session_idx_arr[maxiter]]
                end[idx] = click_offsets[session_idx_arr[maxiter] + 1]
            # reset the rnn hidden state to zero after the transition
            if len(mask) != 0:
                hidden[:, mask, :] = 0

        avg_recall = np.mean(recalls)
        avg_mrr = np.mean(mrrs)

        # reset the gru to a training mode
        self.gru.switch_mode()

        return avg_recall, avg_mrr

    @staticmethod
    def init_data(df, session_key, time_key, item_key, time_sort):
        """ Initialize the data. """
        # add item indices to the dataframe
        df = GRU4REC.add_item_indices(df, item_key)
        """
        Sort the df by time, and then by session ID.
        That is, df is sorted by session ID and clicks within a session are next to
        each other, where the clicks within a session are time-ordered.
        """
        df.sort_values([session_key, time_key], inplace=True)
        click_offsets = GRU4REC.get_click_offsets(df, session_key)
        session_idx_arr = GRU4REC.order_session_idx(df, session_key, time_key, time_sort=time_sort)

        return df, click_offsets, session_idx_arr

    @staticmethod
    def add_item_indices(df, item_key):
        """
        Adds an item index column named "item_idx" to the df.

        Args:
            df: pd.DataFrame to add the item indices to

        Returns:
            df: copy of the original df with item indices
        """
        iids = df[item_key].unique()  # unique item ids
        item2idx = pd.Series(data=np.arange(len(iids)), index=iids)
        df = pd.merge(df,
                      pd.DataFrame({item_key: iids,
                                    'item_idx': item2idx[iids].values}),
                      on=item_key,
                      how='inner')

        return df

    @staticmethod
    def get_click_offsets(df, session_key):
        """
        Return the offsets of the beginning clicks of each session ID,
        where the offset is calculated against the first click of the first session ID.
        """
        offsets = np.zeros(df[session_key].nunique() + 1, dtype=np.int32)
        # group & sort the df by session_key and get the offset values
        offsets[1:] = df.groupby(session_key).size().cumsum()

        return offsets

    @staticmethod
    def order_session_idx(df, session_key, time_key, time_sort=False):
        """ Order the session indices """
        if time_sort:
            # starting time of each session, sorted by session IDs
            sessions_start_time = df.groupby(session_key)[time_key].min().values
            # order the session indices by session starting times
            session_idx_arr = np.argsort(sessions_start_time)
        else:
            session_idx_arr = np.arange(df[session_key].nunique())

        return session_idx_arr
class GRU4REC:
    def __init__(self, input_size, if_embedding, embedding_size, hidden_size, output_size,
                 num_layers=1, optimizer_type='Adagrad', lr=.01, weight_decay=0,
                 momentum=0, eps=1e-6, loss_type='TOP1',
                 clip_grad=-1, dropout_input=.0, dropout_hidden=.5,
                 batch_size=50, use_cuda=True, cuda_id=1, compress=False,
                 time_sort=False, pretrained=None):
        """ The GRU4REC model

        Args:
            input_size (int): dimension of the gru input variables
            hidden_size (int): dimension of the gru hidden units
            output_size (int): dimension of the gru output variables
            num_layers (int): the number of layers in the GRU
            optimizer_type (str): optimizer type for GRU weights
            lr (float): learning rate for the optimizer
            weight_decay (float): weight decay for the optimizer
            momentum (float): momentum for the optimizer
            eps (float): eps for the optimizer
            loss_type (str): type of the loss function to use
            clip_grad (float): clip the gradient norm at clip_grad. No clipping if clip_grad = -1
            dropout_input (float): dropout probability for the input layer
            dropout_hidden (float): dropout probability for the hidden layer
            batch_size (int): mini-batch size
            use_cuda (bool): whether you want to use cuda or not
            time_sort (bool): whether to ensure that the order of sessions is chronological (default: False)
            pretrained (modules.layer.GRU): pretrained GRU layer, if it exists (default: None)
        """
        # Initialize the GRU Layer
        self.input_size = input_size
        self.if_embedding = if_embedding
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.use_cuda = use_cuda
        self.cuda_id = cuda_id
        # must specify cuda_id, otherwise it falls back to torch.cuda.current_device()
        self.device = torch.device('cuda:%d' % cuda_id if use_cuda else 'cpu')
        print(self.device)
        if pretrained is None:
            self.gru = GRU(input_size, if_embedding, embedding_size, hidden_size, output_size,
                           num_layers,
                           dropout_input=dropout_input,
                           dropout_hidden=dropout_hidden,
                           batch_size=batch_size,
                           use_cuda=use_cuda,
                           cuda_id=cuda_id)
        else:
            self.gru = pretrained

        # Initialize the optimizer
        self.optimizer_type = optimizer_type
        self.weight_decay = weight_decay
        self.momentum = momentum
        self.lr = lr
        self.eps = eps

        self.compress = compress
        self.compression_scheduler = None
        if self.compress:
            # Create a CompressionScheduler and configure it from a YAML schedule file
            source = self.compress
            self.compression_scheduler = distiller.config.file_config(self.gru, None, self.compress)

        self.optimizer = Optimizer(self.gru.parameters(),
                                   optimizer_type=optimizer_type,
                                   lr=lr,
                                   weight_decay=weight_decay,
                                   momentum=momentum,
                                   eps=eps)

        # Initialize the loss function
        self.loss_type = loss_type
        self.loss_fn = LossFunction(loss_type, use_cuda, cuda_id)

        # gradient clipping (optional)
        self.clip_grad = clip_grad

        # etc
        self.time_sort = time_sort

    def run_epoch(self, dataset, k=20, training=True, compression_scheduler=None, epoch=0):
        """ Run a single training epoch """
        start_time = time.time()

        # initialize
        losses = []
        recalls = []
        mrrs = []
        optimizer = self.optimizer
        hidden = self.gru.init_hidden()
        if not training:
            self.gru.eval()
        device = self.device

        def reset_hidden(hidden, mask):
            """Helper function that resets hidden state when some sessions terminate"""
            if len(mask) != 0:
                hidden[:, mask, :] = 0
            return hidden

        # Start the training loop
        loader = SessionDataLoader(dataset, batch_size=self.batch_size)
        batch_id = 0
        steps_per_epoch = math.ceil(len(dataset.items) / self.batch_size)
        for input, target, mask in loader:
            batch_id = batch_id + 1
            input = input.to(device)
            target = target.to(device)
            # reset the hidden states if some sessions have just terminated
            hidden = reset_hidden(hidden, mask).detach()

            if compression_scheduler:
                compression_scheduler.on_minibatch_begin(epoch,
                                                         minibatch_id=batch_id,
                                                         minibatches_per_epoch=steps_per_epoch)

            # Go through the GRU layer
            logit, hidden = self.gru(input, target, hidden)
            # Output sampling
            logit_sampled = logit[:, target.view(-1)]
            # Calculate the mini-batch loss
            loss = self.loss_fn(logit_sampled)
            with torch.no_grad():
                recall, mrr = E.evaluate(logit, target, k)
            losses.append(loss.item())
            recalls.append(recall)
            mrrs.append(mrr)

            # Mini-batch GD
            if training:
                if compression_scheduler:
                    # Before running the backward phase, we allow the scheduler to modify the loss
                    # (e.g. add regularization loss)
                    loss = compression_scheduler.before_backward_pass(epoch,
                                                                      minibatch_id=batch_id,
                                                                      minibatches_per_epoch=steps_per_epoch,
                                                                      loss=loss,
                                                                      return_loss_components=False)
                # Backprop
                loss.backward()
                # Gradient Clipping (optional): clamp after backprop so the freshly
                # computed gradients are the ones being clipped
                if self.clip_grad != -1:
                    for p in self.gru.parameters():
                        p.grad.data.clamp_(max=self.clip_grad)
                optimizer.step()
                optimizer.zero_grad()  # flush the gradient after the optimization

            if compression_scheduler:
                compression_scheduler.on_minibatch_end(epoch,
                                                       minibatch_id=batch_id,
                                                       minibatches_per_epoch=steps_per_epoch)

        results = dict()
        results['loss'] = np.mean(losses)
        results['recall'] = np.mean(recalls)
        results['mrr'] = np.mean(mrrs)
        end_time = time.time()
        results['time'] = (end_time - start_time) / 60
        if not training:
            self.gru.train()

        return results

    def train(self, dataset, k=20, n_epochs=20, save_dir='./models', save=True, model_name='GRU4REC'):
        """
        Train the GRU4REC model on a pandas dataframe for several training epochs,
        and store the intermediate models to the user-specified directory.

        Args:
            n_epochs (int): the number of training epochs to run
            save_dir (str): the path to save the intermediate trained models
            model_name (str): name of the model
        """
        print(f'Training {model_name}...')
        for epoch in range(n_epochs):
            if self.compression_scheduler:
                self.compression_scheduler.on_epoch_begin(epoch)

            results = self.run_epoch(dataset, k=k, training=True,
                                     compression_scheduler=self.compression_scheduler,
                                     epoch=epoch)
            # use `key`/`val` so the top-k parameter `k` is not shadowed
            results = [f'{key}:{val:.3f}' for key, val in results.items()]
            print(f'epoch:{epoch+1:2d}/{"/".join(results)}')

            t, total = distiller.weights_sparsity_tbl_summary(self.gru, return_total_sparsity=True)
            print("\nParameters:\n" + str(t))
            print('Total sparsity: {:0.2f}\n'.format(total))

            if self.compression_scheduler:
                self.compression_scheduler.on_epoch_end(epoch)

            # Store the intermediate model
            if save:
                save_dir = Path(save_dir)
                if not save_dir.exists():
                    save_dir.mkdir()
                model_fname = f'{model_name}_{self.loss_type}_{self.optimizer_type}_{self.lr}_epoch{epoch+1:d}'
                torch.save(self.gru.state_dict(), save_dir / model_fname)

    def test(self, dataset, k=20):
        """ Model evaluation

        Args:
            k (int): the length of the recommendation list

        Returns:
            avg_loss: mean of the losses over the session-parallel mini-batches
            avg_recall: mean of the Recall@K over the session-parallel mini-batches
            avg_mrr: mean of the MRR@K over the session-parallel mini-batches
            wall_clock: time taken for testing
        """
        results = self.run_epoch(dataset, k=k, training=False)
        results = [f'{key}:{val:.3f}' for key, val in results.items()]
        print(f'Test result: {"/".join(results)}')
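# --- Illustration (not part of the original code) ---------------------------
# A minimal sketch of the reset_hidden helper used by the run_epoch variants above:
# zero the hidden-state slices of batch positions whose session just ended, then detach
# so truncated BPTT does not cross session boundaries. Sizes below are made up.
import torch

num_layers, batch_size, hidden_size = 1, 5, 3
hidden = torch.randn(num_layers, batch_size, hidden_size)

mask = [1, 3]                   # the sessions in slots 1 and 3 terminated on the previous step
if len(mask) != 0:
    hidden[:, mask, :] = 0      # fresh state for the new sessions entering those slots
hidden = hidden.detach()        # cut the autograd graph at the session boundary
print(hidden[0])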