Example #1
    def __init__(self,
                 input_size,
                 hidden_size,
                 output_size,
                 num_layers=1,
                 optimizer_type='Adagrad',
                 lr=.01,
                 weight_decay=0,
                 momentum=0,
                 eps=1e-6,
                 loss_type='TOP1',
                 clip_grad=-1,
                 dropout_input=.0,
                 dropout_hidden=.5,
                 batch_size=50,
                 use_cuda=True,
                 time_sort=False,
                 pretrained=None):
        """ The GRU4REC model

        Args:
            input_size (int): dimension of the gru input variables
            hidden_size (int): dimension of the gru hidden units
            output_size (int): dimension of the gru output variables
            num_layers (int): the number of layers in the GRU
            optimizer_type (str): optimizer type for GRU weights
            lr (float): learning rate for the optimizer
            weight_decay (float): weight decay for the optimizer
            momentum (float): momentum for the optimizer
            eps (float): eps for the optimizer
            loss_type (str): type of the loss function to use
            clip_grad (float): clip the gradient norm at clip_grad. No clipping if clip_grad = -1
            dropout_input (float): dropout probability for the input layer
            dropout_hidden (float): dropout probability for the hidden layer
            batch_size (int): mini-batch size
            use_cuda (bool): whether you want to use cuda or not
            time_sort (bool): whether to ensure the order of sessions is chronological (default: False)
            pretrained (modules.layer.GRU): pretrained GRU layer, if it exists (default: None)
        """

        # Initialize the GRU Layer
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.use_cuda = use_cuda
        if pretrained is None:
            self.gru = GRU(input_size,
                           hidden_size,
                           output_size,
                           num_layers,
                           dropout_input=dropout_input,
                           dropout_hidden=dropout_hidden,
                           use_cuda=use_cuda,
                           batch_size=batch_size)
        else:
            self.gru = pretrained

        # Initialize the optimizer
        self.optimizer_type = optimizer_type
        self.weight_decay = weight_decay
        self.momentum = momentum
        self.lr = lr
        self.eps = eps
        self.optimizer = Optimizer(self.gru.parameters(),
                                   optimizer_type=optimizer_type,
                                   lr=lr,
                                   weight_decay=weight_decay,
                                   momentum=momentum,
                                   eps=eps)

        # Initialize the loss function
        self.loss_type = loss_type
        self.loss_fn = LossFunction(loss_type, use_cuda)

        # gradient clipping(optional)
        self.clip_grad = clip_grad

        # etc
        self.time_sort = time_sort
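
Note: the docstring above says clip_grad "clips the gradient norm", but the training loops in these examples apply p.grad.data.clamp_(max=clip_grad), which only caps individual gradient entries from above. For contrast, a minimal sketch of true norm-based clipping in PyTorch (toy module, not the repository's GRU):

import torch
import torch.nn as nn

# toy module standing in for self.gru; the point is the clipping call
layer = nn.Linear(4, 3)
loss = layer(torch.randn(8, 4)).sum()
loss.backward()
# rescale all gradients so that their combined L2 norm is at most 1.0
torch.nn.utils.clip_grad_norm_(layer.parameters(), max_norm=1.0)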
Example #2
def main():
    parser = argparse.ArgumentParser()

    # Model filename
    parser.add_argument('model_file', type=str)

    # Size of the recommendation list
    parser.add_argument('--k', default=20, type=int)

    # parse the nn arguments

    parser.add_argument('--hidden_size', default=100, type=int)
    parser.add_argument('--num_layers', default=1, type=int)
    parser.add_argument('--batch_size', default=50, type=int)
    parser.add_argument('--dropout_input', default=0, type=float)
    parser.add_argument('--dropout_hidden', default=.5, type=float)

    # parse the optimizer arguments
    parser.add_argument('--optimizer_type', default='Adagrad', type=str)
    parser.add_argument('--lr', default=.01, type=float)
    parser.add_argument('--weight_decay', default=0, type=float)
    parser.add_argument('--momentum', default=0, type=float)
    parser.add_argument('--eps', default=1e-6, type=float)

    # parse the loss type
    parser.add_argument('--loss_type', default='TOP1', type=str)

    # etc
    parser.add_argument('--n_epochs', default=10, type=int)
    parser.add_argument('--time_sort', default=False, type=bool)
    parser.add_argument('--n_samples', default=-1, type=int)

    # Get the arguments
    args = parser.parse_args()

    PATH_DATA = Path('./data')
    PATH_MODEL = Path('./models')
    train = 'train.tsv'
    test = 'test.tsv'
    PATH_TRAIN = PATH_DATA / train
    PATH_TEST = PATH_DATA / test

    df_train = pd.read_csv(PATH_TRAIN,
                           sep='\t',
                           names=['SessionId', 'ItemId', 'TimeStamp'])
    df_test = pd.read_csv(PATH_TEST,
                          sep='\t',
                          names=['SessionId', 'ItemId', 'TimeStamp'])

    # sampling, if needed
    n_samples = args.n_samples
    if n_samples != -1:
        df_train = df_train[:n_samples]
        df_test = df_test[:n_samples]

    session_key = 'SessionId'
    item_key = 'ItemId'
    time_key = 'TimeStamp'

    use_cuda = True
    input_size = df_train[item_key].nunique()
    hidden_size = args.hidden_size
    num_layers = args.num_layers
    output_size = input_size
    batch_size = args.batch_size
    dropout_input = args.dropout_input
    dropout_hidden = args.dropout_hidden

    loss_type = args.loss_type

    optimizer_type = args.optimizer_type
    lr = args.lr
    weight_decay = args.weight_decay
    momentum = args.momentum
    eps = args.eps

    n_epochs = args.n_epochs
    time_sort = args.time_sort

    MODEL_FILE = PATH_MODEL / args.model_file

    gru = GRU(input_size,
              hidden_size,
              output_size,
              num_layers=num_layers,
              dropout_input=dropout_input,
              dropout_hidden=dropout_hidden,
              batch_size=batch_size,
              use_cuda=use_cuda)

    gru.load_state_dict(torch.load(MODEL_FILE))

    model = GRU4REC(input_size,
                    hidden_size,
                    output_size,
                    num_layers=num_layers,
                    dropout_input=dropout_input,
                    dropout_hidden=dropout_hidden,
                    batch_size=batch_size,
                    use_cuda=use_cuda,
                    loss_type=loss_type,
                    optimizer_type=optimizer_type,
                    lr=lr,
                    momentum=momentum,
                    time_sort=time_sort,
                    pretrained=gru)

    model.init_data(df_train,
                    df_test,
                    session_key=session_key,
                    time_key=time_key,
                    item_key=item_key)

    k = args.k
    recall, mrr = model.test(k=k, batch_size=batch_size)
    result = f'Recall@{k}:{recall:.7f},MRR@{k}:{mrr:.7f}'
    print(result)
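
A caveat about the parser above (standard argparse behavior, not specific to this project): --time_sort is declared with type=bool, so any non-empty string passed on the command line, including "False", is parsed as True. A small sketch of the pitfall and the usual store_true workaround (argument names here are illustrative):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--time_sort_typed', default=False, type=bool)       # pitfall: bool('False') is True
parser.add_argument('--time_sort', default=False, action='store_true')   # usual workaround: a flag

args = parser.parse_args(['--time_sort_typed', 'False'])
print(args.time_sort_typed)  # True - any non-empty string is truthy
print(args.time_sort)        # False - the flag was not passed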
Example #3
class GRU4REC:
    def __init__(self, input_size, hidden_size, output_size, num_layers=1,
                 optimizer_type='Adagrad', lr=.01, weight_decay=0,
                 momentum=0, eps=1e-6, loss_type='TOP1',
                 clip_grad=-1, p_dropout_input=.0, p_dropout_hidden=.5,
                 batch_size=50, use_cuda=True, time_sort=False, pretrained=None):
        """ The GRU4REC model

        Args:
            input_size (int): dimension of the gru input variables
            hidden_size (int): dimension of the gru hidden units
            output_size (int): dimension of the gru output variables
            num_layers (int): the number of layers in the GRU
            optimizer_type (str): optimizer type for GRU weights
            lr (float): learning rate for the optimizer
            weight_decay (float): weight decay for the optimizer
            momentum (float): momentum for the optimizer
            eps (float): eps for the optimizer
            loss_type (str): type of the loss function to use
            clip_grad (float): clip the gradient norm at clip_grad. No clipping if clip_grad = -1
            p_dropout_input (float): dropout probability for the input layer
            p_dropout_hidden (float): dropout probability for the hidden layer
            batch_size (int): mini-batch size
            use_cuda (bool): whether you want to use cuda or not
            time_sort (bool): whether to ensure the order of sessions is chronological (default: False)
            pretrained (modules.layer.GRU): pretrained GRU layer, if it exists (default: None)
        """
        
        # Initialize the GRU Layer
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.use_cuda = use_cuda
        self.device = torch.device('cuda' if use_cuda else 'cpu')
        if pretrained is None:
            self.gru = GRU(input_size, hidden_size, output_size, num_layers,
                           p_dropout_input=p_dropout_input,
                           p_dropout_hidden=p_dropout_hidden,
                           batch_size=batch_size,
                           use_cuda=use_cuda)
        else:
            self.gru = pretrained

        # Initialize the optimizer
        self.optimizer_type = optimizer_type
        self.weight_decay = weight_decay
        self.momentum = momentum
        self.lr = lr
        self.eps = eps
        self.optimizer = Optimizer(self.gru.parameters(),
                                   optimizer_type=optimizer_type,
                                   lr=lr,
                                   weight_decay=weight_decay,
                                   momentum=momentum,
                                   eps=eps)

        # Initialize the loss function
        self.loss_type = loss_type
        self.loss_fn = LossFunction(loss_type)

        # gradient clipping(optional)
        self.clip_grad = clip_grad 

        # etc
        self.time_sort = time_sort
        
        
    def run_epoch(self, dataset, k=20, training=True):
        """ Run a single training epoch """
        start_time = time.time()
        # initialize
        losses = []
        recalls = []
        mrrs = []
        optimizer = self.optimizer
        hidden = self.gru.init_hidden()
        if not training:
            self.gru.eval()
        device = self.device
        
        def reset_hidden(hidden, mask):
            """Helper function that resets hidden state when some sessions terminate"""
            if len(mask) != 0:
                hidden[:, mask, :] = 0
            
            return hidden

        # Start the training loop
        loader = SessionDataLoader(dataset, batch_size=self.batch_size)
        for input, target, mask in loader:
            input = input.to(device)
            target = target.to(device)
            # reset the hidden states if some sessions have just terminated
            hidden = reset_hidden(hidden, mask).detach()
            # Go through the GRU layer
            logit, hidden = self.gru(input, target, hidden)
            # Output sampling
            logit_sampled = logit[:, target.view(-1)]
            # Calculate the mini-batch loss
            loss = self.loss_fn(logit_sampled)
            with torch.no_grad():
                recall, mrr = E.evaluate(logit, target, k)
            losses.append(loss.item())         
            recalls.append(recall)
            mrrs.append(mrr)
            # Mini-batch GD
            if training:
                # Backprop
                loss.backward()
                # Gradient clipping (optional): clamp gradient entries at clip_grad
                if self.clip_grad != -1:
                    for p in self.gru.parameters():
                        p.grad.data.clamp_(max=self.clip_grad)
                optimizer.step()
                optimizer.zero_grad()  # flush the gradient after the optimization

        results = dict()
        results['loss'] = np.mean(losses)
        results['recall'] = np.mean(recalls)
        results['mrr'] = np.mean(mrrs)
        
        end_time = time.time()
        results['time'] = (end_time - start_time) / 60
        
        if not training:
            self.gru.train()

        return results
    
    
    def train(self, dataset, k=20, n_epochs=10, save_dir='./models', save=True, model_name='GRU4REC'):
        """
        Train the GRU4REC model on a pandas dataframe for several training epochs,
        and store the intermediate models to the user-specified directory.

        Args:
            n_epochs (int): the number of training epochs to run
            save_dir (str): the path to save the intermediate trained models
            model_name (str): name of the model
        """
        print(f'Training {model_name}...')
        results = self.run_epoch(dataset, k=k, training=True)
        results = [f'{name}:{value:.3f}' for name, value in results.items()]
        best_recall = float(results[1].split(':')[1])  # Recall@k of the first epoch
        print(f'epoch:{1:2d}/{"/".join(results)}')
        for epoch in range(n_epochs - 1):
            results = self.run_epoch(dataset, k=k, training=True)
            results = [f'{name}:{value:.3f}' for name, value in results.items()]
            recall = float(results[1].split(':')[1])
            # early stopping: keep training only while Recall@k improves
            if recall > best_recall:
                best_recall = recall
                print(f'epoch:{epoch+2:2d}/{"/".join(results)}')
            else:
                print('The end of training~')
                break

            # Store the intermediate model
            if save:
                save_dir = Path(save_dir)
                if not save_dir.exists(): save_dir.mkdir()
                model_fname = f'{model_name}_{self.loss_type}_{self.optimizer_type}_{self.lr}_epoch{epoch+2:d}'
                torch.save(self.gru.state_dict(), save_dir/model_fname)
    

    def test(self, dataset, k=20):
        """ Model evaluation

        Args:
            k (int): the length of the recommendation list

        Returns:
            avg_loss: mean of the losses over the session-parallel minibatches
            avg_recall: mean of the Recall@K over the session-parallel mini-batches
            avg_mrr: mean of the MRR@K over the session-parallel mini-batches
            wall_clock: time taken for testing
        """
        results = self.run_epoch(dataset, k=k, training=False)
        results = [f'{name}:{value:.3f}' for name, value in results.items()]
        print(f'Test result: {"/".join(results)}')
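
The reset_hidden helper in run_epoch zeroes only the hidden-state slots of sessions that just ended, so the next session reusing that mini-batch slot starts from a clean state while the other sessions keep theirs. A self-contained sketch of the same idea (shapes are illustrative, not taken from the repository):

import torch

num_layers, batch_size, hidden_size = 1, 5, 4
hidden = torch.randn(num_layers, batch_size, hidden_size)

mask = [1, 3]               # suppose the sessions in slots 1 and 3 just terminated
if len(mask) != 0:
    hidden[:, mask, :] = 0  # reset only those slots

print(hidden[0, 1].abs().sum().item())  # 0.0 - slot 1 was cleared
print(hidden[0, 0].abs().sum().item())  # > 0 - slot 0 keeps its state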
Example #4
class GRU4REC:
    def __init__(self,
                 input_size,
                 hidden_size,
                 output_size,
                 num_layers=1,
                 optimizer_type='Adagrad',
                 lr=.01,
                 weight_decay=0,
                 momentum=0,
                 eps=1e-6,
                 loss_type='TOP1',
                 clip_grad=-1,
                 dropout_input=.0,
                 dropout_hidden=.5,
                 batch_size=50,
                 use_cuda=True,
                 time_sort=False,
                 pretrained=None):
        """ The GRU4REC model

        Args:
            input_size (int): dimension of the gru input variables
            hidden_size (int): dimension of the gru hidden units
            output_size (int): dimension of the gru output variables
            num_layers (int): the number of layers in the GRU
            optimizer_type (str): optimizer type for GRU weights
            lr (float): learning rate for the optimizer
            weight_decay (float): weight decay for the optimizer
            momentum (float): momentum for the optimizer
            eps (float): eps for the optimizer
            loss_type (str): type of the loss function to use
            clip_grad (float): clip the gradient norm at clip_grad. No clipping if clip_grad = -1
            dropout_input (float): dropout probability for the input layer
            dropout_hidden (float): dropout probability for the hidden layer
            batch_size (int): mini-batch size
            use_cuda (bool): whether you want to use cuda or not
            time_sort (bool): whether to ensure the order of sessions is chronological (default: False)
            pretrained (modules.layer.GRU): pretrained GRU layer, if it exists (default: None)
        """

        # Initialize the GRU Layer
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.use_cuda = use_cuda
        if pretrained is None:
            self.gru = GRU(input_size,
                           hidden_size,
                           output_size,
                           num_layers,
                           dropout_input=dropout_input,
                           dropout_hidden=dropout_hidden,
                           use_cuda=use_cuda,
                           batch_size=batch_size)
        else:
            self.gru = pretrained

        # Initialize the optimizer
        self.optimizer_type = optimizer_type
        self.weight_decay = weight_decay
        self.momentum = momentum
        self.lr = lr
        self.eps = eps
        self.optimizer = Optimizer(self.gru.parameters(),
                                   optimizer_type=optimizer_type,
                                   lr=lr,
                                   weight_decay=weight_decay,
                                   momentum=momentum,
                                   eps=eps)

        # Initialize the loss function
        self.loss_type = loss_type
        self.loss_fn = LossFunction(loss_type, use_cuda)

        # gradient clipping(optional)
        self.clip_grad = clip_grad

        # etc
        self.time_sort = time_sort

    def train(self, n_epochs=10, save_dir='./models', model_name='GRU4REC'):
        """
        Train the GRU4REC model on a pandas dataframe for several training epochs,
        and store the intermediate models to the user-specified directory.

        Args:
            n_epochs (int): the number of training epochs to run
            save_dir (str): the path to save the intermediate trained models
            model_name (str): name of the model
        """
        print(f'Model Name:{model_name}')
        # Time the training process
        start_time = time.time()
        for epoch in range(n_epochs):
            loss = self.run_epoch()
            end_time = time.time()
            wall_clock = (end_time - start_time) / 60
            print(
                f'Epoch:{epoch+1:2d}/Loss:{loss:0.3f}/TrainingTime:{wall_clock:0.3f}(min)'
            )
            start_time = time.time()

            # Store the intermediate model
            save_dir = Path(save_dir)
            if not save_dir.exists(): save_dir.mkdir()

            model_fname = f'{model_name}_{self.loss_type}_{self.optimizer_type}_{self.lr}_epoch{epoch+1:d}'
            torch.save(self.gru.state_dict(), save_dir / model_fname)

    def run_epoch(self):
        """ Run a single training epoch """
        self.gru.train()
        # initialize
        mb_losses = []
        optimizer = self.optimizer
        hidden = self.gru.init_hidden().data

        # Start the training loop
        loader = SessionDataLoader(df=self.df_train,
                                   hidden=hidden,
                                   session_key=self.session_key,
                                   item_key=self.item_key,
                                   time_key=self.time_key,
                                   batch_size=self.batch_size,
                                   training=self.gru.training,
                                   time_sort=self.time_sort)

        for input, target, hidden in loader.generate_batch():
            if self.use_cuda:
                input = input.cuda()
                target = target.cuda()
            # Go through the GRU layer
            logit, hidden = self.gru(input, target, hidden)
            ######################## IMPORTANT  #########################
            # update the hidden state for the dataloader from the outside
            #############################################################
            loader.update_hidden(hidden.data)
            # Calculate the mini-batch loss
            mb_loss = self.loss_fn(logit)
            mb_losses.append(mb_loss.item())  # .item() replaces the deprecated .data[0]
            # flush the gradient b/f backprop
            optimizer.zero_grad()
            # Backprop
            mb_loss.backward()
            # Gradient Clipping(Optional)
            if self.clip_grad != -1:
                for p in self.gru.parameters():
                    p.grad.data.clamp_(max=self.clip_grad)
            # Mini-batch GD
            optimizer.step()

        avg_epoch_loss = np.mean(mb_losses)

        return avg_epoch_loss

    def test(self, k=20, batch_size=50):
        """ Model evaluation

        Args:
            k (int): the length of the recommendation list
            batch_size (int): testing batch_size

        Returns:
            avg_recall: mean of the Recall@K over the session-parallel mini-batches
            avg_mrr: mean of the MRR@K over the session-parallel mini-batches
        """
        # set the gru layer into inference mode
        self.gru.eval()

        recalls = []
        mrrs = []
        hidden = self.gru.init_hidden().data

        # Start the testing loop
        loader = SessionDataLoader(df=self.df_test,
                                   hidden=hidden,
                                   session_key=self.session_key,
                                   item_key=self.item_key,
                                   time_key=self.time_key,
                                   batch_size=batch_size,
                                   training=self.gru.training,
                                   time_sort=self.time_sort)

        for input, target, hidden in loader.generate_batch():
            if self.use_cuda:
                input = input.cuda()
                target = target.cuda()
            # forward propagation
            logit, hidden = self.gru(input, target, hidden)
            # update the hidden state for the dataloader
            loader.update_hidden(hidden.data)
            # Evaluate the results
            recall, mrr = E.evaluate(logit, target, k)
            recalls.append(recall)
            mrrs.append(mrr)

        avg_recall = np.mean(recalls)
        avg_mrr = np.mean(mrrs)

        # reset the gru to a training mode
        self.gru.train()

        return avg_recall, avg_mrr

    def init_data(self, df_train, df_test, session_key, time_key, item_key):
        """ Initialize the training & test data.

        The training/test set, session/time/item keys will be stored for later reuse.

        Args:
            df_train (pd.DataFrame): training set required to retrieve the training item indices.
            df_test (pd.DataFrame): test set
            session_key (str): session ID
            time_key (str): time ID
            item_key (str): item ID
        """

        # Specify the identifiers
        self.session_key = session_key
        self.time_key = time_key
        self.item_key = item_key

        # Initialize the dataframes into adequate forms
        self.df_train = self.init_df(df_train, session_key, time_key, item_key)
        self.df_test = self.init_df(df_test,
                                    session_key,
                                    time_key,
                                    item_key,
                                    item_ids=df_train[item_key].unique())

    @staticmethod
    def init_df(df, session_key, time_key, item_key, item_ids=None):
        """ Initialize the dataframe.

        Involves the following steps:
            1) Add new item indices to the dataframe
            2) Sort the df

        Args:
            session_key: session identifier
            time_key: timestamp
            item_key: item identifier
            item_ids: unique item ids. Should be `None` if the df is a training set, and should include the
                  ids for the items included in the training set if the df is a test set.
        """

        # add item index column named "item_idx" to the df
        if item_ids is None:
            item_ids = df[item_key].unique()  # unique item ids
        item2idx = pd.Series(data=np.arange(len(item_ids)), index=item_ids)
        df = pd.merge(df,
                      pd.DataFrame({
                          item_key: item_ids,
                          'item_idx': item2idx[item_ids].values
                      }),
                      on=item_key,
                      how='inner')
        """
        Sort the df by time, and then by session ID. That is, df is sorted by session ID and
        clicks within a session are next to each other, where the clicks within a session are time-ordered.
        """
        df.sort_values([session_key, time_key], inplace=True)

        return df
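
init_df above maps raw item IDs to contiguous indices with a pandas merge and then sorts clicks by session and time. A toy, hedged illustration of that mapping step using the same column names (the rows are made up):

import numpy as np
import pandas as pd

df = pd.DataFrame({'SessionId': [1, 1, 2, 2],
                   'ItemId':    [501, 42, 42, 777],
                   'TimeStamp': [10, 11, 12, 13]})

item_ids = df['ItemId'].unique()                               # training vocabulary
item2idx = pd.Series(np.arange(len(item_ids)), index=item_ids)
df = pd.merge(df,
              pd.DataFrame({'ItemId': item_ids,
                            'item_idx': item2idx[item_ids].values}),
              on='ItemId', how='inner')
df.sort_values(['SessionId', 'TimeStamp'], inplace=True)
print(df)  # each ItemId now carries a dense item_idx used by the embedding/output layers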
Example #5
eps = 1e-6

loss_type = 'CrossEntropy'
lr = 0.01

dropout_hidden = 0
dropout_input = 0
batch_size = 50
momentum = 0

gru = GRU(input_size,
          if_embedding,
          embedding_size,
          hidden_size,
          output_size,
          num_layers=num_layers,
          dropout_input=dropout_input,
          dropout_hidden=dropout_hidden,
          batch_size=batch_size,
          use_cuda=use_cuda,
          cuda_id=cuda_id)

print(loss_type + ':')
for i in range(1, 21):
    model_name = 'GRU4REC_CrossEntropy_Adagrad_0.01_epoch%d' % i
    print(model_name)
    model_file = r'./models/' + model_name
    gru.load_state_dict(torch.load(model_file))

    model = GRU4REC(input_size,
                    if_embedding,
Example #6
class GRU4REC:
    def __init__(self,
                 input_size,
                 hidden_size,
                 output_size,
                 num_layers=1,
                 optimizer_type='Adagrad',
                 lr=.01,
                 weight_decay=0,
                 momentum=0,
                 eps=1e-6,
                 loss_type='TOP1',
                 clip_grad=-1,
                 dropout_input=.0,
                 dropout_hidden=.5,
                 batch_size=50,
                 use_cuda=True,
                 time_sort=False,
                 pretrained=None,
                 n_sample=2048,
                 sample_alpha=0.75,
                 sample_store=10000000,
                 bpreg=1.0):
        """ The GRU4REC model

        Args:
            input_size (int): dimension of the gru input variables
            hidden_size (int): dimension of the gru hidden units
            output_size (int): dimension of the gru output variables
            num_layers (int): the number of layers in the GRU
            optimizer_type (str): optimizer type for GRU weights
            lr (float): learning rate for the optimizer
            weight_decay (float): weight decay for the optimizer
            momentum (float): momentum for the optimizer
            eps (float): eps for the optimizer
            loss_type (str): type of the loss function to use
            clip_grad (float): clip the gradient norm at clip_grad. No clipping if clip_grad = -1
            dropout_input (float): dropout probability for the input layer
            dropout_hidden (float): dropout probability for the hidden layer
            batch_size (int): mini-batch size
            use_cuda (bool): whether you want to use cuda or not
            time_sort (bool): whether to ensure the order of sessions is chronological (default: False)
            pretrained (modules.layer.GRU): pretrained GRU layer, if it exists (default: None)
            n_sample (int): number of extra negative samples shared by the mini-batch (0 disables extra sampling)
            sample_alpha (float): popularity exponent for negative sampling (1: popularity-based, 0: uniform)
            sample_store (int): size of the pre-generated negative-sample cache
            bpreg (float): regularization coefficient for the BPR-max loss
        """

        # Initialize the GRU Layer
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.use_cuda = use_cuda
        self.device = torch.device('cuda' if use_cuda else 'cpu')
        ### modified: extra negative-sampling hyperparameters ###
        self.n_sample = n_sample
        self.sample_alpha = sample_alpha
        self.sample_store = sample_store
        self.bpreg = bpreg
        ### modified ###
        if pretrained is None:
            self.gru = GRU(input_size,
                           hidden_size,
                           output_size,
                           num_layers,
                           dropout_input=dropout_input,
                           dropout_hidden=dropout_hidden,
                           batch_size=batch_size,
                           use_cuda=use_cuda)
        else:
            self.gru = pretrained

        # Initialize the optimizer
        self.optimizer_type = optimizer_type
        self.weight_decay = weight_decay
        self.momentum = momentum
        self.lr = lr
        self.eps = eps
        self.optimizer = Optimizer(self.gru.parameters(),
                                   optimizer_type=optimizer_type,
                                   lr=lr,
                                   weight_decay=weight_decay,
                                   momentum=momentum,
                                   eps=eps)

        # Initialize the loss function
        self.loss_type = loss_type
        self.loss_fn = LossFunction(loss_type, use_cuda)

        # gradient clipping(optional)
        self.clip_grad = clip_grad

        # etc
        self.time_sort = time_sort

    def generate_neg_samples(self, n_items, pop, length):
        if self.sample_alpha:
            sample = np.searchsorted(pop,
                                     np.random.rand(self.n_sample * length))
        else:
            # draw a random sample of size n_sample * length from np.arange(n_items)
            sample = np.random.choice(n_items, size=self.n_sample * length)
        if length > 1:
            sample = sample.reshape((length, self.n_sample))
        return sample

    def run_epoch(self, dataset, k=20, training=True):
        """ Run a single training epoch """
        start_time = time.time()

        # initialize
        losses = []
        recalls = []
        mrrs = []
        ### added ###
        zippers = []
        ### added ###
        optimizer = self.optimizer
        hidden = self.gru.init_hidden()
        if not training:
            self.gru.eval()
        device = self.device

        def reset_hidden(hidden, mask):
            """Helper function that resets hidden state when some sessions terminate"""
            if len(mask) != 0:
                hidden[:, mask, :] = 0

            return hidden

        # Start the training loop
        loader = SessionDataLoader(dataset, batch_size=self.batch_size)
        # iterate batch by batch; each iteration yields an input, e.g. tensor([31, 26, 27, 29, 24]),
        # and a target, e.g. tensor([31, 26, 28, 17, 24])
        if training:
            n_items = len(dataset.items)
            # sampling: draw extra negative samples
            if self.n_sample > 0:
                pop = dataset.df.groupby(
                    'ItemId').size()  # item popularity (support), e.g.:
                # ItemId
                # 214507331  1
                # 214507365  1
                # sample_alpha = 1 gives popularity-based sampling, 0 gives uniform sampling
                pop = pop[
                    dataset.itemmap[dataset.item_key].
                    values].values**self.sample_alpha  # an item is sampled with probability supp ** sample_alpha
                pop = pop.cumsum() / pop.sum()
                pop[-1] = 1
                if self.sample_store:
                    generate_length = self.sample_store // self.n_sample
                    if generate_length <= 1:
                        sample_store = 0
                        print('No example store was used')
                    else:
                        neg_samples = self.generate_neg_samples(
                            n_items, pop, generate_length)
                        sample_pointer = 0
                else:
                    print('No example store was used')

        for input, target, mask in loader:
            input = input.to(device)
            target = target.to(device)
            # extra output sampling: append the shared negative samples to the targets
            if self.n_sample > 0 and training:
                if self.sample_store:
                    if sample_pointer == generate_length:
                        neg_samples = self.generate_neg_samples(
                            n_items, pop, generate_length)
                        sample_pointer = 0
                    sample = neg_samples[sample_pointer]
                    sample_pointer += 1
                else:
                    sample = self.generate_neg_samples(n_items, pop, 1)
                # move targets to the CPU before stacking them with the numpy samples
                y = torch.LongTensor(np.hstack([target.cpu().numpy(), sample]))
            else:
                y = target  # no extra sampling
            # reset the hidden states if some sessions have just terminated
            hidden = reset_hidden(hidden, mask).detach()
            # Go through the GRU layer
            logit, hidden = self.gru(input, target, hidden)
            # Output sampling: keep only the logit columns for the sampled items
            y = y.to(device)
            logit_sampled = logit[:, y]
            # Calculate the mini-batch loss
            loss = self.loss_fn(logit_sampled)
            with torch.no_grad():
                recall, mrr = E.evaluate(logit, target, k)
            losses.append(loss.item())
            recalls.append(recall)
            mrrs.append(mrr)
            # Mini-batch GD
            if training:
                # Backprop
                loss.backward()
                # Gradient clipping (optional): clamp gradient entries at clip_grad
                if self.clip_grad != -1:
                    for p in self.gru.parameters():
                        p.grad.data.clamp_(max=self.clip_grad)
                optimizer.step()
                optimizer.zero_grad()  # flush the gradient after the optimization

        results = dict()
        results['loss'] = np.mean(losses)
        results['recall'] = np.mean(recalls)
        results['mrr'] = np.mean(mrrs)

        end_time = time.time()
        results['time'] = (end_time - start_time) / 60

        if not training:
            self.gru.train()

        return results

    def train(self,
              dataset,
              k=20,
              n_epochs=10,
              save_dir='./models',
              save=True,
              model_name='GRU4REC'):
        """
        Train the GRU4REC model on a pandas dataframe for several training epochs,
        and store the intermediate models to the user-specified directory.

        Args:
            n_epochs (int): the number of training epochs to run
            save_dir (str): the path to save the intermediate trained models
            model_name (str): name of the model
        """
        print(f'Training {model_name}...')

        for epoch in range(n_epochs):
            results = self.run_epoch(dataset, k=k, training=True)
            results = [f'{name}:{value:.3f}' for name, value in results.items()]
            print(f'epoch:{epoch+1:2d}/{"/".join(results)}')

            # Store the intermediate model
            if save:
                save_dir = Path(save_dir)
                if not save_dir.exists(): save_dir.mkdir()
                model_fname = f'{model_name}_{self.loss_type}_{self.optimizer_type}_{self.lr}_epoch{epoch+1:d}'
                torch.save(self.gru.state_dict(), save_dir / model_fname)

    def test(self, dataset, k=20):
        """ Model evaluation

        Args:
            k (int): the length of the recommendation list

        Returns:
            avg_loss: mean of the losses over the session-parallel minibatches
            avg_recall: mean of the Recall@K over the session-parallel mini-batches
            avg_mrr: mean of the MRR@K over the session-parallel mini-batches
            wall_clock: time taken for testing
        """
        results = self.run_epoch(dataset, k=k, training=False)
        results = [f'{name}:{value:.3f}' for name, value in results.items()]
        print(f'Test result: {"/".join(results)}')
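
The extra negative sampling in this example picks items with probability proportional to supp ** sample_alpha by building a cumulative distribution and looking up uniform draws with np.searchsorted. A standalone sketch with toy popularities (not the repository's data):

import numpy as np

supp = np.array([100.0, 10.0, 1.0])   # item popularities (click counts)
sample_alpha = 0.75

pop = supp ** sample_alpha
pop = pop.cumsum() / pop.sum()
pop[-1] = 1.0                          # guard against floating-point rounding

# each uniform draw lands in one item's bucket; wider buckets are hit more often
samples = np.searchsorted(pop, np.random.rand(10000))
print(np.bincount(samples, minlength=len(supp)) / 10000)  # roughly proportional to the supp ** 0.75 weights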
Example #7
class GRU4REC:
    def __init__(self, input_size, hidden_size, output_size, num_layers=1,
                 optimizer_type='Adagrad', lr=.05, weight_decay=0,
                 momentum=0, eps=1e-6, loss_type='TOP1',
                 clip_grad=-1, dropout_input=.0, dropout_hidden=.5,
                 batch_size=50, use_cuda=True, time_sort=False, pretrained=None):

        """ The GRU4REC model

        Args:
            input_size (int): dimension of the gru input variables
            hidden_size (int): dimension of the gru hidden units
            output_size (int): dimension of the gru output variables
            num_layers (int): the number of layers in the GRU
            optimizer_type (str): optimizer type for GRU weights
            lr (float): learning rate for the optimizer
            weight_decay (float): weight decay for the optimizer
            momentum (float): momentum for the optimizer
            eps (float): eps for the optimizer
            loss_type (str): type of the loss function to use
            clip_grad (float): clip the gradient norm at clip_grad. No clipping if clip_grad = -1
            dropout_input (float): dropout probability for the input layer
            dropout_hidden (float): dropout probability for the hidden layer
            batch_size (int): mini-batch size
            use_cuda (bool): whether you want to use cuda or not
            time_sort (bool): whether to ensure the order of sessions is chronological (default: False)
            pretrained (modules.layer.GRU): pretrained GRU layer, if it exists (default: None)
        """

        # Initialize the GRU Layer
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.use_cuda = use_cuda
        if pretrained is None:
            self.gru = GRU(input_size, hidden_size, output_size, num_layers,
                           dropout_input=dropout_input,
                           dropout_hidden=dropout_hidden,
                           use_cuda=use_cuda,
                           batch_size=batch_size)
        else:
            self.gru = pretrained

        # Initialize the optimizer
        self.optimizer_type = optimizer_type
        self.weight_decay = weight_decay
        self.momentum = momentum
        self.lr = lr
        self.eps = eps
        self.optimizer = Optimizer(self.gru.parameters(),
                                   optimizer_type=optimizer_type,
                                   lr=lr,
                                   weight_decay=weight_decay,
                                   momentum=momentum,
                                   eps=eps)

        # Initialize the loss function
        self.loss_type = loss_type
        self.loss_fn = LossFunction(loss_type, use_cuda)

        # gradient clipping(optional)
        self.clip_grad = clip_grad

        # etc
        self.time_sort = time_sort

    def train(self, df, session_key, time_key, item_key, n_epochs=10, save_dir='./models', model_name='GRU4REC'):
        """
        Train the GRU4REC model on a pandas dataframe for several training epochs,
        and store the intermediate models to the user-specified directory.

        Args:
            df (pd.DataFrame): training dataset
            session_key (str): session ID
            time_key (str): time ID
            item_key (str): item ID
            n_epochs (int): the number of training epochs to run
            save_dir (str): the path to save the intermediate trained models
            model_name (str): name of the model
        """
        df, click_offsets, session_idx_arr = GRU4REC.init_data(df, session_key, time_key, item_key,
                                                               time_sort=self.time_sort)
        # Time the training process
        start_time = time.time()
        for epoch in range(n_epochs):
            loss = self.run_epoch(df, click_offsets, session_idx_arr)
            end_time = time.time()
            wall_clock = (end_time - start_time) / 60
            print(f'Epoch:{epoch+1:2d}/Loss:{loss:0.3f}/TrainingTime:{wall_clock:0.3f}(min)')
            start_time = time.time()

            # Store the intermediate model
            save_dir = Path(save_dir)
            if not save_dir.exists(): save_dir.mkdir()

            model_fname = f'{model_name}_{self.loss_type}_{self.optimizer_type}_{self.lr}_epoch{epoch+1:d}'
            torch.save(self.gru.state_dict(), save_dir/model_fname)

    def run_epoch(self, df, click_offsets, session_idx_arr):
        """ Runs a single training epoch """
        mb_losses = []
        # initializations
        iters = np.arange(self.batch_size)
        maxiter = iters.max()
        start = click_offsets[session_idx_arr[iters]]
        end = click_offsets[session_idx_arr[iters]+1]
        # initialize the hidden state
        hidden = self.gru.init_hidden().data

        optimizer = self.optimizer

        # Start the training loop
        finished = False
        while not finished:
            minlen = (end-start).min()
            # Item indices(for embedding) for clicks where the first sessions start
            idx_target = df.item_idx.values[start]
            for i in range(minlen - 1):
                # Build inputs, targets, and hidden states
                idx_input = idx_target
                idx_target = df.item_idx.values[start + i + 1]
                input = torch.LongTensor(idx_input) #(B) At first, input is a Tensor
                target = Variable(torch.LongTensor(idx_target)) #(B)
                if self.use_cuda:
                    input = input.cuda()
                    target = target.cuda()
                # Now, convert into an embedded Variable
                embedded = self.gru.emb(input)
                hidden = Variable(hidden)

                # Go through the GRU layer
                logit, hidden = self.gru(embedded, target, hidden)

                # Calculate the mini-batch loss
                mb_loss = self.loss_fn(logit)
                mb_losses.append(mb_loss.data[0])

                # flush the gradient b/f backprop
                optimizer.zero_grad()

                # Backprop
                mb_loss.backward()

                # Gradient Clipping(Optional)
                if self.clip_grad != -1:
                    for p in self.gru.parameters():
                        p.grad.data.clamp_(max=self.clip_grad)

                # Mini-batch GD
                optimizer.step()

                # Detach the hidden state for later reuse
                hidden = hidden.data

            # click indices where a particular session meets second-to-last element
            start = start + (minlen - 1)
            # figure out how many sessions should terminate
            mask = np.arange(len(iters))[(end-start)<=1]
            for idx in mask:
                maxiter += 1
                if maxiter >= len(click_offsets)-1:
                    finished = True
                    break
                # update the next starting/ending point
                iters[idx] = maxiter
                start[idx] = click_offsets[session_idx_arr[maxiter]]
                end[idx] = click_offsets[session_idx_arr[maxiter]+1]

            # reset the rnn hidden state to zero after transition
            if len(mask) != 0:
                hidden[:, mask, :] = 0

        avg_epoch_loss = np.mean(mb_losses)

        return avg_epoch_loss

    def predict(self, input, target, hidden):
        """ Forward propagation for testing

        Args:
            input (B,C): torch.LongTensor. The one-hot embedding for the item indices
            target (B,): a Variable that stores the indices for the next items
            hidden: previous hidden state

        Returns:
            logits (B,C): logits for the next items
            hidden: next hidden state
        """
        # convert the item indices into embeddings
        embedded = self.gru.emb(input, volatile=True)
        hidden = Variable(hidden, volatile=True)
        # forward propagation
        logits, hidden = self.gru(embedded, target, hidden)

        return logits, hidden

    def test(self, df_train, df_test, session_key, time_key, item_key,
             k=20, batch_size=50):
        """ Model evaluation

        Args:
            df_train (pd.DataFrame): training set required to retrieve the training item indices.
            df_test (pd.DataFrame): test set
            session_key (str): session ID
            time_key (str): time ID
            item_key (str): item ID
            k (int): the length of the recommendation list
            batch_size (int): testing batch_size

        Returns:
            avg_recall: mean of the Recall@K over the session-parallel mini-batches
            avg_mrr: mean of the MRR@K over the session-parallel mini-batches
        """
        # set the gru layer into inference mode
        if self.gru.training:
            self.gru.switch_mode()

        recalls = []
        mrrs = []

        # initializations
        # Build item2idx from train data.
        iids = df_train[item_key].unique() # unique item ids
        item2idx = pd.Series(data=np.arange(len(iids)), index=iids)
        df_test = pd.merge(df_test,
                           pd.DataFrame({item_key: iids,
                                         'item_idx': item2idx[iids].values}),
                           on=item_key,
                           how='inner')
        # Sort the df by time, and then by session ID.
        df_test.sort_values([session_key, time_key], inplace=True)
        # Return the offsets of the beginning clicks of each session IDs
        click_offsets = GRU4REC.get_click_offsets(df_test, session_key)
        session_idx_arr = GRU4REC.order_session_idx(df_test, session_key, time_key, time_sort=self.time_sort)

        iters = np.arange(batch_size)
        maxiter = iters.max()
        start = click_offsets[session_idx_arr[iters]]
        end = click_offsets[session_idx_arr[iters]+1]
        hidden = self.gru.init_hidden().data

        # Start the testing loop
        finished = False
        while not finished:
            minlen = (end-start).min()
            # Item indices(for embedding) for clicks where the first sessions start
            idx_target = df_test.item_idx.values[start]
            for i in range(minlen - 1):
                # Build inputs, targets, and hidden states
                idx_input = idx_target
                idx_target = df_test.item_idx.values[start + i + 1]
                input = torch.LongTensor(idx_input) #(B) At first, input is a Tensor
                target = Variable(torch.LongTensor(idx_target), volatile=True)  # (B)
                if self.use_cuda:
                    input = input.cuda()
                    target = target.cuda()

                logit, hidden = self.predict(input, target, hidden)
                recall, mrr = evaluate(logit, target, k)
                recalls.append(recall)
                mrrs.append(mrr)

                # Detach the hidden state for later reuse
                hidden = hidden.data

            # click indices where a particular session meets second-to-last element
            start = start + (minlen - 1)
            # figure out how many sessions should terminate
            mask = np.arange(len(iters))[(end-start)<=1]
            for idx in mask:
                maxiter += 1
                if maxiter >= len(click_offsets)-1:
                    finished = True
                    break
                # update the next starting/ending point
                iters[idx] = maxiter
                start[idx] = click_offsets[session_idx_arr[maxiter]]
                end[idx] = click_offsets[session_idx_arr[maxiter]+1]

            # reset the rnn hidden state to zero after transition
            if len(mask)!= 0:
                hidden[:,mask,:] = 0

        avg_recall = np.mean(recalls)
        avg_mrr = np.mean(mrrs)

        # reset the gru to a training mode
        self.gru.switch_mode()

        return avg_recall, avg_mrr

    @staticmethod
    def init_data(df, session_key, time_key, item_key, time_sort):
        """
        Initialize the data.
        """

        # add item indices to the dataframe
        df = GRU4REC.add_item_indices(df, item_key)

        """
        Sort the df by time, and then by session ID. That is, df is sorted by session ID and 
        clicks within a session are next to each other, where the clicks within a session are time-ordered.
        """
        df.sort_values([session_key, time_key], inplace=True)

        click_offsets = GRU4REC.get_click_offsets(df, session_key)
        session_idx_arr = GRU4REC.order_session_idx(df, session_key, time_key, time_sort=time_sort)

        return df, click_offsets, session_idx_arr

    @staticmethod
    def add_item_indices(df, item_key):
        """
        Adds an item index column named "item_idx" to the df.

        Args:
            df: pd.DataFrame to add the item indices to

        Returns:
            df: copy of the original df with item indices
        """
        iids = df[item_key].unique() # unique item ids
        item2idx = pd.Series(data=np.arange(len(iids)), index=iids)
        df = pd.merge(df,
                      pd.DataFrame({item_key: iids,
                                    'item_idx': item2idx[iids].values}),
                      on=item_key,
                      how='inner')
        return df

    @staticmethod
    def get_click_offsets(df, session_key):
        """
        Return the offsets of the beginning clicks of each session IDs,
        where the offset is calculated against the first click of the first session ID.
        """

        offsets = np.zeros(df[session_key].nunique()+1, dtype=np.int32)
        # group & sort the df by session_key and get the offset values
        offsets[1:] = df.groupby(session_key).size().cumsum()

        return offsets

    @staticmethod
    def order_session_idx(df, session_key, time_key, time_sort=False):
        """ Order the session indices """

        if time_sort:
            # starting time for each sessions, sorted by session IDs
            sessions_start_time = df.groupby(session_key)[time_key].min().values
            # order the session indices by session starting times
            session_idx_arr = np.argsort(sessions_start_time)
        else:
            session_idx_arr = np.arange(df[session_key].nunique())

        return session_idx_arr
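
get_click_offsets and order_session_idx drive the session-parallel iteration: click_offsets[i] is the row where the i-th session starts (and click_offsets[i+1] where it ends), and with time_sort=True sessions are visited in order of their first click. A toy sketch of both helpers under the same column names:

import numpy as np
import pandas as pd

df = pd.DataFrame({'SessionId': [1, 1, 1, 2, 2, 3],
                   'TimeStamp': [5, 6, 7, 1, 2, 9]})

offsets = np.zeros(df['SessionId'].nunique() + 1, dtype=np.int32)
offsets[1:] = df.groupby('SessionId').size().cumsum()
print(offsets)  # [0 3 5 6] - row boundaries of the three sessions

start_times = df.groupby('SessionId')['TimeStamp'].min().values
print(np.argsort(start_times))  # [1 0 2] - session 2 started earliest, so it is visited first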
Example #8
    def __init__(self,
                 input_size,
                 if_embedding,
                 embedding_size,
                 hidden_size,
                 output_size,
                 num_layers=1,
                 optimizer_type='Adagrad',
                 lr=.01,
                 weight_decay=0,
                 momentum=0,
                 eps=1e-6,
                 loss_type='TOP1',
                 clip_grad=-1,
                 dropout_input=.0,
                 dropout_hidden=.5,
                 batch_size=50,
                 use_cuda=True,
                 cuda_id=1,
                 compress=False,
                 time_sort=False,
                 pretrained=None):
        """ The GRU4REC model

        Args:
            input_size (int): dimension of the gru input variables
            hidden_size (int): dimension of the gru hidden units
            output_size (int): dimension of the gru output variables
            num_layers (int): the number of layers in the GRU
            optimizer_type (str): optimizer type for GRU weights
            lr (float): learning rate for the optimizer
            weight_decay (float): weight decay for the optimizer
            momentum (float): momentum for the optimizer
            eps (float): eps for the optimizer
            loss_type (str): type of the loss function to use
            clip_grad (float): clip the gradient norm at clip_grad. No clipping if clip_grad = -1
            dropout_input (float): dropout probability for the input layer
            dropout_hidden (float): dropout probability for the hidden layer
            batch_size (int): mini-batch size
            use_cuda (bool): whether you want to use cuda or not
            time_sort (bool): whether to ensure the order of sessions is chronological (default: False)
            pretrained (modules.layer.GRU): pretrained GRU layer, if it exists (default: None)
            if_embedding (bool): whether to feed the GRU an item embedding instead of one-hot vectors
            embedding_size (int): dimension of the item embedding
            cuda_id (int): index of the CUDA device to run on
            compress: path to a Distiller YAML compression schedule, or False to disable compression
        """

        # Initialize the GRU Layer
        self.input_size = input_size
        self.if_embedding = if_embedding
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.use_cuda = use_cuda
        self.cuda_id = cuda_id
        self.device = torch.device(
            'cuda:%d' % cuda_id if use_cuda else 'cpu'
        )  # must specify cuda_id or it will be torch.cuda.current_device()
        print(self.device)
        if pretrained is None:
            self.gru = GRU(input_size,
                           if_embedding,
                           embedding_size,
                           hidden_size,
                           output_size,
                           num_layers,
                           dropout_input=dropout_input,
                           dropout_hidden=dropout_hidden,
                           batch_size=batch_size,
                           use_cuda=use_cuda,
                           cuda_id=cuda_id)
        else:
            self.gru = pretrained

        # Initialize the optimizer
        self.optimizer_type = optimizer_type
        self.weight_decay = weight_decay
        self.momentum = momentum
        self.lr = lr
        self.eps = eps

        self.compress = compress
        self.compression_scheduler = None
        if self.compress:
            # Create a CompressionScheduler and configure it from a YAML schedule file
            source = self.compress
            self.compression_scheduler = distiller.config.file_config(
                self.gru, None, self.compress)

        self.optimizer = Optimizer(self.gru.parameters(),
                                   optimizer_type=optimizer_type,
                                   lr=lr,
                                   weight_decay=weight_decay,
                                   momentum=momentum,
                                   eps=eps)

        # Initialize the loss function
        self.loss_type = loss_type
        self.loss_fn = LossFunction(loss_type, use_cuda, cuda_id)

        # gradient clipping(optional)
        self.clip_grad = clip_grad

        # etc
        self.time_sort = time_sort
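
As the inline comment notes, torch.device('cuda') without an index resolves to torch.cuda.current_device(), so an explicit cuda_id is needed to pin the model to a particular GPU. A minimal, hedged sketch of that device selection (with a guard for machines that lack the requested GPU):

import torch

cuda_id = 1
use_cuda = torch.cuda.is_available() and torch.cuda.device_count() > cuda_id
device = torch.device('cuda:%d' % cuda_id if use_cuda else 'cpu')
print(device)

x = torch.randn(2, 3).to(device)   # tensors and modules are moved to the chosen GPU the same way
print(x.device)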
Example #9
class GRU4REC:
    def __init__(self,
                 input_size,
                 if_embedding,
                 embedding_size,
                 hidden_size,
                 output_size,
                 num_layers=1,
                 optimizer_type='Adagrad',
                 lr=.01,
                 weight_decay=0,
                 momentum=0,
                 eps=1e-6,
                 loss_type='TOP1',
                 clip_grad=-1,
                 dropout_input=.0,
                 dropout_hidden=.5,
                 batch_size=50,
                 use_cuda=True,
                 cuda_id=1,
                 compress=False,
                 time_sort=False,
                 pretrained=None):
        """ The GRU4REC model

        Args:
            input_size (int): dimension of the gru input variables
            hidden_size (int): dimension of the gru hidden units
            output_size (int): dimension of the gru output variables
            num_layers (int): the number of layers in the GRU
            optimizer_type (str): optimizer type for GRU weights
            lr (float): learning rate for the optimizer
            weight_decay (float): weight decay for the optimizer
            momentum (float): momentum for the optimizer
            eps (float): eps for the optimizer
            loss_type (str): type of the loss function to use
            clip_grad (float): clip the gradient norm at clip_grad. No clipping if clip_grad = -1
            dropout_input (float): dropout probability for the input layer
            dropout_hidden (float): dropout probability for the hidden layer
            batch_size (int): mini-batch size
            use_cuda (bool): whether you want to use cuda or not
            time_sort (bool): whether to ensure the order of sessions is chronological (default: False)
            pretrained (modules.layer.GRU): pretrained GRU layer, if it exists (default: None)
            if_embedding (bool): whether to feed the GRU an item embedding instead of one-hot vectors
            embedding_size (int): dimension of the item embedding
            cuda_id (int): index of the CUDA device to run on
            compress: path to a Distiller YAML compression schedule, or False to disable compression
        """

        # Initialize the GRU Layer
        self.input_size = input_size
        self.if_embedding = if_embedding
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.use_cuda = use_cuda
        self.cuda_id = cuda_id
        self.device = torch.device(
            'cuda:%d' % cuda_id if use_cuda else 'cpu'
        )  # cuda_id must be given explicitly, otherwise torch.cuda.current_device() is used
        print(self.device)
        if pretrained is None:
            self.gru = GRU(input_size,
                           if_embedding,
                           embedding_size,
                           hidden_size,
                           output_size,
                           num_layers,
                           dropout_input=dropout_input,
                           dropout_hidden=dropout_hidden,
                           batch_size=batch_size,
                           use_cuda=use_cuda,
                           cuda_id=cuda_id)
        else:
            self.gru = pretrained

        # Initialize the optimizer
        self.optimizer_type = optimizer_type
        self.weight_decay = weight_decay
        self.momentum = momentum
        self.lr = lr
        self.eps = eps

        self.compress = compress
        self.compression_scheduler = None
        if self.compress:
            # Create a CompressionScheduler and configure it from a YAML schedule file
            self.compression_scheduler = distiller.config.file_config(
                self.gru, None, self.compress)

        self.optimizer = Optimizer(self.gru.parameters(),
                                   optimizer_type=optimizer_type,
                                   lr=lr,
                                   weight_decay=weight_decay,
                                   momentum=momentum,
                                   eps=eps)

        # Initialize the loss function
        self.loss_type = loss_type
        self.loss_fn = LossFunction(loss_type, use_cuda, cuda_id)

        # gradient clipping(optional)
        self.clip_grad = clip_grad

        # etc
        self.time_sort = time_sort

    def run_epoch(self,
                  dataset,
                  k=20,
                  training=True,
                  compression_scheduler=None,
                  epoch=0):
        """ Run a single training epoch """
        start_time = time.time()

        # initialize
        losses = []
        recalls = []
        mrrs = []
        optimizer = self.optimizer
        hidden = self.gru.init_hidden()
        if not training:
            self.gru.eval()
        device = self.device

        def reset_hidden(hidden, mask):
            """Helper function that resets hidden state when some sessions terminate"""
            if len(mask) != 0:
                hidden[:, mask, :] = 0

            return hidden

        # Start the training loop
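        # SessionDataLoader yields session-parallel mini-batches: each slot of a
        # batch follows its own session, and `mask` lists the slots whose session
        # has just ended so their hidden state can be reset before the next step.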
        loader = SessionDataLoader(dataset, batch_size=self.batch_size)
        batch_id = 0
        steps_per_epoch = math.ceil(len(dataset.items) / self.batch_size)

        for input, target, mask in loader:
            batch_id = batch_id + 1
            input = input.to(device)
            target = target.to(device)
            # reset the hidden states if some sessions have just terminated
            hidden = reset_hidden(hidden, mask).detach()
            if compression_scheduler:
                compression_scheduler.on_minibatch_begin(
                    epoch,
                    minibatch_id=batch_id,
                    minibatches_per_epoch=steps_per_epoch)
            # Go through the GRU layer
            logit, hidden = self.gru(input, target, hidden)
            # Output sampling
            logit_sampled = logit[:, target.view(-1)]
            # Calculate the mini-batch loss
            loss = self.loss_fn(logit_sampled)
            with torch.no_grad():
                recall, mrr = E.evaluate(logit, target, k)
            losses.append(loss.item())
            recalls.append(recall)
            mrrs.append(mrr)
            # Mini-batch gradient descent
            if training:
                if compression_scheduler:
                    # Before running the backward phase, allow the scheduler to modify the loss
                    # (e.g. add a regularization loss)
                    loss = compression_scheduler.before_backward_pass(
                        epoch,
                        minibatch_id=batch_id,
                        minibatches_per_epoch=steps_per_epoch,
                        loss=loss,
                        return_loss_components=False)
                # Backprop
                loss.backward()
                # Gradient clipping (optional): clamp the gradients after backprop,
                # before the optimizer step, when they actually exist
                if self.clip_grad != -1:
                    for p in self.gru.parameters():
                        if p.grad is not None:
                            p.grad.data.clamp_(max=self.clip_grad)
                optimizer.step()
                optimizer.zero_grad()  # flush the gradients after the update
            if compression_scheduler:
                compression_scheduler.on_minibatch_end(
                    epoch,
                    minibatch_id=batch_id,
                    minibatches_per_epoch=steps_per_epoch)

        results = dict()
        results['loss'] = np.mean(losses)
        results['recall'] = np.mean(recalls)
        results['mrr'] = np.mean(mrrs)

        end_time = time.time()
        results['time'] = (end_time - start_time) / 60

        if not training:
            self.gru.train()

        return results

    def train(self,
              dataset,
              k=20,
              n_epochs=20,
              save_dir='./models',
              save=True,
              model_name='GRU4REC'):
        """
        Train the GRU4REC model on a pandas dataframe for several training epochs,
        and store the intermediate models to the user-specified directory.

        Args:
            dataset: the training dataset of click sessions
            k (int): the length of the recommendation list used for Recall@K and MRR@K
            n_epochs (int): the number of training epochs to run
            save_dir (str): the path to save the intermediate trained models
            save (bool): whether to save the intermediate models
            model_name (str): name of the model
        """
        print(f'Training {model_name}...')
        for epoch in range(n_epochs):
            if self.compression_scheduler:
                self.compression_scheduler.on_epoch_begin(epoch)
            results = self.run_epoch(
                dataset,
                k=k,
                training=True,
                compression_scheduler=self.compression_scheduler,
                epoch=epoch)
            results = [f'{metric}:{value:.3f}' for metric, value in results.items()]
            print(f'epoch:{epoch+1:2d}/{"/".join(results)}')

            t, total = distiller.weights_sparsity_tbl_summary(
                self.gru, return_total_sparsity=True)
            print("\nParameters:\n" + str(t))
            print('Total sparsity: {:0.2f}\n'.format(total))

            if self.compression_scheduler:
                self.compression_scheduler.on_epoch_end(epoch)

            # Store the intermediate model
            if save:
                save_dir = Path(save_dir)
                if not save_dir.exists(): save_dir.mkdir()
                model_fname = f'{model_name}_{self.loss_type}_{self.optimizer_type}_{self.lr}_epoch{epoch+1:d}'
                torch.save(self.gru.state_dict(), save_dir / model_fname)

    def test(self, dataset, k=20):
        """ Model evaluation

        Args:
            k (int): the length of the recommendation list

        Prints the mean loss, Recall@K and MRR@K over the session-parallel
        mini-batches, along with the wall-clock time taken for testing.
        """
        results = self.run_epoch(dataset, k=k, training=False)
        results = [f'{metric}:{value:.3f}' for metric, value in results.items()]
        print(f'Test result: {"/".join(results)}')
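
A hypothetical end-to-end usage of this example (not taken from the source): it
assumes a session dataset object exposing an `items` attribute, written here as
a SessionDataset loader with placeholder file paths and hyperparameters.

    train_data = SessionDataset('data/train.tsv')   # hypothetical loader
    test_data = SessionDataset('data/test.tsv')

    n_items = len(train_data.items)
    model = GRU4REC(input_size=n_items,
                    if_embedding=True,
                    embedding_size=100,
                    hidden_size=100,
                    output_size=n_items,
                    loss_type='TOP1',
                    use_cuda=True,
                    cuda_id=0,
                    compress=False)          # no compression schedule

    model.train(train_data, k=20, n_epochs=10, save_dir='./models',
                model_name='GRU4REC')
    model.test(test_data, k=20)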