Example #1
    def __init__(self,
                 input_size,
                 hidden_size,
                 output_size,
                 num_layers=1,
                 optimizer_type='Adagrad',
                 lr=.01,
                 weight_decay=0,
                 momentum=0,
                 eps=1e-6,
                 loss_type='TOP1',
                 clip_grad=-1,
                 dropout_input=.0,
                 dropout_hidden=.5,
                 batch_size=50,
                 use_cuda=True,
                 time_sort=False,
                 pretrained=None):
        """ The GRU4REC model

        Args:
            input_size (int): dimension of the gru input variables
            hidden_size (int): dimension of the gru hidden units
            output_size (int): dimension of the gru output variables
            num_layers (int): the number of layers in the GRU
            optimizer_type (str): optimizer type for GRU weights
            lr (float): learning rate for the optimizer
            weight_decay (float): weight decay for the optimizer
            momentum (float): momentum for the optimizer
            eps (float): eps for the optimizer
            loss_type (str): type of the loss function to use
            clip_grad (float): clip the gradient norm at clip_grad. No clipping if clip_grad = -1
            dropout_input (float): dropout probability for the input layer
            dropout_hidden (float): dropout probability for the hidden layer
            batch_size (int): mini-batch size
            use_cuda (bool): whether you want to use cuda or not
            time_sort (bool): whether to ensure the order of sessions is chronological (default: False)
            pretrained (modules.layer.GRU): pretrained GRU layer, if it exists (default: None)
        """

        # Initialize the GRU Layer
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.use_cuda = use_cuda
        if pretrained is None:
            self.gru = GRU(input_size,
                           hidden_size,
                           output_size,
                           num_layers,
                           dropout_input=dropout_input,
                           dropout_hidden=dropout_hidden,
                           use_cuda=use_cuda,
                           batch_size=batch_size)
        else:
            self.gru = pretrained

        # Initialize the optimizer
        self.optimizer_type = optimizer_type
        self.weight_decay = weight_decay
        self.momentum = momentum
        self.lr = lr
        self.eps = eps
        self.optimizer = Optimizer(self.gru.parameters(),
                                   optimizer_type=optimizer_type,
                                   lr=lr,
                                   weight_decay=weight_decay,
                                   momentum=momentum,
                                   eps=eps)

        # Initialize the loss function
        self.loss_type = loss_type
        self.loss_fn = LossFunction(loss_type, use_cuda)

        # gradient clipping(optional)
        self.clip_grad = clip_grad

        # etc
        self.time_sort = time_sort
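
Note: the docstring above says clip_grad "clips the gradient norm", but the training loops in these examples apply p.grad.data.clamp_(max=clip_grad), which only caps individual gradient entries from above. For contrast, a minimal sketch of true norm-based clipping in PyTorch (toy module, not the repository's GRU):

import torch
import torch.nn as nn

# toy module standing in for self.gru; the point is the clipping call
layer = nn.Linear(4, 3)
loss = layer(torch.randn(8, 4)).sum()
loss.backward()
# rescale all gradients so that their combined L2 norm is at most 1.0
torch.nn.utils.clip_grad_norm_(layer.parameters(), max_norm=1.0)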
Example #2
def main():
    parser = argparse.ArgumentParser()

    # Model filename
    parser.add_argument('model_file', type=str)

    # Size of the recommendation list
    parser.add_argument('--k', default=20, type=int)

    # parse the nn arguments

    parser.add_argument('--hidden_size', default=100, type=int)
    parser.add_argument('--num_layers', default=1, type=int)
    parser.add_argument('--batch_size', default=50, type=int)
    parser.add_argument('--dropout_input', default=0, type=float)
    parser.add_argument('--dropout_hidden', default=.5, type=float)

    # parse the optimizer arguments
    parser.add_argument('--optimizer_type', default='Adagrad', type=str)
    parser.add_argument('--lr', default=.01, type=float)
    parser.add_argument('--weight_decay', default=0, type=float)
    parser.add_argument('--momentum', default=0, type=float)
    parser.add_argument('--eps', default=1e-6, type=float)

    # parse the loss type
    parser.add_argument('--loss_type', default='TOP1', type=str)

    # etc
    parser.add_argument('--n_epochs', default=10, type=int)
    parser.add_argument('--time_sort', default=False, type=bool)
    parser.add_argument('--n_samples', default=-1, type=int)

    # Get the arguments
    args = parser.parse_args()

    PATH_DATA = Path('./data')
    PATH_MODEL = Path('./models')
    train = 'train.tsv'
    test = 'test.tsv'
    PATH_TRAIN = PATH_DATA / train
    PATH_TEST = PATH_DATA / test

    df_train = pd.read_csv(PATH_TRAIN,
                           sep='\t',
                           names=['SessionId', 'ItemId', 'TimeStamp'])
    df_test = pd.read_csv(PATH_TEST,
                          sep='\t',
                          names=['SessionId', 'ItemId', 'TimeStamp'])

    # sampling, if needed
    n_samples = args.n_samples
    if n_samples != -1:
        df_train = df_train[:n_samples]
        df_test = df_test[:n_samples]

    session_key = 'SessionId'
    item_key = 'ItemId'
    time_key = 'TimeStamp'

    use_cuda = True
    input_size = df_train[item_key].nunique()
    hidden_size = args.hidden_size
    num_layers = args.num_layers
    output_size = input_size
    batch_size = args.batch_size
    dropout_input = args.dropout_input
    dropout_hidden = args.dropout_hidden

    loss_type = args.loss_type

    optimizer_type = args.optimizer_type
    lr = args.lr
    weight_decay = args.weight_decay
    momentum = args.momentum
    eps = args.eps

    n_epochs = args.n_epochs
    time_sort = args.time_sort

    MODEL_FILE = PATH_MODEL / args.model_file

    gru = GRU(input_size,
              hidden_size,
              output_size,
              num_layers=num_layers,
              dropout_input=dropout_input,
              dropout_hidden=dropout_hidden,
              batch_size=batch_size,
              use_cuda=use_cuda)

    gru.load_state_dict(torch.load(MODEL_FILE))

    model = GRU4REC(input_size,
                    hidden_size,
                    output_size,
                    num_layers=num_layers,
                    dropout_input=dropout_input,
                    dropout_hidden=dropout_hidden,
                    batch_size=batch_size,
                    use_cuda=use_cuda,
                    loss_type=loss_type,
                    optimizer_type=optimizer_type,
                    lr=lr,
                    momentum=momentum,
                    time_sort=time_sort,
                    pretrained=gru)

    model.init_data(df_train,
                    df_test,
                    session_key=session_key,
                    time_key=time_key,
                    item_key=item_key)

    k = args.k
    recall, mrr = model.test(k=k, batch_size=batch_size)
    result = f'Recall@{k}:{recall:.7f},MRR@{k}:{mrr:.7f}'
    print(result)
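
A caveat about the parser above (standard argparse behavior, not specific to this project): --time_sort is declared with type=bool, so any non-empty string passed on the command line, including "False", is parsed as True. A small sketch of the pitfall and the usual store_true workaround (argument names here are illustrative):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--time_sort_typed', default=False, type=bool)       # pitfall: bool('False') is True
parser.add_argument('--time_sort', default=False, action='store_true')   # usual workaround: a flag

args = parser.parse_args(['--time_sort_typed', 'False'])
print(args.time_sort_typed)  # True - any non-empty string is truthy
print(args.time_sort)        # False - the flag was not passed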
Example #3
class GRU4REC:
    def __init__(self, input_size, hidden_size, output_size, num_layers=1,
                 optimizer_type='Adagrad', lr=.01, weight_decay=0,
                 momentum=0, eps=1e-6, loss_type='TOP1',
                 clip_grad=-1, p_dropout_input=.0, p_dropout_hidden=.5,
                 batch_size=50, use_cuda=True, time_sort=False, pretrained=None):
        """ The GRU4REC model

        Args:
            input_size (int): dimension of the gru input variables
            hidden_size (int): dimension of the gru hidden units
            output_size (int): dimension of the gru output variables
            num_layers (int): the number of layers in the GRU
            optimizer_type (str): optimizer type for GRU weights
            lr (float): learning rate for the optimizer
            weight_decay (float): weight decay for the optimizer
            momentum (float): momentum for the optimizer
            eps (float): eps for the optimizer
            loss_type (str): type of the loss function to use
            clip_grad (float): clip the gradient norm at clip_grad. No clipping if clip_grad = -1
            p_dropout_input (float): dropout probability for the input layer
            p_dropout_hidden (float): dropout probability for the hidden layer
            batch_size (int): mini-batch size
            use_cuda (bool): whether you want to use cuda or not
            time_sort (bool): whether to ensure the order of sessions is chronological (default: False)
            pretrained (modules.layer.GRU): pretrained GRU layer, if it exists (default: None)
        """
        
        # Initialize the GRU Layer
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.use_cuda = use_cuda
        self.device = torch.device('cuda' if use_cuda else 'cpu')
        if pretrained is None:
            self.gru = GRU(input_size, hidden_size, output_size, num_layers,
                           p_dropout_input=p_dropout_input,
                           p_dropout_hidden=p_dropout_hidden,
                           batch_size=batch_size,
                           use_cuda=use_cuda)
        else:
            self.gru = pretrained

        # Initialize the optimizer
        self.optimizer_type = optimizer_type
        self.weight_decay = weight_decay
        self.momentum = momentum
        self.lr = lr
        self.eps = eps
        self.optimizer = Optimizer(self.gru.parameters(),
                                   optimizer_type=optimizer_type,
                                   lr=lr,
                                   weight_decay=weight_decay,
                                   momentum=momentum,
                                   eps=eps)

        # Initialize the loss function
        self.loss_type = loss_type
        self.loss_fn = LossFunction(loss_type)

        # gradient clipping(optional)
        self.clip_grad = clip_grad 

        # etc
        self.time_sort = time_sort
        
        
    def run_epoch(self, dataset, k=20, training=True):
        """ Run a single training epoch """
        start_time = time.time()
        # initialize
        losses = []
        recalls = []
        mrrs = []
        optimizer = self.optimizer
        hidden = self.gru.init_hidden()
        if not training:
            self.gru.eval()
        device = self.device
        
        def reset_hidden(hidden, mask):
            """Helper function that resets hidden state when some sessions terminate"""
            if len(mask) != 0:
                hidden[:, mask, :] = 0
            
            return hidden

        # Start the training loop
        loader = SessionDataLoader(dataset, batch_size=self.batch_size)
        for input, target, mask in loader:
            input = input.to(device)
            target = target.to(device)
            # reset the hidden states if some sessions have just terminated
            hidden = reset_hidden(hidden, mask).detach()
            # Go through the GRU layer
            logit, hidden = self.gru(input, target, hidden)
            # Output sampling
            logit_sampled = logit[:, target.view(-1)]
            # Calculate the mini-batch loss
            loss = self.loss_fn(logit_sampled)
            with torch.no_grad():
                recall, mrr = E.evaluate(logit, target, k)
            losses.append(loss.item())         
            recalls.append(recall)
            mrrs.append(mrr)
            # Mini-batch GD
            if training:
                # Backprop
                loss.backward()
                # Gradient clipping (optional): clamp gradient entries at clip_grad
                if self.clip_grad != -1:
                    for p in self.gru.parameters():
                        p.grad.data.clamp_(max=self.clip_grad)
                optimizer.step()
                optimizer.zero_grad()  # flush the gradient after the optimization

        results = dict()
        results['loss'] = np.mean(losses)
        results['recall'] = np.mean(recalls)
        results['mrr'] = np.mean(mrrs)
        
        end_time = time.time()
        results['time'] = (end_time - start_time) / 60
        
        if not training:
            self.gru.train()

        return results
    
    
    def train(self, dataset, k=20, n_epochs=10, save_dir='./models', save=True, model_name='GRU4REC'):
        """
        Train the GRU4REC model on a pandas dataframe for several training epochs,
        and store the intermediate models to the user-specified directory.

        Args:
            n_epochs (int): the number of training epochs to run
            save_dir (str): the path to save the intermediate trained models
            model_name (str): name of the model
        """
        print(f'Training {model_name}...')
        results = self.run_epoch(dataset, k=k, training=True)
        results = [f'{name}:{value:.3f}' for name, value in results.items()]
        best_recall = float(results[1].split(':')[1])  # Recall@k of the first epoch
        print(f'epoch:{1:2d}/{"/".join(results)}')
        for epoch in range(n_epochs - 1):
            results = self.run_epoch(dataset, k=k, training=True)
            results = [f'{name}:{value:.3f}' for name, value in results.items()]
            recall = float(results[1].split(':')[1])
            # early stopping: keep training only while Recall@k improves
            if recall > best_recall:
                best_recall = recall
                print(f'epoch:{epoch+2:2d}/{"/".join(results)}')
            else:
                print('The end of training~')
                break

            # Store the intermediate model
            if save:
                save_dir = Path(save_dir)
                if not save_dir.exists(): save_dir.mkdir()
                model_fname = f'{model_name}_{self.loss_type}_{self.optimizer_type}_{self.lr}_epoch{epoch+2:d}'
                torch.save(self.gru.state_dict(), save_dir/model_fname)
    

    def test(self, dataset, k=20):
        """ Model evaluation

        Args:
            k (int): the length of the recommendation list

        Returns:
            avg_loss: mean of the losses over the session-parallel minibatches
            avg_recall: mean of the Recall@K over the session-parallel mini-batches
            avg_mrr: mean of the MRR@K over the session-parallel mini-batches
            wall_clock: time taken for testing
        """
        results = self.run_epoch(dataset, k=k, training=False)
        results = [f'{name}:{value:.3f}' for name, value in results.items()]
        print(f'Test result: {"/".join(results)}')
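
The reset_hidden helper in run_epoch zeroes only the hidden-state slots of sessions that just ended, so the next session reusing that mini-batch slot starts from a clean state while the other sessions keep theirs. A self-contained sketch of the same idea (shapes are illustrative, not taken from the repository):

import torch

num_layers, batch_size, hidden_size = 1, 5, 4
hidden = torch.randn(num_layers, batch_size, hidden_size)

mask = [1, 3]               # suppose the sessions in slots 1 and 3 just terminated
if len(mask) != 0:
    hidden[:, mask, :] = 0  # reset only those slots

print(hidden[0, 1].abs().sum().item())  # 0.0 - slot 1 was cleared
print(hidden[0, 0].abs().sum().item())  # > 0 - slot 0 keeps its state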
Example #4
class GRU4REC:
    def __init__(self,
                 input_size,
                 hidden_size,
                 output_size,
                 num_layers=1,
                 optimizer_type='Adagrad',
                 lr=.01,
                 weight_decay=0,
                 momentum=0,
                 eps=1e-6,
                 loss_type='TOP1',
                 clip_grad=-1,
                 dropout_input=.0,
                 dropout_hidden=.5,
                 batch_size=50,
                 use_cuda=True,
                 time_sort=False,
                 pretrained=None):
        """ The GRU4REC model

        Args:
            input_size (int): dimension of the gru input variables
            hidden_size (int): dimension of the gru hidden units
            output_size (int): dimension of the gru output variables
            num_layers (int): the number of layers in the GRU
            optimizer_type (str): optimizer type for GRU weights
            lr (float): learning rate for the optimizer
            weight_decay (float): weight decay for the optimizer
            momentum (float): momentum for the optimizer
            eps (float): eps for the optimizer
            loss_type (str): type of the loss function to use
            clip_grad (float): clip the gradient norm at clip_grad. No clipping if clip_grad = -1
            dropout_input (float): dropout probability for the input layer
            dropout_hidden (float): dropout probability for the hidden layer
            batch_size (int): mini-batch size
            use_cuda (bool): whether you want to use cuda or not
            time_sort (bool): whether to ensure the order of sessions is chronological (default: False)
            pretrained (modules.layer.GRU): pretrained GRU layer, if it exists (default: None)
        """

        # Initialize the GRU Layer
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.use_cuda = use_cuda
        if pretrained is None:
            self.gru = GRU(input_size,
                           hidden_size,
                           output_size,
                           num_layers,
                           dropout_input=dropout_input,
                           dropout_hidden=dropout_hidden,
                           use_cuda=use_cuda,
                           batch_size=batch_size)
        else:
            self.gru = pretrained

        # Initialize the optimizer
        self.optimizer_type = optimizer_type
        self.weight_decay = weight_decay
        self.momentum = momentum
        self.lr = lr
        self.eps = eps
        self.optimizer = Optimizer(self.gru.parameters(),
                                   optimizer_type=optimizer_type,
                                   lr=lr,
                                   weight_decay=weight_decay,
                                   momentum=momentum,
                                   eps=eps)

        # Initialize the loss function
        self.loss_type = loss_type
        self.loss_fn = LossFunction(loss_type, use_cuda)

        # gradient clipping(optional)
        self.clip_grad = clip_grad

        # etc
        self.time_sort = time_sort

    def train(self, n_epochs=10, save_dir='./models', model_name='GRU4REC'):
        """
        Train the GRU4REC model on a pandas dataframe for several training epochs,
        and store the intermediate models to the user-specified directory.

        Args:
            n_epochs (int): the number of training epochs to run
            save_dir (str): the path to save the intermediate trained models
            model_name (str): name of the model
        """
        print(f'Model Name:{model_name}')
        # Time the training process
        start_time = time.time()
        for epoch in range(n_epochs):
            loss = self.run_epoch()
            end_time = time.time()
            wall_clock = (end_time - start_time) / 60
            print(
                f'Epoch:{epoch+1:2d}/Loss:{loss:0.3f}/TrainingTime:{wall_clock:0.3f}(min)'
            )
            start_time = time.time()

            # Store the intermediate model
            save_dir = Path(save_dir)
            if not save_dir.exists(): save_dir.mkdir()

            model_fname = f'{model_name}_{self.loss_type}_{self.optimizer_type}_{self.lr}_epoch{epoch+1:d}'
            torch.save(self.gru.state_dict(), save_dir / model_fname)

    def run_epoch(self):
        """ Run a single training epoch """
        self.gru.train()
        # initialize
        mb_losses = []
        optimizer = self.optimizer
        hidden = self.gru.init_hidden().data

        # Start the training loop
        loader = SessionDataLoader(df=self.df_train,
                                   hidden=hidden,
                                   session_key=self.session_key,
                                   item_key=self.item_key,
                                   time_key=self.time_key,
                                   batch_size=self.batch_size,
                                   training=self.gru.training,
                                   time_sort=self.time_sort)

        for input, target, hidden in loader.generate_batch():
            if self.use_cuda:
                input = input.cuda()
                target = target.cuda()
            # Go through the GRU layer
            logit, hidden = self.gru(input, target, hidden)
            ######################## IMPORTANT  #########################
            # update the hidden state for the dataloader from the outside
            #############################################################
            loader.update_hidden(hidden.data)
            # Calculate the mini-batch loss
            mb_loss = self.loss_fn(logit)
            mb_losses.append(mb_loss.item())  # .item() replaces the deprecated .data[0]
            # flush the gradient b/f backprop
            optimizer.zero_grad()
            # Backprop
            mb_loss.backward()
            # Gradient Clipping(Optional)
            if self.clip_grad != -1:
                for p in self.gru.parameters():
                    p.grad.data.clamp_(max=self.clip_grad)
            # Mini-batch GD
            optimizer.step()

        avg_epoch_loss = np.mean(mb_losses)

        return avg_epoch_loss

    def test(self, k=20, batch_size=50):
        """ Model evaluation

        Args:
            k (int): the length of the recommendation list
            batch_size (int): testing batch_size

        Returns:
            avg_recall: mean of the Recall@K over the session-parallel mini-batches
            avg_mrr: mean of the MRR@K over the session-parallel mini-batches
        """
        # set the gru layer into inference mode
        self.gru.eval()

        recalls = []
        mrrs = []
        hidden = self.gru.init_hidden().data

        # Start the testing loop
        loader = SessionDataLoader(df=self.df_test,
                                   hidden=hidden,
                                   session_key=self.session_key,
                                   item_key=self.item_key,
                                   time_key=self.time_key,
                                   batch_size=batch_size,
                                   training=self.gru.training,
                                   time_sort=self.time_sort)

        for input, target, hidden in loader.generate_batch():
            if self.use_cuda:
                input = input.cuda()
                target = target.cuda()
            # forward propagation
            logit, hidden = self.gru(input, target, hidden)
            # update the hidden state for the dataloader
            loader.update_hidden(hidden.data)
            # Evaluate the results
            recall, mrr = E.evaluate(logit, target, k)
            recalls.append(recall)
            mrrs.append(mrr)

        avg_recall = np.mean(recalls)
        avg_mrr = np.mean(mrrs)

        # reset the gru to a training mode
        self.gru.train()

        return avg_recall, avg_mrr

    def init_data(self, df_train, df_test, session_key, time_key, item_key):
        """ Initialize the training & test data.

        The training/test set, session/time/item keys will be stored for later reuse.

        Args:
            df_train (pd.DataFrame): training set required to retrieve the training item indices.
            df_test (pd.DataFrame): test set
            session_key (str): session ID
            time_key (str): time ID
            item_key (str): item ID
        """

        # Specify the identifiers
        self.session_key = session_key
        self.time_key = time_key
        self.item_key = item_key

        # Initialize the dataframes into adequate forms
        self.df_train = self.init_df(df_train, session_key, time_key, item_key)
        self.df_test = self.init_df(df_test,
                                    session_key,
                                    time_key,
                                    item_key,
                                    item_ids=df_train[item_key].unique())

    @staticmethod
    def init_df(df, session_key, time_key, item_key, item_ids=None):
        """ Initialize the dataframe.

        Involves the following steps:
            1) Add new item indices to the dataframe
            2) Sort the df

        Args:
            session_key: session identifier
            time_key: timestamp
            item_key: item identifier
            item_ids: unique item ids. Should be `None` if the df is a training set, and should include the
                  ids for the items included in the training set if the df is a test set.
        """

        # add item index column named "item_idx" to the df
        if item_ids is None:
            item_ids = df[item_key].unique()  # unique item ids
        item2idx = pd.Series(data=np.arange(len(item_ids)), index=item_ids)
        df = pd.merge(df,
                      pd.DataFrame({
                          item_key: item_ids,
                          'item_idx': item2idx[item_ids].values
                      }),
                      on=item_key,
                      how='inner')
        """
        Sort the df by time, and then by session ID. That is, df is sorted by session ID and
        clicks within a session are next to each other, where the clicks within a session are time-ordered.
        """
        df.sort_values([session_key, time_key], inplace=True)

        return df
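
init_df above maps raw item IDs to contiguous indices with a pandas merge and then sorts clicks by session and time. A toy, hedged illustration of that mapping step using the same column names (the rows are made up):

import numpy as np
import pandas as pd

df = pd.DataFrame({'SessionId': [1, 1, 2, 2],
                   'ItemId':    [501, 42, 42, 777],
                   'TimeStamp': [10, 11, 12, 13]})

item_ids = df['ItemId'].unique()                               # training vocabulary
item2idx = pd.Series(np.arange(len(item_ids)), index=item_ids)
df = pd.merge(df,
              pd.DataFrame({'ItemId': item_ids,
                            'item_idx': item2idx[item_ids].values}),
              on='ItemId', how='inner')
df.sort_values(['SessionId', 'TimeStamp'], inplace=True)
print(df)  # each ItemId now carries a dense item_idx used by the embedding/output layers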
Example #5
eps = 1e-6

loss_type = 'CrossEntropy'
lr = 0.01

dropout_hidden = 0
dropout_input = 0
batch_size = 50
momentum = 0

gru = GRU(input_size,
          if_embedding,
          embedding_size,
          hidden_size,
          output_size,
          num_layers=num_layers,
          dropout_input=dropout_input,
          dropout_hidden=dropout_hidden,
          batch_size=batch_size,
          use_cuda=use_cuda,
          cuda_id=cuda_id)

print(loss_type + ':')
for i in range(1, 21):
    model_name = 'GRU4REC_CrossEntropy_Adagrad_0.01_epoch%d' % i
    print(model_name)
    model_file = r'./models/' + model_name
    gru.load_state_dict(torch.load(model_file))

    model = GRU4REC(input_size,
                    if_embedding,
Example #6
class GRU4REC:
    def __init__(self,
                 input_size,
                 hidden_size,
                 output_size,
                 num_layers=1,
                 optimizer_type='Adagrad',
                 lr=.01,
                 weight_decay=0,
                 momentum=0,
                 eps=1e-6,
                 loss_type='TOP1',
                 clip_grad=-1,
                 dropout_input=.0,
                 dropout_hidden=.5,
                 batch_size=50,
                 use_cuda=True,
                 time_sort=False,
                 pretrained=None,
                 n_sample=2048,
                 sample_alpha=0.75,
                 sample_store=10000000,
                 bpreg=1.0):
        """ The GRU4REC model

        Args:
            input_size (int): dimension of the gru input variables
            hidden_size (int): dimension of the gru hidden units
            output_size (int): dimension of the gru output variables
            num_layers (int): the number of layers in the GRU
            optimizer_type (str): optimizer type for GRU weights
            lr (float): learning rate for the optimizer
            weight_decay (float): weight decay for the optimizer
            momentum (float): momentum for the optimizer
            eps (float): eps for the optimizer
            loss_type (str): type of the loss function to use
            clip_grad (float): clip the gradient norm at clip_grad. No clipping if clip_grad = -1
            dropout_input (float): dropout probability for the input layer
            dropout_hidden (float): dropout probability for the hidden layer
            batch_size (int): mini-batch size
            use_cuda (bool): whether you want to use cuda or not
            time_sort (bool): whether to ensure the order of sessions is chronological (default: False)
            pretrained (modules.layer.GRU): pretrained GRU layer, if it exists (default: None)
            n_sample (int): number of extra negative samples shared by the mini-batch (0 disables extra sampling)
            sample_alpha (float): popularity exponent for negative sampling (1: popularity-based, 0: uniform)
            sample_store (int): size of the pre-generated negative-sample cache
            bpreg (float): regularization coefficient for the BPR-max loss
        """

        # Initialize the GRU Layer
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.use_cuda = use_cuda
        self.device = torch.device('cuda' if use_cuda else 'cpu')
        ### modified: extra negative-sampling hyperparameters ###
        self.n_sample = n_sample
        self.sample_alpha = sample_alpha
        self.sample_store = sample_store
        self.bpreg = bpreg
        ### modified ###
        if pretrained is None:
            self.gru = GRU(input_size,
                           hidden_size,
                           output_size,
                           num_layers,
                           dropout_input=dropout_input,
                           dropout_hidden=dropout_hidden,
                           batch_size=batch_size,
                           use_cuda=use_cuda)
        else:
            self.gru = pretrained

        # Initialize the optimizer
        self.optimizer_type = optimizer_type
        self.weight_decay = weight_decay
        self.momentum = momentum
        self.lr = lr
        self.eps = eps
        self.optimizer = Optimizer(self.gru.parameters(),
                                   optimizer_type=optimizer_type,
                                   lr=lr,
                                   weight_decay=weight_decay,
                                   momentum=momentum,
                                   eps=eps)

        # Initialize the loss function
        self.loss_type = loss_type
        self.loss_fn = LossFunction(loss_type, use_cuda)

        # gradient clipping(optional)
        self.clip_grad = clip_grad

        # etc
        self.time_sort = time_sort

    def generate_neg_samples(self, n_items, pop, length):
        if self.sample_alpha:
            sample = np.searchsorted(pop,
                                     np.random.rand(self.n_sample * length))
        else:
            # draw a random sample of size n_sample * length from np.arange(n_items)
            sample = np.random.choice(n_items, size=self.n_sample * length)
        if length > 1:
            sample = sample.reshape((length, self.n_sample))
        return sample

    def run_epoch(self, dataset, k=20, training=True):
        """ Run a single training epoch """
        start_time = time.time()

        # initialize
        losses = []
        recalls = []
        mrrs = []
        ### added ###
        zippers = []
        ### added ###
        optimizer = self.optimizer
        hidden = self.gru.init_hidden()
        if not training:
            self.gru.eval()
        device = self.device

        def reset_hidden(hidden, mask):
            """Helper function that resets hidden state when some sessions terminate"""
            if len(mask) != 0:
                hidden[:, mask, :] = 0

            return hidden

        # Start the training loop
        loader = SessionDataLoader(dataset, batch_size=self.batch_size)
        # iterate batch by batch; each iteration yields an input, e.g. tensor([31, 26, 27, 29, 24]),
        # and a target, e.g. tensor([31, 26, 28, 17, 24])
        if training:
            n_items = len(dataset.items)
            # sampling: draw extra negative samples
            if self.n_sample > 0:
                pop = dataset.df.groupby(
                    'ItemId').size()  # item popularity (support), e.g.:
                # ItemId
                # 214507331  1
                # 214507365  1
                # sample_alpha = 1 gives popularity-based sampling, 0 gives uniform sampling
                pop = pop[
                    dataset.itemmap[dataset.item_key].
                    values].values**self.sample_alpha  # an item is sampled with probability supp ** sample_alpha
                pop = pop.cumsum() / pop.sum()
                pop[-1] = 1
                if self.sample_store:
                    generate_length = self.sample_store // self.n_sample
                    if generate_length <= 1:
                        sample_store = 0
                        print('No example store was used')
                    else:
                        neg_samples = self.generate_neg_samples(
                            n_items, pop, generate_length)
                        sample_pointer = 0
                else:
                    print('No example store was used')

        for input, target, mask in loader:
            input = input.to(device)
            target = target.to(device)
            # extra output sampling: append the shared negative samples to the targets
            if self.n_sample > 0 and training:
                if self.sample_store:
                    if sample_pointer == generate_length:
                        neg_samples = self.generate_neg_samples(
                            n_items, pop, generate_length)
                        sample_pointer = 0
                    sample = neg_samples[sample_pointer]
                    sample_pointer += 1
                else:
                    sample = self.generate_neg_samples(n_items, pop, 1)
                # move targets to the CPU before stacking them with the numpy samples
                y = torch.LongTensor(np.hstack([target.cpu().numpy(), sample]))
            else:
                y = target  # no extra sampling
            # reset the hidden states if some sessions have just terminated
            hidden = reset_hidden(hidden, mask).detach()
            # Go through the GRU layer
            logit, hidden = self.gru(input, target, hidden)
            # Output sampling: keep only the logit columns for the sampled items
            y = y.to(device)
            logit_sampled = logit[:, y]
            # Calculate the mini-batch loss
            loss = self.loss_fn(logit_sampled)
            with torch.no_grad():
                recall, mrr = E.evaluate(logit, target, k)
            losses.append(loss.item())
            recalls.append(recall)
            mrrs.append(mrr)
            # Mini-batch GD
            if training:
                # Backprop
                loss.backward()
                # Gradient clipping (optional): clamp gradient entries at clip_grad
                if self.clip_grad != -1:
                    for p in self.gru.parameters():
                        p.grad.data.clamp_(max=self.clip_grad)
                optimizer.step()
                optimizer.zero_grad()  # flush the gradient after the optimization

        results = dict()
        results['loss'] = np.mean(losses)
        results['recall'] = np.mean(recalls)
        results['mrr'] = np.mean(mrrs)

        end_time = time.time()
        results['time'] = (end_time - start_time) / 60

        if not training:
            self.gru.train()

        return results

    def train(self,
              dataset,
              k=20,
              n_epochs=10,
              save_dir='./models',
              save=True,
              model_name='GRU4REC'):
        """
        Train the GRU4REC model on a pandas dataframe for several training epochs,
        and store the intermediate models to the user-specified directory.

        Args:
            n_epochs (int): the number of training epochs to run
            save_dir (str): the path to save the intermediate trained models
            model_name (str): name of the model
        """
        print(f'Training {model_name}...')

        for epoch in range(n_epochs):
            results = self.run_epoch(dataset, k=k, training=True)
            results = [f'{name}:{value:.3f}' for name, value in results.items()]
            print(f'epoch:{epoch+1:2d}/{"/".join(results)}')

            # Store the intermediate model
            if save:
                save_dir = Path(save_dir)
                if not save_dir.exists(): save_dir.mkdir()
                model_fname = f'{model_name}_{self.loss_type}_{self.optimizer_type}_{self.lr}_epoch{epoch+1:d}'
                torch.save(self.gru.state_dict(), save_dir / model_fname)

    def test(self, dataset, k=20):
        """ Model evaluation

        Args:
            k (int): the length of the recommendation list

        Returns:
            avg_loss: mean of the losses over the session-parallel minibatches
            avg_recall: mean of the Recall@K over the session-parallel mini-batches
            avg_mrr: mean of the MRR@K over the session-parallel mini-batches
            wall_clock: time taken for testing
        """
        results = self.run_epoch(dataset, k=k, training=False)
        results = [f'{name}:{value:.3f}' for name, value in results.items()]
        print(f'Test result: {"/".join(results)}')
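
The extra negative sampling in this example picks items with probability proportional to supp ** sample_alpha by building a cumulative distribution and looking up uniform draws with np.searchsorted. A standalone sketch with toy popularities (not the repository's data):

import numpy as np

supp = np.array([100.0, 10.0, 1.0])   # item popularities (click counts)
sample_alpha = 0.75

pop = supp ** sample_alpha
pop = pop.cumsum() / pop.sum()
pop[-1] = 1.0                          # guard against floating-point rounding

# each uniform draw lands in one item's bucket; wider buckets are hit more often
samples = np.searchsorted(pop, np.random.rand(10000))
print(np.bincount(samples, minlength=len(supp)) / 10000)  # roughly proportional to the supp ** 0.75 weights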
Example #7
class GRU4REC:
    def __init__(self, input_size, hidden_size, output_size, num_layers=1,
                 optimizer_type='Adagrad', lr=.05, weight_decay=0,
                 momentum=0, eps=1e-6, loss_type='TOP1',
                 clip_grad=-1, dropout_input=.0, dropout_hidden=.5,
                 batch_size=50, use_cuda=True, time_sort=False, pretrained=None):

        """ The GRU4REC model

        Args:
            input_size (int): dimension of the gru input variables
            hidden_size (int): dimension of the gru hidden units
            output_size (int): dimension of the gru output variables
            num_layers (int): the number of layers in the GRU
            optimizer_type (str): optimizer type for GRU weights
            lr (float): learning rate for the optimizer
            weight_decay (float): weight decay for the optimizer
            momentum (float): momentum for the optimizer
            eps (float): eps for the optimizer
            loss_type (str): type of the loss function to use
            clip_grad (float): clip the gradient norm at clip_grad. No clipping if clip_grad = -1
            dropout_input (float): dropout probability for the input layer
            dropout_hidden (float): dropout probability for the hidden layer
            batch_size (int): mini-batch size
            use_cuda (bool): whether you want to use cuda or not
            time_sort (bool): whether to ensure the order of sessions is chronological (default: False)
            pretrained (modules.layer.GRU): pretrained GRU layer, if it exists (default: None)
        """

        # Initialize the GRU Layer
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.use_cuda = use_cuda
        if pretrained is None:
            self.gru = GRU(input_size, hidden_size, output_size, num_layers,
                           dropout_input=dropout_input,
                           dropout_hidden=dropout_hidden,
                           use_cuda=use_cuda,
                           batch_size=batch_size)
        else:
            self.gru = pretrained

        # Initialize the optimizer
        self.optimizer_type = optimizer_type
        self.weight_decay = weight_decay
        self.momentum = momentum
        self.lr = lr
        self.eps = eps
        self.optimizer = Optimizer(self.gru.parameters(),
                                   optimizer_type=optimizer_type,
                                   lr=lr,
                                   weight_decay=weight_decay,
                                   momentum=momentum,
                                   eps=eps)

        # Initialize the loss function
        self.loss_type = loss_type
        self.loss_fn = LossFunction(loss_type, use_cuda)

        # gradient clipping(optional)
        self.clip_grad = clip_grad

        # etc
        self.time_sort = time_sort

    def train(self, df, session_key, time_key, item_key, n_epochs=10, save_dir='./models', model_name='GRU4REC'):
        """
        Train the GRU4REC model on a pandas dataframe for several training epochs,
        and store the intermediate models to the user-specified directory.

        Args:
            df (pd.DataFrame): training dataset
            session_key (str): session ID
            time_key (str): time ID
            item_key (str): item ID
            n_epochs (int): the number of training epochs to run
            save_dir (str): the path to save the intermediate trained models
            model_name (str): name of the model
        """
        df, click_offsets, session_idx_arr = GRU4REC.init_data(df, session_key, time_key, item_key,
                                                               time_sort=self.time_sort)
        # Time the training process
        start_time = time.time()
        for epoch in range(n_epochs):
            loss = self.run_epoch(df, click_offsets, session_idx_arr)
            end_time = time.time()
            wall_clock = (end_time - start_time) / 60
            print(f'Epoch:{epoch+1:2d}/Loss:{loss:0.3f}/TrainingTime:{wall_clock:0.3f}(min)')
            start_time = time.time()

            # Store the intermediate model
            save_dir = Path(save_dir)
            if not save_dir.exists(): save_dir.mkdir()

            model_fname = f'{model_name}_{self.loss_type}_{self.optimizer_type}_{self.lr}_epoch{epoch+1:d}'
            torch.save(self.gru.state_dict(), save_dir/model_fname)

    def run_epoch(self, df, click_offsets, session_idx_arr):
        """ Runs a single training epoch """
        mb_losses = []
        # initializations
        iters = np.arange(self.batch_size)
        maxiter = iters.max()
        start = click_offsets[session_idx_arr[iters]]
        end = click_offsets[session_idx_arr[iters]+1]
        # initialize the hidden state
        hidden = self.gru.init_hidden().data

        optimizer = self.optimizer

        # Start the training loop
        finished = False
        while not finished:
            minlen = (end-start).min()
            # Item indices(for embedding) for clicks where the first sessions start
            idx_target = df.item_idx.values[start]
            for i in range(minlen - 1):
                # Build inputs, targets, and hidden states
                idx_input = idx_target
                idx_target = df.item_idx.values[start + i + 1]
                input = torch.LongTensor(idx_input) #(B) At first, input is a Tensor
                target = Variable(torch.LongTensor(idx_target)) #(B)
                if self.use_cuda:
                    input = input.cuda()
                    target = target.cuda()
                # Now, convert into an embedded Variable
                embedded = self.gru.emb(input)
                hidden = Variable(hidden)

                # Go through the GRU layer
                logit, hidden = self.gru(embedded, target, hidden)

                # Calculate the mini-batch loss
                mb_loss = self.loss_fn(logit)
                mb_losses.append(mb_loss.data[0])

                # flush the gradient b/f backprop
                optimizer.zero_grad()

                # Backprop
                mb_loss.backward()

                # Gradient Clipping(Optional)
                if self.clip_grad != -1:
                    for p in self.gru.parameters():
                        p.grad.data.clamp_(max=self.clip_grad)

                # Mini-batch GD
                optimizer.step()

                # Detach the hidden state for later reuse
                hidden = hidden.data

            # click indices where a particular session meets second-to-last element
            start = start + (minlen - 1)
            # figure out how many sessions should terminate
            mask = np.arange(len(iters))[(end-start)<=1]
            for idx in mask:
                maxiter += 1
                if maxiter >= len(click_offsets)-1:
                    finished = True
                    break
                # update the next starting/ending point
                iters[idx] = maxiter
                start[idx] = click_offsets[session_idx_arr[maxiter]]
                end[idx] = click_offsets[session_idx_arr[maxiter]+1]

            # reset the rnn hidden state to zero after transition
            if len(mask) != 0:
                hidden[:, mask, :] = 0

        avg_epoch_loss = np.mean(mb_losses)

        return avg_epoch_loss

    def predict(self, input, target, hidden):
        """ Forward propagation for testing

        Args:
            input (B,C): torch.LongTensor. The one-hot embedding for the item indices
            target (B,): a Variable that stores the indices for the next items
            hidden: previous hidden state

        Returns:
            logits (B,C): logits for the next items
            hidden: next hidden state
        """
        # convert the item indices into embeddings
        embedded = self.gru.emb(input, volatile=True)
        hidden = Variable(hidden, volatile=True)
        # forward propagation
        logits, hidden = self.gru(embedded, target, hidden)

        return logits, hidden

    def test(self, df_train, df_test, session_key, time_key, item_key,
             k=20, batch_size=50):
        """ Model evaluation

        Args:
            df_train (pd.DataFrame): training set required to retrieve the training item indices.
            df_test (pd.DataFrame): test set
            session_key (str): session ID
            time_key (str): time ID
            item_key (str): item ID
            k (int): the length of the recommendation list
            batch_size (int): testing batch_size

        Returns:
            avg_recall: mean of the Recall@K over the session-parallel mini-batches
            avg_mrr: mean of the MRR@K over the session-parallel mini-batches
        """
        # set the gru layer into inference mode
        if self.gru.training:
            self.gru.switch_mode()

        recalls = []
        mrrs = []

        # initializations
        # Build item2idx from train data.
        iids = df_train[item_key].unique() # unique item ids
        item2idx = pd.Series(data=np.arange(len(iids)), index=iids)
        df_test = pd.merge(df_test,
                           pd.DataFrame({item_key: iids,
                                         'item_idx': item2idx[iids].values}),
                           on=item_key,
                           how='inner')
        # Sort the df by time, and then by session ID.
        df_test.sort_values([session_key, time_key], inplace=True)
        # Return the offsets of the beginning clicks of each session IDs
        click_offsets = GRU4REC.get_click_offsets(df_test, session_key)
        session_idx_arr = GRU4REC.order_session_idx(df_test, session_key, time_key, time_sort=self.time_sort)

        iters = np.arange(batch_size)
        maxiter = iters.max()
        start = click_offsets[session_idx_arr[iters]]
        end = click_offsets[session_idx_arr[iters]+1]
        hidden = self.gru.init_hidden().data

        # Start the testing loop
        finished = False
        while not finished:
            minlen = (end-start).min()
            # Item indices(for embedding) for clicks where the first sessions start
            idx_target = df_test.item_idx.values[start]
            for i in range(minlen - 1):
                # Build inputs, targets, and hidden states
                idx_input = idx_target
                idx_target = df_test.item_idx.values[start + i + 1]
                input = torch.LongTensor(idx_input) #(B) At first, input is a Tensor
                target = Variable(torch.LongTensor(idx_target), volatile=True)  # (B)
                if self.use_cuda:
                    input = input.cuda()
                    target = target.cuda()

                logit, hidden = self.predict(input, target, hidden)
                recall, mrr = evaluate(logit, target, k)
                recalls.append(recall)
                mrrs.append(mrr)

                # Detach the hidden state for later reuse
                hidden = hidden.data

            # click indices where a particular session meets second-to-last element
            start = start + (minlen - 1)
            # figure out how many sessions should terminate
            mask = np.arange(len(iters))[(end-start)<=1]
            for idx in mask:
                maxiter += 1
                if maxiter >= len(click_offsets)-1:
                    finished = True
                    break
                # update the next starting/ending point
                iters[idx] = maxiter
                start[idx] = click_offsets[session_idx_arr[maxiter]]
                end[idx] = click_offsets[session_idx_arr[maxiter]+1]

            # reset the rnn hidden state to zero after transition
            if len(mask)!= 0:
                hidden[:,mask,:] = 0

        avg_recall = np.mean(recalls)
        avg_mrr = np.mean(mrrs)

        # reset the gru to a training mode
        self.gru.switch_mode()

        return avg_recall, avg_mrr

    @staticmethod
    def init_data(df, session_key, time_key, item_key, time_sort):
        """
        Initialize the data.
        """

        # add item indices to the dataframe
        df = GRU4REC.add_item_indices(df, item_key)

        """
        Sort the df by time, and then by session ID. That is, df is sorted by session ID and 
        clicks within a session are next to each other, where the clicks within a session are time-ordered.
        """
        df.sort_values([session_key, time_key], inplace=True)

        click_offsets = GRU4REC.get_click_offsets(df, session_key)
        session_idx_arr = GRU4REC.order_session_idx(df, session_key, time_key, time_sort=time_sort)

        return df, click_offsets, session_idx_arr

    @staticmethod
    def add_item_indices(df, item_key):
        """
        Adds an item index column named "item_idx" to the df.

        Args:
            df: pd.DataFrame to add the item indices to

        Returns:
            df: copy of the original df with item indices
        """
        iids = df[item_key].unique() # unique item ids
        item2idx = pd.Series(data=np.arange(len(iids)), index=iids)
        df = pd.merge(df,
                      pd.DataFrame({item_key: iids,
                                    'item_idx': item2idx[iids].values}),
                      on=item_key,
                      how='inner')
        return df

    @staticmethod
    def get_click_offsets(df, session_key):
        """
        Return the offsets of the beginning clicks of each session IDs,
        where the offset is calculated against the first click of the first session ID.
        """

        offsets = np.zeros(df[session_key].nunique()+1, dtype=np.int32)
        # group & sort the df by session_key and get the offset values
        offsets[1:] = df.groupby(session_key).size().cumsum()

        return offsets

    @staticmethod
    def order_session_idx(df, session_key, time_key, time_sort=False):
        """ Order the session indices """

        if time_sort:
            # starting time for each sessions, sorted by session IDs
            sessions_start_time = df.groupby(session_key)[time_key].min().values
            # order the session indices by session starting times
            session_idx_arr = np.argsort(sessions_start_time)
        else:
            session_idx_arr = np.arange(df[session_key].nunique())

        return session_idx_arr
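
get_click_offsets and order_session_idx drive the session-parallel iteration: click_offsets[i] is the row where the i-th session starts (and click_offsets[i+1] where it ends), and with time_sort=True sessions are visited in order of their first click. A toy sketch of both helpers under the same column names:

import numpy as np
import pandas as pd

df = pd.DataFrame({'SessionId': [1, 1, 1, 2, 2, 3],
                   'TimeStamp': [5, 6, 7, 1, 2, 9]})

offsets = np.zeros(df['SessionId'].nunique() + 1, dtype=np.int32)
offsets[1:] = df.groupby('SessionId').size().cumsum()
print(offsets)  # [0 3 5 6] - row boundaries of the three sessions

start_times = df.groupby('SessionId')['TimeStamp'].min().values
print(np.argsort(start_times))  # [1 0 2] - session 2 started earliest, so it is visited first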
Example #8
    def __init__(self,
                 input_size,
                 if_embedding,
                 embedding_size,
                 hidden_size,
                 output_size,
                 num_layers=1,
                 optimizer_type='Adagrad',
                 lr=.01,
                 weight_decay=0,
                 momentum=0,
                 eps=1e-6,
                 loss_type='TOP1',
                 clip_grad=-1,
                 dropout_input=.0,
                 dropout_hidden=.5,
                 batch_size=50,
                 use_cuda=True,
                 cuda_id=1,
                 compress=False,
                 time_sort=False,
                 pretrained=None):
        """ The GRU4REC model

        Args:
            input_size (int): dimension of the gru input variables
            hidden_size (int): dimension of the gru hidden units
            output_size (int): dimension of the gru output variables
            num_layers (int): the number of layers in the GRU
            optimizer_type (str): optimizer type for GRU weights
            lr (float): learning rate for the optimizer
            weight_decay (float): weight decay for the optimizer
            momentum (float): momentum for the optimizer
            eps (float): eps for the optimizer
            loss_type (str): type of the loss function to use
            clip_grad (float): clip the gradient norm at clip_grad. No clipping if clip_grad = -1
            dropout_input (float): dropout probability for the input layer
            dropout_hidden (float): dropout probability for the hidden layer
            batch_size (int): mini-batch size
            use_cuda (bool): whether you want to use cuda or not
            time_sort (bool): whether to ensure the order of sessions is chronological (default: False)
            pretrained (modules.layer.GRU): pretrained GRU layer, if it exists (default: None)
            if_embedding (bool): whether to feed the GRU an item embedding instead of one-hot vectors
            embedding_size (int): dimension of the item embedding
            cuda_id (int): index of the CUDA device to run on
            compress: path to a Distiller YAML compression schedule, or False to disable compression
        """

        # Initialize the GRU Layer
        self.input_size = input_size
        self.if_embedding = if_embedding
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.use_cuda = use_cuda
        self.cuda_id = cuda_id
        self.device = torch.device(
            'cuda:%d' % cuda_id if use_cuda else 'cpu'
        )  # must specify cuda_id or it will be torch.cuda.current_device()
        print(self.device)
        if pretrained is None:
            self.gru = GRU(input_size,
                           if_embedding,
                           embedding_size,
                           hidden_size,
                           output_size,
                           num_layers,
                           dropout_input=dropout_input,
                           dropout_hidden=dropout_hidden,
                           batch_size=batch_size,
                           use_cuda=use_cuda,
                           cuda_id=cuda_id)
        else:
            self.gru = pretrained

        # Initialize the optimizer
        self.optimizer_type = optimizer_type
        self.weight_decay = weight_decay
        self.momentum = momentum
        self.lr = lr
        self.eps = eps

        self.compress = compress
        self.compression_scheduler = None
        if self.compress:
            # Create a CompressionScheduler and configure it from a YAML schedule file
            source = self.compress
            self.compression_scheduler = distiller.config.file_config(
                self.gru, None, self.compress)

        self.optimizer = Optimizer(self.gru.parameters(),
                                   optimizer_type=optimizer_type,
                                   lr=lr,
                                   weight_decay=weight_decay,
                                   momentum=momentum,
                                   eps=eps)

        # Initialize the loss function
        self.loss_type = loss_type
        self.loss_fn = LossFunction(loss_type, use_cuda, cuda_id)

        # gradient clipping(optional)
        self.clip_grad = clip_grad

        # etc
        self.time_sort = time_sort
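
As the inline comment notes, torch.device('cuda') without an index resolves to torch.cuda.current_device(), so an explicit cuda_id is needed to pin the model to a particular GPU. A minimal, hedged sketch of that device selection (with a guard for machines that lack the requested GPU):

import torch

cuda_id = 1
use_cuda = torch.cuda.is_available() and torch.cuda.device_count() > cuda_id
device = torch.device('cuda:%d' % cuda_id if use_cuda else 'cpu')
print(device)

x = torch.randn(2, 3).to(device)   # tensors and modules are moved to the chosen GPU the same way
print(x.device)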
Example #9
class GRU4REC:
    def __init__(self,
                 input_size,
                 if_embedding,
                 embedding_size,
                 hidden_size,
                 output_size,
                 num_layers=1,
                 optimizer_type='Adagrad',
                 lr=.01,
                 weight_decay=0,
                 momentum=0,
                 eps=1e-6,
                 loss_type='TOP1',
                 clip_grad=-1,
                 dropout_input=.0,
                 dropout_hidden=.5,
                 batch_size=50,
                 use_cuda=True,
                 cuda_id=1,
                 compress=False,
                 time_sort=False,
                 pretrained=None):
        """ The GRU4REC model

        Args:
            input_size (int): dimension of the gru input variables
            hidden_size (int): dimension of the gru hidden units
            output_size (int): dimension of the gru output variables
            num_layers (int): the number of layers in the GRU
            optimizer_type (str): optimizer type for GRU weights
            lr (float): learning rate for the optimizer
            weight_decay (float): weight decay for the optimizer
            momentum (float): momentum for the optimizer
            eps (float): eps for the optimizer
            loss_type (str): type of the loss function to use
            clip_grad (float): clip the gradient norm at clip_grad. No clipping if clip_grad = -1
            dropout_input (float): dropout probability for the input layer
            dropout_hidden (float): dropout probability for the hidden layer
            batch_size (int): mini-batch size
            use_cuda (bool): whether you want to use cuda or not
            time_sort (bool): whether to ensure the order of sessions is chronological (default: False)
            pretrained (modules.layer.GRU): pretrained GRU layer, if it exists (default: None)
            if_embedding (bool): whether to feed the GRU an item embedding instead of one-hot vectors
            embedding_size (int): dimension of the item embedding
            cuda_id (int): index of the CUDA device to run on
            compress: path to a Distiller YAML compression schedule, or False to disable compression
        """

        # Initialize the GRU Layer
        self.input_size = input_size
        self.if_embedding = if_embedding
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.use_cuda = use_cuda
        self.cuda_id = cuda_id
        self.device = torch.device(
            'cuda:%d' % cuda_id if use_cuda else 'cpu'
        )  # cuda_id must be given explicitly, otherwise torch.cuda.current_device() is used
        print(self.device)
        if pretrained is None:
            self.gru = GRU(input_size,
                           if_embedding,
                           embedding_size,
                           hidden_size,
                           output_size,
                           num_layers,
                           dropout_input=dropout_input,
                           dropout_hidden=dropout_hidden,
                           batch_size=batch_size,
                           use_cuda=use_cuda,
                           cuda_id=cuda_id)
        else:
            self.gru = pretrained

        # Initialize the optimizer
        self.optimizer_type = optimizer_type
        self.weight_decay = weight_decay
        self.momentum = momentum
        self.lr = lr
        self.eps = eps

        self.compress = compress
        self.compression_scheduler = None
        if self.compress:
            # Create a CompressionScheduler and configure it from a YAML schedule file
            self.compression_scheduler = distiller.config.file_config(
                self.gru, None, self.compress)

        self.optimizer = Optimizer(self.gru.parameters(),
                                   optimizer_type=optimizer_type,
                                   lr=lr,
                                   weight_decay=weight_decay,
                                   momentum=momentum,
                                   eps=eps)

        # Initialize the loss function
        self.loss_type = loss_type
        self.loss_fn = LossFunction(loss_type, use_cuda, cuda_id)

        # gradient clipping(optional)
        self.clip_grad = clip_grad

        # etc
        self.time_sort = time_sort

    def run_epoch(self,
                  dataset,
                  k=20,
                  training=True,
                  compression_scheduler=None,
                  epoch=0):
        """ Run a single training epoch """
        start_time = time.time()

        # initialize
        losses = []
        recalls = []
        mrrs = []
        optimizer = self.optimizer
        hidden = self.gru.init_hidden()
        if not training:
            self.gru.eval()
        device = self.device

        def reset_hidden(hidden, mask):
            """Helper function that resets hidden state when some sessions terminate"""
            if len(mask) != 0:
                hidden[:, mask, :] = 0

            return hidden

        # Start the training loop
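        # SessionDataLoader yields session-parallel mini-batches: each slot of a
        # batch follows its own session, and `mask` lists the slots whose session
        # has just ended so their hidden state can be reset before the next step.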
        loader = SessionDataLoader(dataset, batch_size=self.batch_size)
        batch_id = 0
        steps_per_epoch = math.ceil(len(dataset.items) / self.batch_size)

        for input, target, mask in loader:
            batch_id = batch_id + 1
            input = input.to(device)
            target = target.to(device)
            # reset the hidden states if some sessions have just terminated
            hidden = reset_hidden(hidden, mask).detach()
            if compression_scheduler:
                compression_scheduler.on_minibatch_begin(
                    epoch,
                    minibatch_id=batch_id,
                    minibatches_per_epoch=steps_per_epoch)
            # Go through the GRU layer
            logit, hidden = self.gru(input, target, hidden)
            # Output sampling
            logit_sampled = logit[:, target.view(-1)]
            # Calculate the mini-batch loss
            loss = self.loss_fn(logit_sampled)
            with torch.no_grad():
                recall, mrr = E.evaluate(logit, target, k)
            losses.append(loss.item())
            recalls.append(recall)
            mrrs.append(mrr)
            # Mini-batch gradient descent
            if training:
                if compression_scheduler:
                    # Before running the backward phase, allow the scheduler to modify the loss
                    # (e.g. add a regularization loss)
                    loss = compression_scheduler.before_backward_pass(
                        epoch,
                        minibatch_id=batch_id,
                        minibatches_per_epoch=steps_per_epoch,
                        loss=loss,
                        return_loss_components=False)
                # Backprop
                loss.backward()
                # Gradient clipping (optional): clamp the gradients after backprop,
                # before the optimizer step, when they actually exist
                if self.clip_grad != -1:
                    for p in self.gru.parameters():
                        if p.grad is not None:
                            p.grad.data.clamp_(max=self.clip_grad)
                optimizer.step()
                optimizer.zero_grad()  # flush the gradients after the update
            if compression_scheduler:
                compression_scheduler.on_minibatch_end(
                    epoch,
                    minibatch_id=batch_id,
                    minibatches_per_epoch=steps_per_epoch)

        results = dict()
        results['loss'] = np.mean(losses)
        results['recall'] = np.mean(recalls)
        results['mrr'] = np.mean(mrrs)

        end_time = time.time()
        results['time'] = (end_time - start_time) / 60

        if not training:
            self.gru.train()

        return results

    def train(self,
              dataset,
              k=20,
              n_epochs=20,
              save_dir='./models',
              save=True,
              model_name='GRU4REC'):
        """
        Train the GRU4REC model on a pandas dataframe for several training epochs,
        and store the intermediate models to the user-specified directory.

        Args:
            dataset: the training dataset of click sessions
            k (int): the length of the recommendation list used for Recall@K and MRR@K
            n_epochs (int): the number of training epochs to run
            save_dir (str): the path to save the intermediate trained models
            save (bool): whether to save the intermediate models
            model_name (str): name of the model
        """
        print(f'Training {model_name}...')
        for epoch in range(n_epochs):
            if self.compression_scheduler:
                self.compression_scheduler.on_epoch_begin(epoch)
            results = self.run_epoch(
                dataset,
                k=k,
                training=True,
                compression_scheduler=self.compression_scheduler,
                epoch=epoch)
            results = [f'{metric}:{value:.3f}' for metric, value in results.items()]
            print(f'epoch:{epoch+1:2d}/{"/".join(results)}')

            t, total = distiller.weights_sparsity_tbl_summary(
                self.gru, return_total_sparsity=True)
            print("\nParameters:\n" + str(t))
            print('Total sparsity: {:0.2f}\n'.format(total))

            if self.compression_scheduler:
                self.compression_scheduler.on_epoch_end(epoch)

            # Store the intermediate model
            if save:
                save_dir = Path(save_dir)
                if not save_dir.exists(): save_dir.mkdir()
                model_fname = f'{model_name}_{self.loss_type}_{self.optimizer_type}_{self.lr}_epoch{epoch+1:d}'
                torch.save(self.gru.state_dict(), save_dir / model_fname)

    def test(self, dataset, k=20):
        """ Model evaluation

        Args:
            k (int): the length of the recommendation list

        Prints the mean loss, Recall@K and MRR@K over the session-parallel
        mini-batches, along with the wall-clock time taken for testing.
        """
        results = self.run_epoch(dataset, k=k, training=False)
        results = [f'{metric}:{value:.3f}' for metric, value in results.items()]
        print(f'Test result: {"/".join(results)}')
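
A hypothetical end-to-end usage of this example (not taken from the source): it
assumes a session dataset object exposing an `items` attribute, written here as
a SessionDataset loader with placeholder file paths and hyperparameters.

    train_data = SessionDataset('data/train.tsv')   # hypothetical loader
    test_data = SessionDataset('data/test.tsv')

    n_items = len(train_data.items)
    model = GRU4REC(input_size=n_items,
                    if_embedding=True,
                    embedding_size=100,
                    hidden_size=100,
                    output_size=n_items,
                    loss_type='TOP1',
                    use_cuda=True,
                    cuda_id=0,
                    compress=False)          # no compression schedule

    model.train(train_data, k=20, n_epochs=10, save_dir='./models',
                model_name='GRU4REC')
    model.test(test_data, k=20)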