Example #1
class USTAN:
    '''
    USTAN( k, sample_size=5000, sampling='recent', remind=True, extend=False, lambda_spw=1.02, lambda_snh=5, lambda_inh=2.05, session_key='SessionId', item_key='ItemId', time_key='Time', user_key='UserId' )

    Parameters
    -----------
    k : int
        Number of neighboring sessions to calculate the item scores from. (Default value: 100)
    sample_size : int
        Defines the length of a subset of all training sessions to calculate the nearest neighbors from. (Default value: 5000)
    sampling : string
        String to define the sampling method for sessions (recent, random). (default: recent)
    remind : bool
        Allow items of the current session to appear in the recommendation list again. (default: True)
    extend : bool
        Add evaluated sessions to the maps while predicting. (default: False)
    lambda_spw : float
        Decay factor for the position of an item in the current session; later clicks get a higher weight in the similarity calculation. (default: 1.02)
    lambda_snh : float
        Decay factor (in days) to give less weight to older neighboring sessions. (default: 5)
    lambda_inh : float
        Decay factor for the distance between a candidate item and the matching item in a neighboring session. (default: 2.05)
    extend_session_length : int
        Extend the current session with up to this many of the user's last viewed items. (default: None)
    boost_own_sessions : float
        Boost factor for the similarity of neighboring sessions that belong to the same user. (default: None)
    reminders : bool
        Add reminders of items from the user's recent sessions to the recommendation list. (default: False)
    session_key : string
        Header of the session ID column in the input file. (default: 'SessionId')
    item_key : string
        Header of the item ID column in the input file. (default: 'ItemId')
    time_key : string
        Header of the timestamp column in the input file. (default: 'Time')
    user_key : string
        Header of the user ID column in the input file. (default: 'UserId')
    '''

    def __init__( self, k, sample_size=5000, sampling='recent', remind=True, extend=False, lambda_spw=1.02, lambda_snh=5, lambda_inh=2.05 ,
                  extend_session_length=None, extending_mode='lastViewed', refine_mode=True, boost_own_sessions=None,
                  reminders=False, remind_strategy='recency', remind_sessions_num=6, reminders_num=3, remind_mode='end', weight_base=1, weight_IRec=0, weight_SSim=0,
                  session_key = 'SessionId', item_key= 'ItemId', time_key= 'Time', user_key='UserId'):
       
        self.k = k
        self.sample_size = sample_size
        self.sampling = sampling
        
        self.lambda_spw = lambda_spw
        self.lambda_snh = lambda_snh * 24 * 3600
        self.lambda_inh = lambda_inh
        
        self.session_key = session_key
        self.item_key = item_key
        self.time_key = time_key
        self.user_key = user_key  # user_based

        self.extend = extend
        self.remind = remind
        self.extending_mode = extending_mode

        # user_based
        self.extend_session_length = extend_session_length
        self.refine_mode = refine_mode
        self.boost_own_sessions = boost_own_sessions

        #updated while recommending
        self.session = -1
        self.session_items = []
        self.relevant_sessions = set()
        # user_based
        self.items_previous = []
        self.last_user_items = {}  # to extend the session model
        self.recent_user_items = {}  # to remind
        self.recent_user_sessions = {}  # to remind
        self.user_item_intensity = dict()  # to remind (for 'session_similarity')
        # reminders
        self.hasReminders = reminders
        if self.hasReminders:
            if remind_strategy == 'hybrid':
                self.reminder = Reminder(remind_strategy=remind_strategy, remind_sessions_num=remind_sessions_num,
                                         weight_base=weight_base, weight_IRec=weight_IRec, weight_SSim=weight_SSim)
            else:  # basic reminders
                self.reminder = Reminder(remind_strategy=remind_strategy, remind_sessions_num=remind_sessions_num,
                                         reminders_num=reminders_num, remind_mode=remind_mode)

        # cache relations once at startup
        self.session_item_map = dict() 
        self.item_session_map = dict()
        self.session_time = dict()
        self.min_time = -1
        self.session_user_map = dict()  # user_based

        self.sim_time = 0

    def fit(self, train, test=None, items=None):
        '''
        Trains the predictor.
        
        Parameters
        --------
        train: pandas.DataFrame
            Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs, one for user IDs and one for the timestamp of the events (unix timestamps).
            It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties).
            
        '''            
        self.num_items = train[self.item_key].max()
        
        index_session = train.columns.get_loc( self.session_key )
        index_item = train.columns.get_loc( self.item_key )
        index_time = train.columns.get_loc( self.time_key )
        index_user = train.columns.get_loc(self.user_key)  # user_based

        session = -1
        session_items = []
        time = -1
        user = -1  # user_based
        #cnt = 0
        for row in train.itertuples(index=False):
            # cache items of sessions
            if row[index_session] != session:
                if len(session_items) > 0:
                    self.session_item_map.update({session : session_items})
                    # cache the last time stamp of the session
                    self.session_time.update({session : time})
                    self.session_user_map.update({session: user})  # user_based
                    if time < self.min_time:
                        self.min_time = time
                user = row[index_user]  # user_based
                session = row[index_session]
                session_items = []
            time = row[index_time]
            session_items.append(row[index_item])
            
            # cache sessions involving an item
            map_is = self.item_session_map.get( row[index_item] )
            if map_is is None:
                map_is = set()
                self.item_session_map.update({row[index_item] : map_is})
            map_is.add(row[index_session])

            # add last viewed items (by the user) to the last_user_items dictionary
            if self.extend_session_length is not None:  # user_based
                self.extend_session_in_fit(row, index_user, index_item)

            # reminders
            if self.hasReminders:  # user_based  # for 'session_similarity' or 'recency'
                self.reminder.reminders_fit_in_loop(row, index_user, index_session, index_item)

        # reminders # save item_intensity in the last N session for each user
        if self.hasReminders:  # user_based
            self.reminder.reminders_fit(train, self.user_key, self.item_key, self.time_key)

        # Add the last tuple
        self.session_item_map.update({session : session_items})
        self.session_time.update({session : time})
        self.session_user_map.update({session: user})  # user_based

        
        if self.sample_size == 0: #use all session as possible neighbors
            print('!!!!! running KNN without a sample size (check config)')
        
    def predict_next( self, session_id, input_item_id, input_user_id, predict_for_item_ids=None, timestamp=0, skip=False, type='view'):
        '''
        Gives prediction scores for a selected set of items, indicating how likely each is to be the next item in the session.
                
        Parameters
        --------
        session_id : int or string
            The session ID of the event.
        input_item_id : int or string
            The item ID of the event. Must be in the set of item IDs of the training set.
        input_user_id : int or string
            The user ID of the event.
        predict_for_item_ids : 1D array
            IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set.
            
        Returns
        --------
        out : pandas.Series
            Prediction scores for selected items on how likely to be the next item of this session. Indexed by the item IDs.
        
        '''
        
#         gc.collect()
#         process = psutil.Process(os.getpid())
#         print( 'cknn.predict_next: ', process.memory_info().rss, ' memory used')
        
        if( self.session != session_id ): #new session
            
            if( self.extend ):
                self.session_item_map[self.session] = self.session_items
                for item in self.session_items:
                    map_is = self.item_session_map.get( item )
                    if map_is is None:
                        map_is = set()
                        self.item_session_map.update({item : map_is})
                    map_is.add(self.session)
                    
                ts = time.time()
                self.session_time.update({self.session : ts})
                self.session_user_map.update({self.session: input_user_id})  # user_based

            self.session = session_id
            self.session_items = list()
            self.relevant_sessions = set()
            self.items_previous = []  # user_based
            self.need_refine = True  # user_based

        if type == 'view':
            self.session_items.append( input_item_id )
        
        if skip:
            return

        items = self.session_items

        # we add extra items from the user profile as long as the session is not long enough
        if self.extend_session_length is not None and input_user_id in self.last_user_items:  # user_based
            items = self.extend_session_in_predict_next(items, input_user_id)

        neighbors = self.find_neighbors( items, input_item_id, session_id, timestamp, input_user_id)
        scores = self.score_items( neighbors, items, timestamp )
        
        # Create the prediction series in the expected output format
        predictions = np.zeros(len(predict_for_item_ids))
        mask = np.in1d( predict_for_item_ids, list(scores.keys()) )
        
        items = predict_for_item_ids[mask]
        values = [scores[x] for x in items]
        predictions[mask] = values
        series = pd.Series(data=predictions, index=predict_for_item_ids)

        if self.hasReminders:  # user_based
            if self.reminder.remind_strategy == 'hybrid':
                if self.reminder.w_SSim == 0:
                    series = self.reminder.reminders_predict_next(input_user_id, series, self.item_key,
                                                                      self.time_key, input_timestamp=timestamp)
                else:
                    past_user_sessions = self.calc_similarity(items, self.reminder.recent_user_sessions[input_user_id], timestamp, input_user_id)
                    series = self.reminder.reminders_predict_next(input_user_id, series, self.item_key, self.time_key,
                                                                  past_user_sessions=past_user_sessions, session_item_map=self.session_item_map, input_timestamp=timestamp)
            else:  # basic reminders
                if self.reminder.remind_strategy == 'session_similarity':
                    past_user_sessions = self.calc_similarity(items, self.reminder.recent_user_sessions[input_user_id], timestamp, input_user_id)
                    series = self.reminder.reminders_predict_next(input_user_id, series, self.item_key, self.time_key,
                                                                  past_user_sessions=past_user_sessions, session_item_map=self.session_item_map)
                if self.reminder.remind_strategy == 'recency':
                    series = self.reminder.reminders_predict_next(input_user_id, series, self.item_key, self.time_key)

        return series 
    
    def vec(self, current, neighbor, pos_map):
        '''
        Calculates the position-weighted overlap between the current session and a neighboring session.
        
        Parameters
        --------
        current: set of item ids (current session)
        neighbor: set of item ids (neighboring session)
        pos_map: dict of item id -> position weight
        
        Returns 
        --------
        out : float value           
        '''
        intersection = current & neighbor
        vp_sum = 0
        for i in intersection:
            vp_sum += pos_map[i]
        
        result = vp_sum / len(pos_map)

        return result
    
    def cosine(self, current, neighbor, pos_map):
        '''
        Calculates the (position-weighted) cosine similarity between two sessions.
        
        Parameters
        --------
        current: set of item ids (current session)
        neighbor: set of item ids (neighboring session)
        pos_map: dict of item id -> position weight, or None for the unweighted case
        
        Returns 
        --------
        out : float value           
        '''
                
        lneighbor = len(neighbor)
        intersection = current & neighbor
        
        if pos_map is not None:
            
            vp_sum = 0
            current_sum = 0
            for i in current:
                current_sum += pos_map[i] * pos_map[i]
                if i in intersection:
                    vp_sum += pos_map[i]
        else:
            vp_sum = len( intersection )
            current_sum = len( current )
                
        result = vp_sum / (sqrt(current_sum) * sqrt(lneighbor))
        
        return result
    
    
    def items_for_session(self, session):
        '''
        Returns all items in the session
        
        Parameters
        --------
        session: Id of a session
        
        Returns 
        --------
        out : list of item ids           
        '''
        return self.session_item_map.get(session)
    
    def sessions_for_item(self, item_id):
        '''
        Returns all session for an item
        
        Parameters
        --------
        item_id: Id of the item
        
        Returns 
        --------
        out : set           
        '''
        return self.item_session_map.get( item_id ) if item_id in self.item_session_map else set()
        
        
    def most_recent_sessions( self, sessions, number ):
        '''
        Find the most recent sessions in the given set
        
        Parameters
        --------
        sessions: set of session ids
        
        Returns 
        --------
        out : set           
        '''
        sample = set()

        tuples = list()
        for session in sessions:
            time = self.session_time.get( session )
            if time is None:
                print(' EMPTY TIMESTAMP!! ', session)
            tuples.append((session, time))
            
        tuples = sorted(tuples, key=itemgetter(1), reverse=True)
        #print 'sorted list ', sortedList
        cnt = 0
        for element in tuples:
            cnt = cnt + 1
            if cnt > number:
                break
            sample.add( element[0] )
        #print 'returning sample of size ', len(sample)
        return sample


    #-----------------
    # Find a set of neighbors, returns a list of tuples (sessionid: similarity) 
    #-----------------
    def find_neighbors( self, session_items, input_item_id, session_id, timestamp, user_id):
        '''
        Finds the k nearest neighbors for the given session_id and the current item input_item_id. 
        
        Parameters
        --------
        session_items: set of item ids
        input_item_id: int 
        session_id: int
        
        Returns 
        --------
        out : list of tuple (session_id, similarity)           
        '''
        possible_neighbors = self.possible_neighbor_sessions( session_items, input_item_id, session_id, user_id)
        possible_neighbors = self.calc_similarity( session_items, possible_neighbors, timestamp, user_id)
        
        possible_neighbors = sorted( possible_neighbors, reverse=True, key=lambda x: x[1] )
        possible_neighbors = possible_neighbors[:self.k]
        
        return possible_neighbors
    
    
    def possible_neighbor_sessions(self, session_items, input_item_id, session_id, user_id):
        '''
        Find a set of sessions to later find neighbors in.
        A self.sample_size of 0 uses all sessions in which any item of the current session appears.
        self.sampling can be performed with the options "recent" or "random".
        "recent" selects the self.sample_size most recent sessions while "random" just chooses randomly. 
        
        Parameters
        --------
        sessions: set of session ids
        
        Returns 
        --------
        out : set           
        '''
        
        self.relevant_sessions = self.relevant_sessions | self.sessions_for_item( input_item_id )
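        # accumulate candidate sessions as the session progresses: the union of
        # all training sessions that contain any item seen in the current session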
               
        if self.sample_size == 0: #use all session as possible neighbors
            
            #print('!!!!! running KNN without a sample size (check config)')
            return self.relevant_sessions

        else: #sample some sessions
                         
            if len(self.relevant_sessions) > self.sample_size:
                
                if self.sampling == 'recent':
                    sample = self.most_recent_sessions( self.relevant_sessions, self.sample_size )
                elif self.sampling == 'random':
                    sample = random.sample( self.relevant_sessions, self.sample_size )
                else:
                    sample = list(self.relevant_sessions)[:self.sample_size]
                    
                return sample
            else: 
                return self.relevant_sessions
                        

    def calc_similarity(self, session_items, sessions, timestamp, user_id):
        '''
        Calculates the configured similarity for the items in session_items and each session in sessions.
        
        Parameters
        --------
        session_items: set of item ids
        sessions: list of session ids
        
        Returns 
        --------
        out : list of tuple (session_id,similarity)           
        '''
        
        pos_map = None
        if self.lambda_spw:
            pos_map = {}
        length = len( session_items )
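        # pos_map assigns each item of the current session a position weight
        # exp((pos - length) / lambda_spw), so more recent clicks contribute
        # more to the similarity (see session_pos_weight below)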
        
        pos = 1
        for item in session_items:
            if self.lambda_spw is not None: 
                pos_map[item] = self.session_pos_weight( pos, length, self.lambda_spw )
                pos += 1
            
        #print 'nb of sessions to test ', len(sessionsToTest), ' metric: ', self.metric
        items = set(session_items)
        neighbors = []
        cnt = 0
        for session in sessions:
            cnt = cnt + 1
            # get items of the session, look up the cache first 
            n_items = self.items_for_session( session )

            similarity = self.cosine(items, set(n_items), pos_map) 
                            
            if self.lambda_snh is not None:
                sts = self.session_time[session]
                decay = self.session_time_weight(timestamp, sts, self.lambda_snh)
                
                similarity *= decay

            if self.boost_own_sessions is not None:  # user_based
                similarity = self.apply_boost(session, user_id, similarity)

            neighbors.append((session, similarity))
                
        return neighbors
    
    def session_pos_weight(self, position, length, lambda_spw):
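        # weight of an item at position `position` in a session of length `length`:
        # exp((position - length) / lambda_spw); the most recent click gets exp(0) = 1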
        diff = position - length
        return exp( diff / lambda_spw )
    
    def session_time_weight(self, ts_current, ts_neighbor, lambda_snh):
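        # recency weight of a neighboring session: exp(-(ts_current - ts_neighbor) / lambda_snh);
        # lambda_snh is given in days in the constructor and converted to seconds there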
        diff = ts_current - ts_neighbor
        return exp( - diff / lambda_snh )
            
    def score_items(self, neighbors, current_session, timestamp):
        '''
        Compute a set of scores for all items given a set of neighbors.
        
        Parameters
        --------
        neighbors: set of session ids
        
        Returns 
        --------
        out : dict of item id -> score           
        '''
        # now we have the set of relevant items to make predictions
        scores = dict()
        s_items = set( current_session )
        # iterate over the sessions
        for session in neighbors:
            # get the items in this session
            n_items = self.items_for_session( session[0] )
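            # pos_last: last position of every item in the neighbor session;
            # pos_i_star: position of the last item in the neighbor session that
            # also occurs in the current session (anchor for the item decay below)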
            
            pos_last = {}
            pos_i_star = None
            for i in range( len( n_items ) ):
                if n_items[i] in s_items: 
                    pos_i_star = i + 1
                pos_last[n_items[i]] = i + 1
            
            n_items = set( n_items )
            
            for item in n_items:
                
                if not self.remind and item in s_items:
                    continue
                
                old_score = scores.get( item )
                
                new_score = session[1]
                
                if self.lambda_inh is not None: 
                    new_score = new_score * self.item_pos_weight( pos_last[item], pos_i_star, self.lambda_inh )
                
                if old_score is not None:
                    new_score = old_score + new_score
                    
                scores.update({item : new_score})
                    
        return scores
    
    def item_pos_weight(self, pos_candidate, pos_item, lambda_inh):
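        # exp(-|pos_candidate - pos_item| / lambda_inh): candidate items that occurred
        # close to the matched item in the neighbor session get a higher weight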
        diff = abs( pos_candidate - pos_item )
        return exp( - diff / lambda_inh )
    
    def clear(self):
        self.session = -1
        self.session_items = []
        self.relevant_sessions = set()

        self.session_item_map = dict() 
        self.item_session_map = dict()
        self.session_time = dict()
        self.session_user_map = dict()  # user_based

    def support_users(self):
        '''
          whether it is a session-based or session-aware algorithm
          (if returns True, method "predict_with_training_data" must be defined as well)

          Parameters
          --------

          Returns
          --------
          True : if it is session-aware
          False : if it is session-based
        '''
        return True

    def predict_with_training_data(self):
        '''
            (this method must be defined if "support_users is True")
            whether it also needs to make prediction for training data or not (should we concatenate training and test data for making predictions)

            Parameters
            --------

            Returns
            --------
            True : e.g. hgru4rec
            False : e.g. uvsknn
            '''
        return False

    def extend_session_in_fit(self, row, index_user, index_item):
        if not row[index_user] in self.last_user_items:
            # create a new list to save the user's last viewed items
            self.last_user_items[row[index_user]] = []
        self.last_user_items[row[index_user]].append(row[index_item])
        if len(self.last_user_items[row[index_user]]) > self.extend_session_length:
            self.last_user_items[row[index_user]] = self.last_user_items[row[index_user]][
                                                    -self.extend_session_length:]

    def extend_session_in_predict_next(self, items, input_user_id):
        if len(items) < self.extend_session_length:
            # update the session with items from the users past
            n = len(self.session_items)
            addItems = self.extend_session_length - n
            prev_items = self.last_user_items[input_user_id][-addItems:]
            items = prev_items + self.session_items

            # if it is the beginning of the session => find relevant sessions for the added items
            if len(self.items_previous) == 0:
                for item in set(prev_items):
                    self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(item)
            # not beginning of the session, so we already retrieved neighbours for the extended session
            elif self.refine_mode:
                # if the first item from the previous step is no longer part of the current step => refine self.relevant_sessions
                if not self.items_previous[0] in items:
                    self.relevant_sessions = set()
                    for item in set(items):
                        self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(item)

            # update self.items_previous
            self.items_previous = items
        # the session is long enough => refine the self.relevant_sessions to just consider current session's items
        elif self.refine_mode and self.need_refine:
            self.relevant_sessions = set()
            for item in set(self.session_items):
                # then we can continue with just adding related sessions for the current item
                self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(item)
            # refine once after reaching the defined length; no need to refine again
            self.need_refine = False

        return items

    def apply_boost(self, session, user_id, similarity):
        if self.boost_own_sessions > 0.0 and self.session_user_map[session] == user_id:
            similarity = similarity + (similarity * self.boost_own_sessions)
        return similarity
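
# A minimal usage sketch (not part of the original class): it assumes this
# module's usual imports (pandas as pd, numpy as np) are in scope and uses
# purely illustrative column values.
if __name__ == '__main__':
    _toy = pd.DataFrame({
        'SessionId': [1, 1, 1, 2, 2, 3, 3, 3],
        'ItemId': [10, 11, 12, 10, 13, 11, 12, 13],
        'Time': [1., 2., 3., 10., 11., 20., 21., 22.],
        'UserId': [7, 7, 7, 8, 8, 7, 7, 7],
    })
    _model = USTAN(k=50, sample_size=500)
    _model.fit(_toy)
    # score all known items for a new event (session 4, item 10, user 7)
    _scores = _model.predict_next(4, 10, 7,
                                  predict_for_item_ids=np.array([10, 11, 12, 13]),
                                  timestamp=23.)
    print(_scores.sort_values(ascending=False).head())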
Example #2
class UNARM:
    '''
    Code based on work by Li et al., Neural Attentive Session-based Recommendation, CIKM 2017.

    NARM(factors=100, hidden_units=100, epochs=30, lr=0.001, session_key='SessionId', item_key='ItemId')

    Neural attentive session-based recommender. A GRU encodes the current session; an attention
    mechanism over the hidden states builds a local context vector that is concatenated with the
    last hidden state and decoded against the item embeddings with a bilinear layer.

    Parameters
    --------
    factors : int
        Size of the item embeddings. (Default value: 100)
    hidden_units : int
        Number of hidden units of the GRU. (Default value: 100)
    epochs : int
        Maximum number of training epochs. (Default value: 30)
    lr : float
        Learning rate. (Default value: 0.001)
    session_key : string
        Header of the session ID column in the input file. (default: 'SessionId')
    item_key : string
        Header of the item ID column in the input file. (default: 'ItemId')
    time_key : string
        Header of the timestamp column in the input file. (default: 'Time')
    user_key : string
        Header of the user ID column in the input file. (default: 'UserId')

    '''
    def __init__(self,
                 factors=100,
                 hidden_units=100,
                 epochs=30,
                 lr=0.001,
                 extend_session_length=None,
                 reminders=False,
                 remind_strategy='recency',
                 remind_sessions_num=6,
                 reminders_num=3,
                 remind_mode='end',
                 weight_base=1,
                 weight_IRec=0,
                 session_key='SessionId',
                 item_key='ItemId',
                 time_key='Time',
                 user_key='UserId'):
        self.factors = factors
        self.hidden_units = hidden_units
        self.epochs = epochs
        self.lr = lr

        self.session_key = session_key
        self.item_key = item_key

        self.session = -1
        self.session_items = list()

        self.floatX = theano.config.floatX

        # user_based
        self.time_key = time_key
        self.user_key = user_key
        self.extend_session_length = extend_session_length
        self.last_user_items = {}
        self.recent_user_items = {}
        self.recent_user_sessions = {}
        self.hasReminders = reminders
        if self.hasReminders:
            if remind_strategy == 'hybrid':
                self.reminder = Reminder(
                    remind_strategy=remind_strategy,
                    remind_sessions_num=remind_sessions_num,
                    weight_base=weight_base,
                    weight_IRec=weight_IRec)
            else:  # basic reminders
                self.reminder = Reminder(
                    remind_strategy=remind_strategy,
                    remind_sessions_num=remind_sessions_num,
                    reminders_num=reminders_num,
                    remind_mode=remind_mode)

    def fit(self, data, test=None):
        '''
        Trains the predictor.
        
        Parameters
        --------
        data: pandas.DataFrame
            Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps).
            It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties).
            
        '''

        nis = data[self.item_key].nunique()

        self.itemmap = pd.Series(index=data[self.item_key].unique(),
                                 data=range(1, nis + 1))
        data = data.merge(self.itemmap.to_frame('ItemIdx'),
                          how='inner',
                          right_index=True,
                          left_on=self.item_key)
        data.sort_values([self.session_key, self.time_key], inplace=True)

        self.traindata = self.create_training_data(data)
        self.dataload = (self.load_data, self.prepare_data)
        self.layers = {'gru': (self.param_init_gru, self.gru_layer)}

        self.train_gru(self.factors,
                       self.hidden_units,
                       max_epochs=self.epochs,
                       lrate=self.lr,
                       n_items=nis + 1)

    def train_gru(
            self,
            dim_proj=50,  # embedding dimension
            hidden_units=100,  # number of GRU hidden units
            patience=5,  # number of epochs to wait before early stopping if no progress
            max_epochs=30,  # the maximum number of epochs to run
            dispFreq=10000,  # display the training progress on stdout every N updates
            lrate=0.001,  # learning rate
            n_items=37484,  # vocabulary size
            encoder='gru',  # TODO: can be removed, must be gru
            saveto='gru_model.npz',  # the best model will be saved there
            is_valid=True,  # compute the validation error after each epoch
            is_save=False,  # save the parameters after each epoch (if saveto is set)
            batch_size=512,  # the batch size during training
            valid_batch_size=512,  # the batch size used for the validation/test set
            # parameters for extra options
            use_dropout=True,  # if False slightly faster, but worse test error
            # (this frequently needs a bigger model)
            reload_model=None,  # path to a saved model we want to start from
            test_size=-1,  # if >0, we keep only this number of test examples
    ):

        # Model options
        model_options = locals().copy()
        print("model options", model_options)

        load_data, prepare_data = self.get_dataset()

        print('Loading data')
        train, valid = load_data()

        print('Building model')
        # This creates the initial parameters as numpy ndarrays.
        # Dict name (string) -> numpy ndarray
        params = self.init_params(model_options)

        if reload_model:
            self.load_params('gru_model.npz', params)

        # This creates Theano shared variables from the parameters.
        # Dict name (string) -> Theano Tensor Shared Variable
        # params and tparams have different copy of the weights.
        tparams = self.init_tparams(params)

        # use_noise is for dropout
        (use_noise, x, mask, y, f_pred_prob,
         cost) = self.build_model(tparams, model_options)

        self.pred_function = f_pred_prob

        all_params = list(tparams.values())

        updates = self.adam(cost, all_params, lrate)

        train_function = theano.function(inputs=[x, mask, y],
                                         outputs=cost,
                                         updates=updates)

        print('Optimization')

        print("%d train examples" % len(train[0]))
        print("%d valid examples" % len(valid[0]))

        history_errs = []
        history_vali = []
        best_p = None
        bad_count = 0

        uidx = 0  # the number of update done
        estop = False  # early stop

        try:
            for eidx in range(max_epochs):
                start_time = time.time()
                n_samples = 0
                epoch_loss = []

                # Get new shuffled index for the training set.
                kf = self.get_minibatches_idx(len(train[0]),
                                              batch_size,
                                              shuffle=True)
                kf_valid = self.get_minibatches_idx(len(valid[0]),
                                                    valid_batch_size,
                                                    shuffle=True)

                for _, train_index in kf:
                    uidx += 1
                    use_noise.set_value(1.)

                    # Select the random examples for this minibatch
                    y = [train[1][t] for t in train_index]
                    x = [train[0][t] for t in train_index]

                    # Get the data in numpy.ndarray format
                    # This swaps the axes!
                    # Return something of shape (minibatch maxlen, n samples)
                    x, mask, y = prepare_data(x, y)
                    n_samples += x.shape[1]

                    loss = train_function(x, mask, y)
                    epoch_loss.append(loss)

                    if np.isnan(loss) or np.isinf(loss):
                        print('bad loss detected: ', loss)
                        return 1., 1., 1.

                    if np.mod(uidx, dispFreq) == 0:
                        print('Epoch ', eidx, 'Update ', uidx, 'Loss ',
                              np.mean(epoch_loss))

                if saveto and is_save:
                    print('Saving...')

                    if best_p is not None:
                        params = best_p
                    else:
                        params = self.unzip(tparams)
                    np.savez(saveto, history_errs=history_errs, **params)
                    print('Saving done')

                if is_valid:
                    use_noise.set_value(0.)

                    valid_evaluation = self.pred_evaluation(
                        f_pred_prob, prepare_data, valid, kf_valid)
                    history_errs.append([valid_evaluation])

                    if best_p is None or valid_evaluation[1] >= np.array(
                            history_vali).max():

                        best_p = self.unzip(tparams)
                        print('Best performance updated!')
                        bad_count = 0

                    print('Valid Recall@20:', valid_evaluation[0],
                          '   Valid Mrr@20:', valid_evaluation[1])

                    if len(history_vali) > 10 and valid_evaluation[
                            1] <= np.array(history_vali).max():
                        bad_count += 1
                        print('===========================>Bad counter: ' +
                              str(bad_count))
                        print('current validation mrr: ' +
                              str(valid_evaluation[1]) +
                              '      history max mrr:' +
                              str(np.array(history_vali).max()))
                        if bad_count > patience:
                            print('Early Stop!')
                            estop = True

                    history_vali.append(valid_evaluation[1])

                end_time = time.time()
                print('Seen %d samples' % n_samples)
                print(('This epoch took %.1fs' % (end_time - start_time)),
                      file=sys.stderr)

                if estop:
                    break

        except KeyboardInterrupt:
            print("Training interupted")

        if best_p is not None:
            self.zipp(best_p, tparams)
        else:
            best_p = self.unzip(tparams)

        use_noise.set_value(0.)
        valid_evaluation = self.pred_evaluation(f_pred_prob, prepare_data,
                                                valid, kf_valid)

        print('=================Best performance=================')
        print('Valid Recall@20:', valid_evaluation[0], '   Valid Mrr@20:',
              valid_evaluation[1])
        print('==================================================')
        if saveto and is_save:
            np.savez('Best_performance',
                     valid_evaluation=valid_evaluation,
                     history_errs=history_errs,
                     **best_p)

        self.params = params
        self.tparams = tparams

        return valid_evaluation

    def create_training_data(self, data):

        index_session = data.columns.get_loc(self.session_key)
        index_item = data.columns.get_loc('ItemIdx')
        index_item_original = data.columns.get_loc(self.item_key)
        index_user = data.columns.get_loc(self.user_key)

        out_seqs = []
        labs = []

        session = -1
        session_items = []

        for row in data.itertuples(index=False):

            # add last viewed items (by the user) to the last_user_items dictionary
            if self.extend_session_length is not None:  # user_based
                self.extend_session_model_in_loop(row, index_user,
                                                  index_item_original)

            # reminders
            if self.hasReminders:  # user_based  # for 'session_similarity' or 'recency'
                self.reminder.reminders_fit_in_loop(row, index_user,
                                                    index_session,
                                                    index_item_original)

            # cache items of sessions
            if row[index_session] != session:
                session = row[index_session]
                session_items = list()

            session_items.append(row[index_item])

            if len(session_items) > 1:
                out_seqs += [session_items[:-1]]
                labs += [session_items[-1]]

        # reminders
        if self.hasReminders:  # user_based
            self.reminder.reminders_fit(data, self.user_key, self.item_key,
                                        self.time_key)

        return out_seqs, labs

    def prepare_data(self, seqs, labels):
        """Create the matrices from the datasets.
        This pad each sequence to the same lenght: the lenght of the
        longuest sequence or maxlen.
        if maxlen is set, we will cut all sequence to this maximum
        lenght.
        This swap the axis!
        """
        # x: a list of sentences

        lengths = [len(s) for s in seqs]
        n_samples = len(seqs)
        maxlen = np.max(lengths)

        x = np.zeros((maxlen, n_samples)).astype('int64')
        x_mask = np.ones((maxlen, n_samples)).astype(self.floatX)
        for idx, s in enumerate(seqs):
            x[:lengths[idx], idx] = s

        x_mask *= (1 - (x == 0))

        return x, x_mask, labels

    def load_data(self, valid_portion=0.1, maxlen=19, sort_by_len=False):
        '''Loads the dataset
        :type valid_portion: float
        :param valid_portion: The proportion of the full train set used for
            the validation set.
        :type maxlen: None or positive int
        :param maxlen: the max sequence length we use in the train/valid set.
        :type sort_by_len: bool
        :param sort_by_len: Sort by the sequence length for the train,
            valid and test set. This allows faster execution as it causes
            less padding per minibatch. Another mechanism must be used to
            shuffle the train set at each epoch.
        '''

        #############
        # LOAD DATA #
        #############

        train_set = self.traindata

        if maxlen:
            new_train_set_x = []
            new_train_set_y = []
            for x, y in zip(train_set[0], train_set[1]):
                if len(x) < maxlen:
                    new_train_set_x.append(x)
                    new_train_set_y.append(y)
                else:
                    new_train_set_x.append(x[:maxlen])
                    new_train_set_y.append(y)
            train_set = (new_train_set_x, new_train_set_y)
            del new_train_set_x, new_train_set_y

        # split training set into validation set
        train_set_x, train_set_y = train_set
        n_samples = len(train_set_x)
        sidx = np.arange(n_samples, dtype='int32')
        np.random.shuffle(sidx)
        n_train = int(np.round(n_samples * (1. - valid_portion)))
        valid_set_x = [train_set_x[s] for s in sidx[n_train:]]
        valid_set_y = [train_set_y[s] for s in sidx[n_train:]]
        train_set_x = [train_set_x[s] for s in sidx[:n_train]]
        train_set_y = [train_set_y[s] for s in sidx[:n_train]]

        train_set = (train_set_x, train_set_y)
        valid_set = (valid_set_x, valid_set_y)

        valid_set_x, valid_set_y = valid_set
        train_set_x, train_set_y = train_set

        def len_argsort(seq):
            return sorted(range(len(seq)), key=lambda x: len(seq[x]))

        if sort_by_len:
            sorted_index = len_argsort(valid_set_x)
            valid_set_x = [valid_set_x[i] for i in sorted_index]
            valid_set_y = [valid_set_y[i] for i in sorted_index]

        train = (train_set_x, train_set_y)
        valid = (valid_set_x, valid_set_y)

        return train, valid

    def get_minibatches_idx(self, n, minibatch_size, shuffle=False):
        """
        Used to shuffle the dataset at each iteration.
        """

        idx_list = np.arange(n, dtype="int32")

        if shuffle:
            np.random.shuffle(idx_list)

        minibatches = []
        minibatch_start = 0
        for i in range(n // minibatch_size):
            minibatches.append(idx_list[minibatch_start:minibatch_start +
                                        minibatch_size])
            minibatch_start += minibatch_size

        if minibatch_start != n:
            # Make a minibatch out of what is left
            minibatches.append(idx_list[minibatch_start:])

        return zip(range(len(minibatches)), minibatches)

    def get_dataset(self):
        return self.dataload[0], self.dataload[1]

    def predict_next(self,
                     session_id,
                     input_item_id,
                     input_user_id,
                     predict_for_item_ids,
                     timestamp=0,
                     skip=False,
                     mode_type='view'):
        '''
        Gives prediction scores for a selected set of items, indicating how likely each is to be the next item in the session.
                
        Parameters
        --------
        session_id : int or string
            The session ID of the event.
        input_item_id : int or string
            The item ID of the event.
        input_user_id : int or string
            The user ID of the event.
        predict_for_item_ids : 1D array
            IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set.
            
        Returns
        --------
        out : pandas.Series
            Prediction scores for selected items on how likely to be the next item of this session. Indexed by the item IDs.
        
        '''

        if (self.session != session_id):  #new session

            self.session = session_id
            self.session_items = list()

        if mode_type == 'view':
            self.session_items.append(input_item_id)

        if skip:
            return

        x = [self.itemmap[self.session_items].values]
        y = x

        x, mask, y = self.prepare_data(x, y)
        preds = self.pred_function(x, mask)
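        # preds[0] holds one probability per item index; index 0 is the padding
        # slot (itemmap starts at 1), so the scores for real items start at
        # position 1 and align with itemmap.index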

        series = pd.Series(data=preds[0][1:], index=self.itemmap.index)

        if self.hasReminders:  # user_based
            if self.reminder.remind_strategy == 'hybrid':
                series = self.reminder.reminders_predict_next(
                    input_user_id,
                    series,
                    self.item_key,
                    self.time_key,
                    input_timestamp=timestamp)
            else:  # basic reminders
                series = self.reminder.reminders_predict_next(
                    input_user_id, series, self.item_key, self.time_key)

        return series

    def zipp(self, params, tparams):
        """
        When we reload the model. Needed for the GPU stuff.
        """
        for kk, vv in params.items():
            tparams[kk].set_value(vv)

    def unzip(self, zipped):
        """
        When we pickle the model. Needed for the GPU stuff.
        """
        new_params = OrderedDict()
        for kk, vv in zipped.items():
            new_params[kk] = vv.get_value()
        return new_params

    def dropout_layer(self, state_before, use_noise, trng, drop_p=0.5):
        retain = 1. - drop_p
        proj = T.switch(use_noise, (state_before * trng.binomial(
            state_before.shape, p=retain, n=1, dtype=state_before.dtype)),
                        state_before * retain)
        return proj

    def _p(self, pp, name):
        return '%s_%s' % (pp, name)

    def init_params(self, options):
        """
        Global (not GRU) parameters, for the embedding and the classifier.
        """
        params = OrderedDict()
        # embedding
        params['Wemb'] = self.init_weights(
            (options['n_items'], options['dim_proj']))
        params = self.get_layer(options['encoder'])[0](
            options, params, prefix=options['encoder'])
        # attention
        params['W_encoder'] = self.init_weights(
            (options['hidden_units'], options['hidden_units']))
        params['W_decoder'] = self.init_weights(
            (options['hidden_units'], options['hidden_units']))
        params['bl_vector'] = self.init_weights((1, options['hidden_units']))
        # classifier
        # params['U'] = init_weights((2*options['hidden_units'], options['n_items']))
        # params['b'] = np.zeros((options['n_items'],)).astype(config.floatX)
        params['bili'] = self.init_weights(
            (options['dim_proj'], 2 * options['hidden_units']))

        return params

    def load_params(self, path, params):
        pp = np.load(path)
        for kk, vv in params.items():
            if kk not in pp:
                raise Warning('%s is not in the archive' % kk)
            params[kk] = pp[kk]

        return params

    def init_tparams(self, params):
        tparams = OrderedDict()
        for kk, pp in params.items():
            tparams[kk] = theano.shared(params[kk], name=kk)
        return tparams

    def get_layer(self, name):
        fns = self.layers[name]
        return fns

    def init_weights(self, shape):
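        # He-style initialization: sigma = sqrt(2 / fan_in)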
        sigma = np.sqrt(2. / shape[0])
        return self.numpy_floatX(np.random.randn(*shape) * sigma)

    def ortho_weight(self, ndim):
        W = np.random.randn(ndim, ndim)
        u, s, v = np.linalg.svd(W)
        return u.astype(self.floatX)

    def param_init_gru(self, options, params, prefix='gru'):
        """
        Init the GRU parameter:
    
        :see: init_params
        """
        Wxrz = np.concatenate([
            self.init_weights((options['dim_proj'], options['hidden_units'])),
            self.init_weights((options['dim_proj'], options['hidden_units'])),
            self.init_weights((options['dim_proj'], options['hidden_units']))
        ],
                              axis=1)
        params[self._p(prefix, 'Wxrz')] = Wxrz

        Urz = np.concatenate([
            self.ortho_weight(options['hidden_units']),
            self.ortho_weight(options['hidden_units'])
        ],
                             axis=1)
        params[self._p(prefix, 'Urz')] = Urz

        Uh = self.ortho_weight(options['hidden_units'])
        params[self._p(prefix, 'Uh')] = Uh

        b = np.zeros((3 * options['hidden_units'], ))
        params[self._p(prefix, 'b')] = b.astype(self.floatX)
        return params

    def gru_layer(self,
                  tparams,
                  state_below,
                  options,
                  prefix='gru',
                  mask=None):
        nsteps = state_below.shape[0]
        if state_below.ndim == 3:
            n_samples = state_below.shape[1]
        else:
            n_samples = 1

        assert mask is not None

        def _slice(_x, n, dim):
            if _x.ndim == 3:
                return _x[:, :, n * dim:(n + 1) * dim]
            return _x[:, n * dim:(n + 1) * dim]

        def _step(m_, x_, h_):
            preact = T.dot(h_, tparams[self._p(prefix, 'Urz')])
            preact += x_[:, 0:2 * options['hidden_units']]

            z = T.nnet.hard_sigmoid(_slice(preact, 0, options['hidden_units']))
            r = T.nnet.hard_sigmoid(_slice(preact, 1, options['hidden_units']))
            h = T.tanh(
                T.dot((h_ * r), tparams[self._p(prefix, 'Uh')]) +
                _slice(x_, 2, options['hidden_units']))

            h = (1.0 - z) * h_ + z * h
            h = m_[:, None] * h + (1. - m_)[:, None] * h_

            return h

        state_below = (T.dot(state_below, tparams[self._p(prefix, 'Wxrz')]) +
                       tparams[self._p(prefix, 'b')])

        hidden_units = options['hidden_units']
        rval, updates = theano.scan(_step,
                                    sequences=[mask, state_below],
                                    outputs_info=T.alloc(
                                        self.numpy_floatX(0.), n_samples,
                                        hidden_units),
                                    name=self._p(prefix, '_layers'),
                                    n_steps=nsteps)
        return rval

    def adam(self,
             loss,
             all_params,
             learning_rate=0.001,
             b1=0.9,
             b2=0.999,
             e=1e-8,
             gamma=1 - 1e-8):
        """
        ADAM update rules
        Default values are taken from [Kingma2014]
    
        References:
        [Kingma2014] Kingma, Diederik, and Jimmy Ba.
        "Adam: A Method for Stochastic Optimization."
        arXiv preprint arXiv:1412.6980 (2014).
        http://arxiv.org/pdf/1412.6980v4.pdf
        """

        updates = OrderedDict()
        all_grads = theano.grad(loss, all_params)
        alpha = learning_rate
        t = theano.shared(np.float32(1).astype(self.floatX))
        b1_t = b1 * gamma**(
            t - 1)  #(Decay the first moment running average coefficient)

        for theta_previous, g in zip(all_params, all_grads):
            m_previous = theano.shared(
                np.zeros(theta_previous.get_value().shape, dtype=self.floatX))
            v_previous = theano.shared(
                np.zeros(theta_previous.get_value().shape, dtype=self.floatX))

            m = b1_t * m_previous + (
                1 - b1_t) * g  # (Update biased first moment estimate)
            v = b2 * v_previous + (
                1 - b2) * g**2  # (Update biased second raw moment estimate)
            m_hat = m / (1 - b1**t
                         )  # (Compute bias-corrected first moment estimate)
            v_hat = v / (
                1 - b2**t
            )  # (Compute bias-corrected second raw moment estimate)
            theta = theta_previous - (alpha * m_hat) / (T.sqrt(v_hat) + e
                                                        )  #(Update parameters)

            updates[m_previous] = m
            updates[v_previous] = v
            updates[theta_previous] = theta
        updates[t] = t + 1.

        return updates

    def build_model(self, tparams, options):
        trng = RandomStreams(SEED)

        # Used for dropout.
        use_noise = theano.shared(self.numpy_floatX(0.))

        x = T.matrix('x', dtype='int64')
        mask = T.matrix('mask', dtype=self.floatX)
        y = T.vector('y', dtype='int64')

        n_timesteps = x.shape[0]
        n_samples = x.shape[1]

        emb = tparams['Wemb'][x.flatten()].reshape(
            [n_timesteps, n_samples, options['dim_proj']])
        if options['use_dropout']:
            emb = self.dropout_layer(emb, use_noise, trng, drop_p=0.25)

        proj = self.get_layer(options['encoder'])[1](tparams,
                                                     emb,
                                                     options,
                                                     prefix=options['encoder'],
                                                     mask=mask)

        def compute_alpha(state1, state2):
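            # attention energy between hidden state j (state1) and the last
            # hidden state (state2): bl_vector . sigmoid(W_encoder h_j + W_decoder h_t)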
            tmp = T.nnet.hard_sigmoid(
                T.dot(tparams['W_encoder'], state1.T) +
                T.dot(tparams['W_decoder'], state2.T))
            alpha = T.dot(tparams['bl_vector'], tmp)
            res = T.sum(alpha, axis=0)
            return res

        last_h = proj[-1]

        sim_matrix, _ = theano.scan(fn=compute_alpha,
                                    sequences=proj,
                                    non_sequences=proj[-1])
        att = T.nnet.softmax(sim_matrix.T * mask.T) * mask.T
        p = att.sum(axis=1)[:, None]
        weight = att / p
        attention_proj = (proj * weight.T[:, :, None]).sum(axis=0)

        proj = T.concatenate([attention_proj, last_h], axis=1)
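        # proj now stacks the attention-weighted (local) context with the last
        # hidden state (global context); the bilinear layer below scores every
        # item as softmax(proj . (Wemb . bili)^T)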

        if options['use_dropout']:
            proj = self.dropout_layer(proj, use_noise, trng, drop_p=0.5)

        ytem = T.dot(tparams['Wemb'], tparams['bili'])
        pred = T.nnet.softmax(T.dot(proj, ytem.T))
        # pred = T.nnet.softmax(T.dot(proj, tparams['U']) + tparams['b'])

        f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob')
        # f_weight = theano.function([x, mask], weight, name='f_weight')

        off = 1e-8
        if pred.dtype == 'float16':
            off = 1e-6

        cost = -T.log(pred[T.arange(n_samples), y] + off).mean()

        return use_noise, x, mask, y, f_pred_prob, cost

    def pred_evaluation(self, f_pred_prob, prepare_data, data, iterator):
        """
        Compute recall@20 and mrr@20
        f_pred_prob: Theano fct computing the prediction
        prepare_data: usual prepare_data for that dataset.
        """
        recall = 0.0
        mrr = 0.0
        evaluation_point_count = 0
        # pred_res = []
        # att = []

        for _, valid_index in iterator:
            x, mask, y = prepare_data([data[0][t] for t in valid_index],
                                      np.array(data[1])[valid_index])
            preds = f_pred_prob(x, mask)
            # weights = f_weight(x, mask)
            targets = y
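            # rank of the target item: 1 + number of items scored strictly
            # higher than the target's own score, per sample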
            ranks = (preds.T > np.diag(preds.T[targets])).sum(axis=0) + 1
            rank_ok = (ranks <= 20)
            # pred_res += list(rank_ok)
            recall += rank_ok.sum()
            mrr += (1.0 / ranks[rank_ok]).sum()
            evaluation_point_count += len(ranks)
            # att.append(weights)

        recall = self.numpy_floatX(recall) / evaluation_point_count
        mrr = self.numpy_floatX(mrr) / evaluation_point_count
        eval_score = (recall, mrr)

        # ff = open('/storage/lijing/mydataset/res_attention_correct.pkl', 'wb')
        # pickle.dump(pred_res, ff)
        # ff.close()
        # ff2 = open('/storage/lijing/mydataset/attention_weights.pkl', 'wb')
        # pickle.dump(att, ff2)
        # ff2.close()

        return eval_score

    def numpy_floatX(self, data):
        return np.asarray(data, dtype=self.floatX)

    def clear(self):
        if hasattr(self, 'tparams'):
            for kk, vv in self.tparams.items():
                if len(self.params[kk].shape) == 1:
                    self.tparams[kk].set_value([])
                else:
                    self.tparams[kk].set_value([[]])

    def support_users(self):
        '''
          whether it is a session-based or session-aware algorithm
          (if returns True, method "predict_with_training_data" must be defined as well)

          Parameters
          --------

          Returns
          --------
          True : if it is session-aware
          False : if it is session-based
        '''
        return True

    def predict_with_training_data(self):
        '''
            (this method must be defined if "support_users is True")
            whether it also needs to make prediction for training data or not (should we concatenate training and test data for making predictions)

            Parameters
            --------

            Returns
            --------
            True : e.g. hgru4rec
            False : e.g. uvsknn
            '''
        return False

    def extend_session_model_in_loop(self, row, index_user, index_item):
        if not row[index_user] in self.last_user_items:
            self.last_user_items[row[index_user]] = [
            ]  # create a new list to save the user's last viewed items
        self.last_user_items[row[index_user]].append(row[index_item])
        if len(self.last_user_items[
                row[index_user]]) > self.extend_session_length:
            self.last_user_items[row[index_user]] = self.last_user_items[
                row[index_user]][-self.extend_session_length:]

    def predict_for_extended_model(
        self, input_user_id
    ):  # will be called in "evaluation_user_based" and "evaluation_user_based_multiple"
        prev_items = []
        # we add extra items from the user profile at the beginning of the session
        self.session_items = list()
        if self.extend_session_length is not None and input_user_id in self.last_user_items:  # user_based
            prev_items = self.last_user_items[input_user_id]
        return prev_items
Example #3
class UVMContextKNN:
    '''
    VMContextKNN( k, sample_size=1000, sampling='recent', similarity='cosine', weighting='div', dwelling_time=False, last_n_days=None, last_n_clicks=None, extend=False, weighting_score='div_score', weighting_time=False, normalize=True, session_key = 'SessionId', item_key= 'ItemId', time_key= 'Time')

    Parameters
    -----------
    k : int
        Number of neighboring sessions to calculate the item scores from. (Default value: 100)
    sample_size : int
        Defines the length of a subset of all training sessions to calculate the nearest neighbors from. (Default value: 1000)
    sampling : string
        String to define the sampling method for sessions (recent, random). (default: recent)
    similarity : string
        String to define the method for the similarity calculation (jaccard, cosine, binary, tanimoto). (default: cosine)
    weighting : string
        Decay function to determine the importance/weight of individual actions in the current session (linear, same, div, log, quadratic). (default: div)
    weighting_score : string
        Decay function to lower the score of candidate items from neighboring sessions that were selected by less recently clicked items in the current session (linear, same, div, log, quadratic). (default: div)
    weighting_time : boolean
        Experimental function to give less weight to items from older sessions (default: False)
    dwelling_time : boolean
        Experimental function to use the dwelling time for item view actions as a weight in the similarity calculation. (default: False)
    last_n_days : int
        Use only data from the last N days. (default: None)
    last_n_clicks : int
        Use only the last N clicks of the current session when recommending. (default: None)
    extend : bool
        Add evaluated sessions to the maps.
    normalize : bool
        Normalize the scores in the end.
    session_key : string
        Header of the session ID column in the input file. (default: 'SessionId')
    item_key : string
        Header of the item ID column in the input file. (default: 'ItemId')
    time_key : string
        Header of the timestamp column in the input file. (default: 'Time')
    user_key : string
        Header of the user ID column in the input file. (default: 'UserId')

    extend_session_length: int
        Pad the current session with the user's last viewed items until it reaches this length. (default: None)

    extending_mode: string
        How to extend the user session (default: lastViewed)
        lastViewed: extend the current user's session with his/her last viewed items #TODO: now it saves just X last items, and they might be exactly the same: can try as well: save 5 distinct items
        score_based: higher score if the items appeared in more previous sessions AND more recently #TODO

    boost_own_sessions: double
        To increase the impact of (give more weight to) the sessions that belong to the user. (default: None)
        The value is added to 1.0; for example, with boost_own_sessions=0.2 the weight will be 1.2.

    past_neighbors: bool
        Include the neighbours of the user's past sessions that are similar to the current session as neighbours. (default: False)

    reminders: bool
        Include reminding items in the (main) recommendation list. (default: False)

    remind_strategy: string
        Ranking strategy of the reminding list (recency, session_similarity). (default: recency)

    remind_sessions_num: int
        Number of the user's last sessions from which the possible reminding items are taken (default: 6)

    reminders_num: int
        length of the reminding list (default: 3)

    remind_mode: string
        The position of the reminding items in the recommendation list (top, end). (default: end)

    '''
    def __init__(self,
                 k,
                 sample_size=1000,
                 sampling='recent',
                 similarity='cosine',
                 weighting='div',
                 dwelling_time=False,
                 last_n_days=None,
                 last_n_clicks=None,
                 weighting_score='div',
                 weighting_time=False,
                 normalize=True,
                 idf_weighting=False,
                 idf_weighting_session=False,
                 remind=True,
                 push_reminders=False,
                 add_reminders=False,
                 extend=False,
                 extending_mode='lastViewed',
                 extend_session_length=None,
                 refine_mode=True,
                 boost_own_sessions=None,
                 past_neighbors=False,
                 reminders=False,
                 remind_strategy='recency',
                 remind_sessions_num=6,
                 reminders_num=3,
                 remind_mode='end',
                 weight_base=1,
                 weight_IRec=0,
                 weight_SSim=0,
                 session_key='SessionId',
                 item_key='ItemId',
                 time_key='Time',
                 user_key='UserId'):

        self.k = k
        self.sample_size = sample_size
        self.sampling = sampling
        self.weighting = weighting
        self.dwelling_time = dwelling_time
        self.weighting_score = weighting_score
        self.weighting_time = weighting_time
        self.similarity = similarity
        self.session_key = session_key
        self.item_key = item_key
        self.time_key = time_key
        self.user_key = user_key  # user_based
        self.idf_weighting = idf_weighting
        self.idf_weighting_session = idf_weighting_session
        self.normalize = normalize
        self.last_n_days = last_n_days
        self.last_n_clicks = last_n_clicks
        self.remind = remind  # True/False. If False: items from the current session will be excluded
        self.push_reminders = push_reminders
        self.add_reminders = add_reminders
        self.extend = extend
        self.extending_mode = extending_mode
        # user_based
        self.extend_session_length = extend_session_length
        self.boost_own_sessions = boost_own_sessions
        self.past_neighbors = past_neighbors
        self.refine_mode = refine_mode
        # reminders
        self.hasReminders = reminders
        if self.hasReminders:
            if remind_strategy == 'hybrid':
                self.reminder = Reminder(
                    remind_strategy=remind_strategy,
                    remind_sessions_num=remind_sessions_num,
                    weight_base=weight_base,
                    weight_IRec=weight_IRec,
                    weight_SSim=weight_SSim)
            else:  # basic reminders
                self.reminder = Reminder(
                    remind_strategy=remind_strategy,
                    remind_sessions_num=remind_sessions_num,
                    reminders_num=reminders_num,
                    remind_mode=remind_mode)

        # updated while recommending
        self.session = -1
        self.session_items = []
        self.relevant_sessions = set()
        # user_based
        self.items_previous = []
        self.last_user_items = {}  # to extend the session model
        self.recent_user_items = {}  # to remind
        self.recent_user_sessions = {}  # to remind
        self.user_item_intensity = dict(
        )  # to remind (for 'session_similarity')

        # cache relations once at startup (in fit)
        self.session_item_map = dict()
        self.item_session_map = dict()
        self.session_time = dict()
        self.min_time = -1
        self.session_user_map = dict()  # user_based

        self.sim_time = 0

    def fit(self, data, items=None):
        '''
        Trains the predictor.

        Parameters
        --------
        data: pandas.DataFrame
            Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps).
            It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties).

        '''

        if self.last_n_days is not None:
            max_time = dt.fromtimestamp(data[self.time_key].max())
            date_threshold = max_time.date() - td(self.last_n_days)
            stamp = dt.combine(date_threshold, dt.min.time()).timestamp()
            train = data[data[self.time_key] >= stamp]

        else:
            train = data

        self.num_items = train[self.item_key].max()

        # get the position of the columns
        index_session = train.columns.get_loc(self.session_key)
        index_item = train.columns.get_loc(self.item_key)
        index_time = train.columns.get_loc(self.time_key)
        index_user = train.columns.get_loc(self.user_key)  # user_based

        session = -1
        session_items = set()
        time = -1
        user = -1  # user_based
        # cnt = 0
        prev_s_id = -1
        for row in train.itertuples(index=False):
            # cache items of sessions
            if row[index_session] != session:
                if len(session_items) > 0:
                    self.session_item_map.update({session: session_items})
                    # cache the last time stamp of the session
                    self.session_time.update({session: time})
                    self.session_user_map.update({session: user})  # user_based
                    if time < self.min_time:
                        self.min_time = time
                user = row[index_user]  # user_based
                session = row[index_session]
                session_items = set()
            time = row[index_time]
            session_items.add(row[index_item])

            # cache sessions involving an item
            map_is = self.item_session_map.get(row[index_item])
            if map_is is None:
                map_is = set()
                self.item_session_map.update({row[index_item]: map_is})
            map_is.add(row[index_session])

            # add last viewed items (by the user) to the last_user_items dictionary
            if self.extend_session_length is not None:  # user_based
                self.extend_session_in_fit(row, index_user, index_item)

            # reminders
            if self.hasReminders:  # user_based  # for 'session_similarity' or 'recency'
                self.reminder.reminders_fit_in_loop(row, index_user,
                                                    index_session, index_item)

        # reminders # save item_intensity in the last N session for each user
        if self.hasReminders:  # user_based
            self.reminder.reminders_fit(train, self.user_key, self.item_key,
                                        self.time_key)

        # Add the last tuple
        self.session_item_map.update({session: session_items})
        self.session_time.update({session: time})
        self.session_user_map.update({session: user})  # user_based

        if self.idf_weighting or self.idf_weighting_session:
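            # inverse document frequency per item:
            # idf(item) = log(#training sessions / #occurrences of the item in the training data)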
            self.idf = pd.DataFrame()
            self.idf['idf'] = train.groupby(self.item_key).size()
            self.idf['idf'] = np.log(train[self.session_key].nunique() /
                                     self.idf['idf'])
            self.idf = self.idf['idf'].to_dict()

    def predict_next(self,
                     session_id,
                     input_item_id,
                     input_user_id,
                     predict_for_item_ids=None,
                     skip=False,
                     mode_type='view',
                     timestamp=0):
        '''
        Gives prediction scores for a selected set of items on how likely they are to be the next item in the session.

        Parameters
        --------
        session_id : int or string
            The session IDs of the event.
        input_item_id : int or string
            The item ID of the event. Must be in the set of item IDs of the training set.
        predict_for_item_ids : 1D array
            IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set.

        Returns
        --------
        out : pandas.Series
            Prediction scores for selected items on how likely they are to be the next item of this session. Indexed by the item IDs.

        '''

        #         gc.collect()
        #         process = psutil.Process(os.getpid())
        #         print( 'cknn.predict_next: ', process.memory_info().rss, ' memory used')

        if (self.session != session_id):  # new session

            if (self.extend):  # add evaluated sessions to the maps.
                item_set = set(self.session_items)
                self.session_item_map[self.session] = item_set
                for item in item_set:
                    map_is = self.item_session_map.get(item)
                    if map_is is None:
                        map_is = set()
                        self.item_session_map.update({item: map_is})
                    map_is.add(self.session)

                ts = time.time()
                self.session_time.update({self.session: ts})
                self.session_user_map.update({self.session:
                                              input_user_id})  # user_based

            self.last_ts = -1
            self.session = session_id
            self.session_items = list()
            self.dwelling_times = list()
            self.relevant_sessions = set()
            self.items_previous = []  # user_based
            self.need_refine = True  # user_based

        if mode_type == 'view':
            self.session_items.append(input_item_id)
            if self.dwelling_time:
                if self.last_ts > 0:
                    self.dwelling_times.append(timestamp - self.last_ts)
                self.last_ts = timestamp

        if skip:
            return

        items = self.session_items if self.last_n_clicks is None else self.session_items[
            -self.last_n_clicks:]

        # we add extra items from the user profile as long as the session is not long enough!
        if self.extend_session_length is not None and input_user_id in self.last_user_items:  # user_based
            items = self.extend_session_in_predict_next(items, input_user_id)

        neighbors = self.find_neighbors(items, input_item_id, session_id,
                                        self.dwelling_times, timestamp,
                                        input_user_id)

        scores = self.score_items(neighbors, items, timestamp)

        # map the computed scores onto the full array of candidate items
        predictions = np.zeros(len(predict_for_item_ids))
        mask = np.in1d(predict_for_item_ids, list(scores.keys()))

        predict_for_items = predict_for_item_ids[mask]
        values = [scores[x] for x in predict_for_items]
        predictions[mask] = values
        series = pd.Series(data=predictions, index=predict_for_item_ids)

        if self.hasReminders:  # user_based
            if self.reminder.remind_strategy == 'hybrid':
                if self.reminder.w_SSim == 0:
                    series = self.reminder.reminders_predict_next(
                        input_user_id,
                        series,
                        self.item_key,
                        self.time_key,
                        input_timestamp=timestamp)
                else:
                    past_user_sessions = self.calc_similarity(
                        items,
                        self.reminder.recent_user_sessions[input_user_id],
                        self.dwelling_times, timestamp, input_user_id)
                    series = self.reminder.reminders_predict_next(
                        input_user_id,
                        series,
                        self.item_key,
                        self.time_key,
                        past_user_sessions=past_user_sessions,
                        session_item_map=self.session_item_map,
                        input_timestamp=timestamp)
            else:  # basic reminders
                if self.reminder.remind_strategy == 'session_similarity':
                    past_user_sessions = self.calc_similarity(
                        items,
                        self.reminder.recent_user_sessions[input_user_id],
                        self.dwelling_times, timestamp, input_user_id)
                    series = self.reminder.reminders_predict_next(
                        input_user_id,
                        series,
                        self.item_key,
                        self.time_key,
                        past_user_sessions=past_user_sessions,
                        session_item_map=self.session_item_map)
                if self.reminder.remind_strategy == 'recency':
                    series = self.reminder.reminders_predict_next(
                        input_user_id, series, self.item_key, self.time_key)

        if self.push_reminders:  # give more score to the items that belong to the current session

            if self.extend_session_length is not None and self.need_refine:
                session_items_series = pd.Series(items)
            else:
                session_items_series = pd.Series(self.session_items)

            session_count = session_items_series.groupby(
                session_items_series).count() + 1
            # multiply the score of each item from the current session by (its occurrence count in the session + 1)
            series[
                session_count.
                index] *= session_count  # TODO: contains the same item several times

        if self.add_reminders:  # force the last 3 items of the current session to be in the top 20 recommendable items
            if self.extend_session_length is not None and self.need_refine:
                # scores of the items that belong to the current session
                session_series = pd.Series(index=items, data=series[items])
            else:
                # scores of the items that belong to the current session
                session_series = pd.Series(index=self.session_items,
                                           data=series[self.session_items])
            session_series = session_series[session_series >
                                            0]  # keep them if their scores > 0
            if len(session_series) > 0:
                session_series = session_series.iloc[:
                                                     3]  # keep the first 3 items
                # sort the predictions (sort recommendable items according to their scores)
                series.sort_values(ascending=False, inplace=True)
                session_series = session_series[session_series <
                                                series.iloc[19 -
                                                            3]]  # TODO: 19-3
                series[session_series.
                       index] = series.iloc[19 - 3] + 1e-4  # TODO: 1e-4

        if self.normalize:
            series = series / series.max()

        return series

    def item_pop(self, sessions):
        '''
        Returns a dict(item,score) of the item popularity for the given list of sessions (only a set of ids)

        Parameters
        --------
        sessions: set

        Returns
        --------
        out : dict
        '''
        result = dict()
        max_pop = 0
        for session, weight in sessions:
            items = self.items_for_session(session)
            for item in items:

                count = result.get(item)
                if count is None:
                    result.update({item: 1})
                else:
                    result.update({item: count + 1})

                if (result.get(item) > max_pop):
                    max_pop = result.get(item)

        for key in result:
            result.update({key: (result[key] / max_pop)})

        return result

    def jaccard(self, first, second):
        '''
        Calculates the jaccard index for two sessions

        Parameters
        --------
        first: set of items of a session
        second: set of items of a session

        Returns
        --------
        out : float value
        '''
        sc = time.perf_counter()
        intersection = len(first & second)
        union = len(first | second)
        res = intersection / union

        self.sim_time += (time.perf_counter() - sc)

        return res

    def cosine(self, first, second):
        '''
        Calculates the cosine similarity for two sessions

        Parameters
        --------
        first: set of items of a session
        second: set of items of a session

        Returns
        --------
        out : float value
        '''
        li = len(first & second)
        la = len(first)
        lb = len(second)
        result = li / sqrt(la * lb)

        return result

    def tanimoto(self, first, second):
        '''
        Calculates the Tanimoto (extended Jaccard) similarity for two sessions

        Parameters
        --------
        first: set of items of a session
        second: set of items of a session

        Returns
        --------
        out : float value
        '''
        li = len(first & second)
        la = len(first)
        lb = len(second)
        result = li / (la + lb - li)

        return result

    def binary(self, first, second):
        '''
        Calculates a binary overlap similarity, 2*|A∩B| / (2*|A∩B| + |A| + |B|), for two sessions

        Parameters
        --------
        first: set of items of a session
        second: set of items of a session

        Returns
        --------
        out : float value
        '''
        a = len(first & second)
        b = len(first)
        c = len(second)

        result = (2 * a) / ((2 * a) + b + c)

        return result

    def vec(self, first, second, map):
        '''
        Calculates a weighted overlap score for two sessions: sums the position weights (from map) of the shared items and normalizes by the number of entries in map

        Parameters
        --------
        first: set of items of the current session
        second: set of items of a neighboring session
        map: dict of item -> position weight for the current session

        Returns
        --------
        out : float value
        '''
        a = first & second
        sum = 0
        for i in a:
            sum += map[i]

        result = sum / len(map)

        return result

    def items_for_session(self, session):
        '''
        Returns all items in the session

        Parameters
        --------
        session: Id of a session

        Returns
        --------
        out : set
        '''
        return self.session_item_map.get(session)

    def vec_for_session(self, session):
        '''
        Returns the vector cached for the session

        Parameters
        --------
        session: Id of a session

        Returns
        --------
        out : set
        '''
        return self.session_vec_map.get(session)

    def sessions_for_item(self, item_id):
        '''
        Returns all session for an item

        Parameters
        --------
        item_id: Id of the item

        Returns
        --------
        out : set
        '''
        return self.item_session_map.get(
            item_id) if item_id in self.item_session_map else set()

    def most_recent_sessions(self, sessions, number):
        '''
        Find the most recent sessions in the given set

        Parameters
        --------
        sessions: set of session ids

        Returns
        --------
        out : set
        '''
        sample = set()

        tuples = list()
        for session in sessions:
            time = self.session_time.get(session)
            if time is None:
                print(' EMPTY TIMESTAMP!! ', session)
            tuples.append((session, time))

        tuples = sorted(tuples, key=itemgetter(1), reverse=True)
        # print 'sorted list ', sortedList
        cnt = 0
        for element in tuples:
            cnt = cnt + 1
            if cnt > number:
                break
            sample.add(element[0])
        # print 'returning sample of size ', len(sample)
        return sample

    def possible_neighbor_sessions(self, session_items, input_item_id,
                                   session_id, user_id):
        '''
        Find a set of sessions to later find neighbors in.
        A self.sample_size of 0 uses all sessions in which any item of the current session appears.
        self.sampling can be performed with the options "recent" or "random".
        "recent" selects the self.sample_size most recent sessions, while "random" just chooses randomly.

        Parameters
        --------
        sessions: set of session ids

        Returns
        --------
        out : set
        '''

        # add relevant sessions for the current item
        self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(
            input_item_id)

        if self.past_neighbors:  # user-based
            self.retrieve_past_neighbors(user_id)

        if self.sample_size == 0:  # use all session as possible neighbors

            print('!!!!! running KNN without a sample size (check config)')
            possible_neighbors = self.relevant_sessions

        else:  # sample some sessions
            if len(self.relevant_sessions) > self.sample_size:

                if self.sampling == 'recent':
                    sample = self.most_recent_sessions(self.relevant_sessions,
                                                       self.sample_size)
                elif self.sampling == 'random':
                    sample = random.sample(list(self.relevant_sessions),
                                           self.sample_size)
                else:
                    sample = list(self.relevant_sessions)[:self.sample_size]

                possible_neighbors = sample
            else:
                possible_neighbors = self.relevant_sessions

        return possible_neighbors

    def calc_similarity(self, session_items, sessions, dwelling_times,
                        timestamp, user_id):
        '''
        Calculates the configured similarity for the items in session_items and each session in sessions.

        Parameters
        --------
        session_items: set of item ids
        sessions: list of session ids

        Returns
        --------
        out : list of tuple (session_id,similarity)
        '''

        pos_map = {}
        length = len(session_items)

        count = 1
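        # position weights for the current session: the item at 1-based position i gets
        # weight self.weighting(i, length), e.g. 'div' -> i/length, so recent items count more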
        for item in session_items:
            if self.weighting is not None:
                pos_map[item] = getattr(self, self.weighting)(count, length)
                count += 1
            else:
                pos_map[item] = 1

        if self.dwelling_time:
            dt = dwelling_times.copy()
            dt.append(0)
            dt = pd.Series(dt, index=session_items)
            dt = dt / dt.max()
            # dt[session_items[-1]] = dt.mean() if len(session_items) > 1 else 1
            dt[session_items[-1]] = 1

            # print(dt)
            for i in range(len(dt)):
                pos_map[session_items[i]] *= dt.iloc[i]
            # print(pos_map)

        if self.idf_weighting_session:
            max = -1
            for item in session_items:
                pos_map[item] = self.idf[item] if item in self.idf else 0
        #                 if pos_map[item] > max:
        #                     max = pos_map[item]
        #             for item in session_items:
        #                 pos_map[item] = pos_map[item] / max

        # print 'nb of sessions to test ', len(sessionsToTest), ' metric: ', self.metric
        items = set(session_items)
        neighbors = []
        cnt = 0
        for session in sessions:
            cnt = cnt + 1
            # get items of the session, look up the cache first
            n_items = self.items_for_session(session)
            sts = self.session_time[session]

            # dot product
            similarity = self.vec(items, n_items, pos_map)
            if similarity > 0:

                if self.weighting_time:
                    diff = timestamp - sts
                    days = round(diff / 60 / 60 / 24)
                    decay = pow(7 / 8, days)
                    similarity *= decay

                # print("days:",days," => ",decay)

                if self.boost_own_sessions is not None:  # user_based
                    similarity = self.apply_boost(session, user_id, similarity)

                neighbors.append((session, similarity))

        return neighbors

    # -----------------
    # Find a set of neighbors, returns a list of tuples (sessionid: similarity)
    # -----------------
    def find_neighbors(self, session_items, input_item_id, session_id,
                       dwelling_times, timestamp, user_id):
        '''
        Finds the k nearest neighbors for the given session_id and the current item input_item_id.

        Parameters
        --------
        session_items: set of item ids
        input_item_id: int
        session_id: int

        Returns
        --------
        out : list of tuple (session_id, similarity)
        '''
        possible_neighbors = self.possible_neighbor_sessions(
            session_items, input_item_id, session_id, user_id)  # user_based
        possible_neighbors = self.calc_similarity(session_items,
                                                  possible_neighbors,
                                                  dwelling_times, timestamp,
                                                  user_id)  # user_based

        possible_neighbors = sorted(possible_neighbors,
                                    reverse=True,
                                    key=lambda x: x[1])
        possible_neighbors = possible_neighbors[:self.k]

        return possible_neighbors

    def score_items(self, neighbors, current_session, timestamp):
        '''
        Compute a set of scores for all items given a set of neighbors.

        Parameters
        --------
        neighbors: set of session ids

        Returns
        --------
        out : list of tuple (item, score)
        '''
        # now we have the set of relevant items to make predictions
        scores = dict()
        item_set = set(current_session)
        # iterate over the sessions
        for session in neighbors:
            # get the items in this session
            items = self.items_for_session(session[0])
            step = 1
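            # walk back through the current session: 'step' counts how many events ago the
            # most recent item shared with this neighbor session occurred; that step
            # determines the decay weight applied to all of the neighbor's items below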

            for item in reversed(current_session):
                if item in items:
                    decay = getattr(self,
                                    self.weighting_score + '_score')(step)
                    break
                step += 1

            for item in items:

                if not self.remind and item in item_set:  # d=0* (exclude items from the current session)
                    continue  # don't remind the user of an item that is already in his current (extended) session

                old_score = scores.get(item)
                new_score = session[1]
                new_score = new_score if not self.idf_weighting else new_score + (
                    new_score * self.idf[item] * self.idf_weighting)
                new_score = new_score * decay

                if not old_score is None:
                    new_score = old_score + new_score

                scores.update({item: new_score})

        return scores
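
    # *_score(i): decay applied to a neighbor's score based on how many steps back the
    # matching item of the current session occurred (selected via weighting_score).
    # linear/same/div/log/quadratic(i, length): position weights for the items of the
    # current session (selected via weighting, used in calc_similarity).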

    def linear_score(self, i):
        return 1 - (0.1 * i) if i <= 100 else 0

    def same_score(self, i):
        return 1

    def div_score(self, i):
        return 1 / i

    def log_score(self, i):
        return 1 / (log10(i + 1.7))

    def quadratic_score(self, i):
        return 1 / (i * i)

    def linear(self, i, length):
        return 1 - (0.1 * (length - i)) if i <= 10 else 0

    def same(self, i, length):
        return 1

    def div(self, i, length):
        return i / length

    def log(self, i, length):
        return 1 / (log10((length - i) + 1.7))

    def quadratic(self, i, length):
        return (i / length)**2

    def clear(self):
        self.session = -1
        self.session_items = []
        self.relevant_sessions = set()

        self.session_item_map = dict()
        self.item_session_map = dict()
        self.session_time = dict()
        self.session_user_map = dict()  # user_based

    def support_users(self):
        '''
            whether it is a session-based or session-aware algorithm
            (if returns True, method "predict_with_training_data" must be defined as well)

            Parameters
            --------

            Returns
            --------
            True : if it is session-aware
            False : if it is session-based
        '''
        return True

    def predict_with_training_data(self):
        '''
            (this method must be defined if "support_users" returns True)
            whether it also needs to make predictions for the training data (i.e. whether training and test data should be concatenated when making predictions)

            Parameters
            --------

            Returns
            --------
            True : e.g. hgru4rec
            False : e.g. uvsknn
            '''
        return False

    def extend_session_in_fit(self, row, index_user, index_item):
        if not row[index_user] in self.last_user_items:
            # create a new list to save the user's last viewed items
            self.last_user_items[row[index_user]] = []
        self.last_user_items[row[index_user]].append(row[index_item])
        if len(self.last_user_items[
                row[index_user]]) > self.extend_session_length:
            self.last_user_items[row[index_user]] = self.last_user_items[
                row[index_user]][-self.extend_session_length:]

    def extend_session_in_predict_next(self, items, input_user_id):
        if len(items) < self.extend_session_length:
            # update the session with items from the users past
            n = len(self.session_items)
            addItems = self.extend_session_length - n
            prev_items = self.last_user_items[input_user_id][-addItems:]
            items = prev_items + self.session_items

            # if it is beginning of the session => find relevant sessions for added items
            if len(self.items_previous) == 0:
                for item in set(prev_items):
                    self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(
                        item)
            # not beginning of the session, so we already retrieved neighbours for the extended session
            elif self.refine_mode:
                # if the first item that was in the previous step, is not in the current step anymore => refine the self.relevant_sessions
                if not self.items_previous[0] in items:
                    self.relevant_sessions = set()
                    for item in set(items):
                        self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(
                            item)

            # update self.items_previous
            self.items_previous = items
        # the session is long enough => refine the self.relevant_sessions to just consider current session's items
        elif self.refine_mode and self.need_refine:
            self.relevant_sessions = set()
            for item in set(self.session_items):
                # then we can continue with just adding related sessions for the current item
                self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(
                    item)
            # refined once after reach to the defined length, no need to do refine anymore
            self.need_refine = False

        return items

    def apply_boost(self, session, user_id, similarity):
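        # if the neighbor session belongs to the same user, scale its similarity up:
        # similarity * (1 + boost_own_sessions), e.g. boost_own_sessions=0.2 -> factor 1.2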
        if self.boost_own_sessions > 0.0 and self.session_user_map[
                session] == user_id:
            similarity = similarity + (similarity * self.boost_own_sessions)
        return similarity

    def retrieve_past_neighbors(self, user_id):
        for neighbor_sid in self.relevant_sessions:
            if self.session_user_map[neighbor_sid] == user_id:
                for item in self.items_for_session(neighbor_sid):
                    self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(
                        item)
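
A minimal usage sketch for UVMContextKNN (not part of the original listing; the toy data, column names, and parameter values below are illustrative assumptions, and the imports at the top of the original file are expected to be available):

import pandas as pd

# toy training data using the default column names
train = pd.DataFrame({
    'SessionId': [1, 1, 2, 2, 3],
    'ItemId':    [10, 11, 10, 12, 11],
    'Time':      [1000.0, 1001.0, 2000.0, 2001.0, 3000.0],
    'UserId':    [7, 7, 8, 8, 7],
})

model = UVMContextKNN(k=50, sample_size=500, extend_session_length=5)
model.fit(train)

# scores for all known items after the first event (item 10) of a new session of user 7
item_ids = train['ItemId'].unique()
scores = model.predict_next(session_id=4, input_item_id=10, input_user_id=7,
                            predict_for_item_ids=item_ids, timestamp=4000.0)
print(scores.sort_values(ascending=False).head())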
Example #4
class USequentialRules:
    '''
    Code based on work by Kamehkhosh et al., A Comparison of Frequent Pattern Techniques and a Deep Learning Method for Session-Based Recommendation, TempRec Workshop at ACM RecSys 2017.

    SequentialRules(steps=10, weighting='div', pruning=20)

    Parameters
    --------
    steps : int
        Number of preceding items in the session for which rules to the current item are created/updated. (Default value: 10)
    weighting : string
        Weighting (decay) function applied to a rule depending on the distance between the two items (linear, same, div, log, quadratic). (Default value: div)
    pruning : float
        Prune the rules per item: a value >= 1 keeps that many of the most common rules, a value between 0 and 1 prunes that fraction of the rules. (Default value: 20)

    session_key : string
        Header of the session ID column in the input file. (default: 'SessionId')
    item_key : string
        Header of the item ID column in the input file. (default: 'ItemId')
    time_key : string
        Header of the timestamp column in the input file. (default: 'Time')
    user_key : string
        Header of the user ID column in the input file. (default: 'UserId')

    boost_own_sessions: double
        to increase the impact of (give weight more weight to) the sessions which belong to the user. (default: None)
        the value will be added to 1.0. For example for boost_own_sessions=0.2, weight will be 1.2

    reminders: bool
        Include reminding items in the (main) recommendation list. (default: False)

    remind_strategy: string
        Ranking strategy of the reminding list (default: recency)

    remind_sessions_num: int
        Number of the user's last sessions from which the possible reminding items are taken (default: 6)

    reminders_num: int
        length of the reminding list (default: 3)

    remind_mode: string
        The position of the reminding items in the recommendation list (top, end). (default: end)


    '''
    def __init__(self,
                 steps=10,
                 weighting='div',
                 pruning=20,
                 last_n_days=None,
                 idf_weight=False,
                 last_in_session=False,
                 session_weighting='div',
                 boost_own_sessions=None,
                 reminders=False,
                 remind_strategy='recency',
                 remind_sessions_num=6,
                 reminders_num=3,
                 remind_mode='end',
                 weight_base=1,
                 weight_IRec=0,
                 session_key='SessionId',
                 item_key='ItemId',
                 time_key='Time',
                 user_key='UserId'):
        self.steps = steps
        self.pruning = pruning
        self.weighting = weighting
        self.session_weighting = session_weighting
        self.last_n_days = last_n_days
        self.idf_weight = idf_weight
        self.last_in_session = last_in_session
        self.session_key = session_key
        self.item_key = item_key
        self.time_key = time_key
        self.session = -1
        self.session_items = []
        # user_based
        self.user_key = user_key
        self.boost_own_sessions = boost_own_sessions

        self.hasReminders = reminders
        if self.hasReminders:
            if remind_strategy == 'hybrid':
                self.reminder = Reminder(
                    remind_strategy=remind_strategy,
                    remind_sessions_num=remind_sessions_num,
                    weight_base=weight_base,
                    weight_IRec=weight_IRec)
            else:  # basic reminders
                self.reminder = Reminder(
                    remind_strategy=remind_strategy,
                    remind_sessions_num=remind_sessions_num,
                    reminders_num=reminders_num,
                    remind_mode=remind_mode)

    def fit(self, data, test=None):
        '''
        Trains the predictor.

        Parameters
        --------
        data: pandas.DataFrame
            Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps).
            It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties).


        '''

        if self.last_n_days is not None:

            max_time = dt.fromtimestamp(data[self.time_key].max())
            date_threshold = max_time.date() - td(self.last_n_days)
            stamp = dt.combine(date_threshold, dt.min.time()).timestamp()
            train = data[data[self.time_key] >= stamp]

        else:
            train = data

        if self.idf_weight:
            self.idf = self.compute_idf(data,
                                        item_key=self.item_key,
                                        session_key=self.session_key)

        cur_session = -1
        last_items = []
        rules = dict()
        # In SR: the rule-set is a dict like: {item_a: {item_b: score}, item_b: {item_c: score, item_d: score, item_a: score}}
        # In user-based SR: the rule-set is a dict like: {item_a: {item_b: [score, {userId1}]}, item_b: {item_c: [score, {userId_1, userId_2}], item_d: [score, {userId_2, userId_3}], item_a: [score, {userId_4}]}}
        # the first element of the list is the score, the rest is a SET of user ids who had this rule in their past sessions

        # get the position of the columns
        index_session = train.columns.get_loc(self.session_key)
        index_item = train.columns.get_loc(self.item_key)
        index_user = train.columns.get_loc(self.user_key)  # user_based

        for row in train.itertuples(index=False):

            session_id, item_id, user_id = row[index_session], row[
                index_item], row[index_user]

            if session_id != cur_session:
                cur_session = session_id
                last_items = []
            else:
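                # create/update a rule (prev_item -> item_id) for each of the last
                # `steps` items of the session, weighted by their distance i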
                for i in range(
                        1, self.steps + 1 if len(last_items) >= self.steps else
                        len(last_items) + 1):
                    prev_item = last_items[-i]

                    if not prev_item in rules:
                        rules[prev_item] = dict()

                    if not item_id in rules[prev_item]:
                        userSet = set()
                        rules[prev_item][item_id] = [0, userSet]

                    if not user_id in rules[prev_item][item_id][
                            1]:  # in userSet
                        rules[prev_item][item_id][1].add(user_id)

                    weight = getattr(self, self.weighting)(i)
                    if self.idf_weight:
                        if self.idf_weight == 1:
                            weight *= self.idf[prev_item]
                        elif self.idf_weight == 2:
                            weight += self.idf[prev_item]

                    rules[prev_item][item_id][0] += weight

            last_items.append(item_id)

            # reminders
            if self.hasReminders:  # user_based  # for 'session_similarity' or 'recency'
                self.reminder.reminders_fit_in_loop(row, index_user,
                                                    index_session, index_item)
                # prev_s_id = self.reminder.reminders_fit_in_loop(row, index_user, index_session, index_item, prev_s_id)

        if self.pruning > 0:
            self.prune(rules)

        self.rules = rules

        # reminders
        if self.hasReminders:  # user_based
            self.reminder.reminders_fit(train, self.user_key, self.item_key,
                                        self.time_key)

    #         print( 'Size of map: ', asizeof.asizeof(self.rules))

    def linear(self, i):
        return 1 - (0.1 * i) if i <= 100 else 0

    def same(self, i):
        return 1

    def div(self, i):
        return 1 / i

    def log(self, i):
        return 1 / (log10(i + 1.7))

    def quadratic(self, i):
        return 1 / (i * i)

    def predict_next(self,
                     session_id,
                     input_item_id,
                     input_user_id,
                     predict_for_item_ids,
                     skip=False,
                     mode_type='view',
                     timestamp=0):
        '''
        Gives prediction scores for a selected set of items on how likely they are to be the next item in the session.

        Parameters
        --------
        session_id : int or string
            The session IDs of the event.
        input_item_id : int or string
            The item ID of the event.
        predict_for_item_ids : 1D array
            IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set.

        Returns
        --------
        out : pandas.Series
            Prediction scores for selected items on how likely they are to be the next item of this session. Indexed by the item IDs.

        '''
        if session_id != self.session:
            self.session_items = []
            self.session = session_id

        if mode_type == 'view':
            self.session_items.append(input_item_id)

        if skip:
            return

        preds = np.zeros(len(predict_for_item_ids))

        # note: extending the session with the user's past items (extend_session_length) is not used here for making predictions
        if input_item_id in self.rules:
            for key in self.rules[input_item_id]:
                # preds[predict_for_item_ids == key] = self.rules[input_item_id][key]
                preds[predict_for_item_ids ==
                      key] = self.rules[input_item_id][key][0]
                if self.boost_own_sessions is not None and self.boost_own_sessions > 0.0 and input_user_id in self.rules[
                        input_item_id][key][
                            1]:  # if the rule also belong to the same user_id, then boost its score!
                    preds[predict_for_item_ids == key] = preds[
                        predict_for_item_ids == key] + self.rules[
                            input_item_id][key][0] * self.boost_own_sessions

        if self.last_in_session:
            for i in range(2, self.last_in_session + 2):
                if len(self.session_items) >= i:
                    item = self.session_items[-i]
                    if item in self.rules:
                        for key in self.rules[item]:
                            preds[predict_for_item_ids ==
                                  key] += self.rules[item][key][0] * getattr(
                                      self, self.session_weighting)(i)
                else:
                    break

        # test
        #         for i in range(2,4):
        #             if len(self.session_items) >= i :
        #                 item = self.session_items[-i]
        #                 for key in self.rules[ item ]:
        #                     preds[ predict_for_item_ids == key ] += self.rules[item][key] * (1/i)

        series = pd.Series(data=preds, index=predict_for_item_ids)

        series = series / series.max()

        if self.hasReminders:  # user_based
            if self.reminder.remind_strategy == 'hybrid':
                series = self.reminder.reminders_predict_next(
                    input_user_id,
                    series,
                    self.item_key,
                    self.time_key,
                    input_timestamp=timestamp)
            else:  # basic reminders
                series = self.reminder.reminders_predict_next(
                    input_user_id, series, self.item_key, self.time_key)

        return series

    def prune(self, rules):
        '''
        Prunes the rule set: for each item, only the most common rules are kept.
        With pruning >= 1, that many rules are kept per item; with 0 < pruning < 1, that fraction of the rules is pruned.

        Parameters
        --------
        rules : dict of dicts
            The rules mined from the training data
        '''
        for k1 in rules:
            tmp = rules[k1]
            if self.pruning < 1:
                keep = len(tmp) - int(len(tmp) * self.pruning)
            elif self.pruning >= 1:
                keep = self.pruning
            counter = col.Counter(tmp)
            rules[k1] = dict()
            for k2, v in counter.most_common(keep):
                rules[k1][k2] = v

    def compute_idf(self, train, item_key="ItemId", session_key="SessionId"):
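        # idf(item) = log(#sessions / #occurrences of the item), then min-max normalized to [0, 1]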

        idf = pd.DataFrame()
        idf['idf'] = train.groupby(item_key).size()
        idf['idf'] = np.log(train[session_key].nunique() / idf['idf'])
        idf['idf'] = (idf['idf'] - idf['idf'].min()) / (idf['idf'].max() -
                                                        idf['idf'].min())
        idf = idf['idf'].to_dict()

        return idf

    def clear(self):
        self.rules = {}

    def support_users(self):
        '''
            whether it is a session-based or session-aware algorithm
            (if returns True, method "predict_with_training_data" must be defined as well)

            Parameters
            --------

            Returns
            --------
            True : if it is session-aware
            False : if it is session-based
        '''
        return True

    def predict_with_training_data(self):
        '''
            (this method must be defined if "support_users" returns True)
            whether it also needs to make predictions for the training data (i.e. whether training and test data should be concatenated when making predictions)

            Parameters
            --------

            Returns
            --------
            True : e.g. hgru4rec
            False : e.g. uvsknn
            '''
        return False
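
A minimal usage sketch for USequentialRules (not part of the original listing; the toy data and values below are illustrative assumptions, and the imports at the top of the original file are expected to be available):

import pandas as pd

# toy training data using the default column names
train = pd.DataFrame({
    'SessionId': [1, 1, 1, 2, 2],
    'ItemId':    [10, 11, 12, 10, 12],
    'Time':      [1.0, 2.0, 3.0, 4.0, 5.0],
    'UserId':    [7, 7, 7, 8, 8],
})

sr = USequentialRules(steps=10, weighting='div', pruning=20)
sr.fit(train)

# scores for the next item after observing item 10 in a new session of user 7
item_ids = train['ItemId'].unique()
scores = sr.predict_next(session_id=3, input_item_id=10, input_user_id=7,
                         predict_for_item_ids=item_ids)
print(scores.sort_values(ascending=False))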