class USTAN: ''' USTAN( k, sample_size=5000, sampling='recent', remind=True, extend=False, lambda_spw=1.02, lambda_snh=5, lambda_inh=2.05, session_key='SessionId', item_key='ItemId', time_key='Time' ) Parameters ----------- k : int Number of neighboring sessions to calculate the item scores from. sample_size : int Defines the length of a subset of all training sessions to calculate the nearest neighbors from. (Default value: 5000) sampling : string String to define the sampling method for sessions (recent, random). (default: recent) remind : bool If True, items that already occurred in the current session may be scored (recommended) again; if False, they are excluded from the scores. (default: True) extend : bool Add evaluated sessions to the session/item maps while predicting. (default: False) lambda_spw : float Decay factor for the session position weighting, i.e. how strongly items clicked earlier in the current session are down-weighted in the similarity calculation. (default: 1.02) lambda_snh : float Decay factor (in days) for the session recency weighting, i.e. how strongly older neighboring sessions are down-weighted. (default: 5) lambda_inh : float Decay factor for the item position weighting within a neighboring session, i.e. how strongly candidate items far from the matching position are down-weighted. (default: 2.05) session_key : string Header of the session ID column in the input file. (default: 'SessionId') item_key : string Header of the item ID column in the input file. (default: 'ItemId') time_key : string Header of the timestamp column in the input file. (default: 'Time') user_key : string Header of the user ID column in the input file. (default: 'UserId') The remaining constructor arguments (extend_session_length, extending_mode, refine_mode, boost_own_sessions, reminders, remind_strategy, remind_sessions_num, reminders_num, remind_mode, weight_base, weight_IRec, weight_SSim) configure the session-aware extensions and reminders and follow the same semantics as in UVMContextKNN. ''' def __init__( self, k, sample_size=5000, sampling='recent', remind=True, extend=False, lambda_spw=1.02, lambda_snh=5, lambda_inh=2.05 , extend_session_length=None, extending_mode='lastViewed', refine_mode=True, boost_own_sessions=None, reminders=False, remind_strategy='recency', remind_sessions_num=6, reminders_num=3, remind_mode='end', weight_base=1, weight_IRec=0, weight_SSim=0, session_key = 'SessionId', item_key= 'ItemId', time_key= 'Time', user_key='UserId'): self.k = k self.sample_size = sample_size self.sampling = sampling self.lambda_spw = lambda_spw self.lambda_snh = lambda_snh * 24 * 3600 self.lambda_inh = lambda_inh self.session_key = session_key self.item_key = item_key self.time_key = time_key self.user_key = user_key # user_based self.extend = extend self.remind = remind self.extending_mode = extending_mode # user_based self.extend_session_length = extend_session_length self.refine_mode = refine_mode self.boost_own_sessions = boost_own_sessions # updated while recommending self.session = -1 self.session_items = [] self.relevant_sessions = set() # user_based self.items_previous = [] self.last_user_items = {} # to extend the session model self.recent_user_items = {} # to remind self.recent_user_sessions = {} # to remind self.user_item_intensity = dict() # to remind (for 'session_similarity') # reminders self.hasReminders = reminders if self.hasReminders: if remind_strategy == 'hybrid': self.reminder = Reminder(remind_strategy=remind_strategy, remind_sessions_num=remind_sessions_num, weight_base=weight_base, weight_IRec=weight_IRec, weight_SSim=weight_SSim) else: # basic reminders self.reminder = Reminder(remind_strategy=remind_strategy, remind_sessions_num=remind_sessions_num, reminders_num=reminders_num, remind_mode=remind_mode) # cache relations once at startup self.session_item_map = dict() self.item_session_map = dict() self.session_time = dict() self.min_time = -1 self.session_user_map
= dict() # user_based self.sim_time = 0 def fit(self, train, test=None, items=None): ''' Trains the predictor. Parameters -------- data: pandas.DataFrame Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps). It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties). ''' self.num_items = train[self.item_key].max() index_session = train.columns.get_loc( self.session_key ) index_item = train.columns.get_loc( self.item_key ) index_time = train.columns.get_loc( self.time_key ) index_user = train.columns.get_loc(self.user_key) # user_based session = -1 session_items = [] time = -1 user = -1 # user_based #cnt = 0 for row in train.itertuples(index=False): # cache items of sessions if row[index_session] != session: if len(session_items) > 0: self.session_item_map.update({session : session_items}) # cache the last time stamp of the session self.session_time.update({session : time}) self.session_user_map.update({session: user}) # user_based if time < self.min_time: self.min_time = time user = row[index_user] # user_based session = row[index_session] session_items = [] time = row[index_time] session_items.append(row[index_item]) # cache sessions involving an item map_is = self.item_session_map.get( row[index_item] ) if map_is is None: map_is = set() self.item_session_map.update({row[index_item] : map_is}) map_is.add(row[index_session]) # add last viewed items (by the user) to the last_user_items dictionary if self.extend_session_length is not None: # user_based self.extend_session_in_fit(row, index_user, index_item) # reminders if self.hasReminders: # user_based # for 'session_similarity' or 'recency' self.reminder.reminders_fit_in_loop(row, index_user, index_session, index_item) # reminders # save item_intensity in the last N session for each user if self.hasReminders: # user_based self.reminder.reminders_fit(train, self.user_key, self.item_key, self.time_key) # Add the last tuple self.session_item_map.update({session : session_items}) self.session_time.update({session : time}) self.session_user_map.update({session: user}) # user_based if self.sample_size == 0: #use all session as possible neighbors print('!!!!! runnig KNN without a sample size (check config)') def predict_next( self, session_id, input_item_id, input_user_id, predict_for_item_ids=None, timestamp=0, skip=False, type='view'): ''' Gives predicton scores for a selected set of items on how likely they be the next item in the session. Parameters -------- session_id : int or string The session IDs of the event. input_item_id : int or string The item ID of the event. Must be in the set of item IDs of the training set. predict_for_item_ids : 1D array IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set. Returns -------- out : pandas.Series Prediction scores for selected items on how likely to be the next item of this session. Indexed by the item IDs. 
''' # gc.collect() # process = psutil.Process(os.getpid()) # print( 'cknn.predict_next: ', process.memory_info().rss, ' memory used') if( self.session != session_id ): #new session if( self.extend ): self.session_item_map[self.session] = self.session_items; for item in self.session_items: map_is = self.item_session_map.get( item ) if map_is is None: map_is = set() self.item_session_map.update({item : map_is}) map_is.add(self.session) ts = time.time() self.session_time.update({self.session : ts}) self.session_user_map.update({self.session: input_user_id}) # user_based self.session = session_id self.session_items = list() self.relevant_sessions = set() self.items_previous = [] # user_based self.need_refine = True # user_based if type == 'view': self.session_items.append( input_item_id ) if skip: return items = self.session_items # we add extra items form the user profile as long as the session is not long enough! if self.extend_session_length is not None and input_user_id in self.last_user_items: # user_based items = self.extend_session_in_predict_next(items, input_user_id) neighbors = self.find_neighbors( items, input_item_id, session_id, timestamp, input_user_id) scores = self.score_items( neighbors, items, timestamp ) # Create things in the format .. predictions = np.zeros(len(predict_for_item_ids)) mask = np.in1d( predict_for_item_ids, list(scores.keys()) ) items = predict_for_item_ids[mask] values = [scores[x] for x in items] predictions[mask] = values series = pd.Series(data=predictions, index=predict_for_item_ids) if self.hasReminders: # user_based if self.reminder.remind_strategy == 'hybrid': if self.reminder.w_SSim == 0: series = self.reminder.reminders_predict_next(input_user_id, series, self.item_key, self.time_key, input_timestamp=timestamp) else: past_user_sessions = self.calc_similarity(items, self.reminder.recent_user_sessions[input_user_id], timestamp, input_user_id) series = self.reminder.reminders_predict_next(input_user_id, series, self.item_key, self.time_key, past_user_sessions=past_user_sessions, session_item_map=self.session_item_map, input_timestamp=timestamp) else: # basic reminders if self.reminder.remind_strategy == 'session_similarity': past_user_sessions = self.calc_similarity(items, self.reminder.recent_user_sessions[input_user_id], timestamp, input_user_id) series = self.reminder.reminders_predict_next(input_user_id, series, self.item_key, self.time_key, past_user_sessions=past_user_sessions, session_item_map=self.session_item_map) if self.reminder.remind_strategy == 'recency': series = self.reminder.reminders_predict_next(input_user_id, series, self.item_key, self.time_key) return series def vec(self, current, neighbor, pos_map): ''' Calculates the ? 
for 2 sessions Parameters -------- first: Id of a session second: Id of a session Returns -------- out : float value ''' intersection = current & neighbor vp_sum = 0 for i in intersection: vp_sum += pos_map[i] result = vp_sum / len(pos_map) return result def cosine(self, current, neighbor, pos_map): ''' Calculates the cosine similarity for two sessions Parameters -------- first: Id of a session second: Id of a session Returns -------- out : float value ''' lneighbor = len(neighbor) intersection = current & neighbor if pos_map is not None: vp_sum = 0 current_sum = 0 for i in current: current_sum += pos_map[i] * pos_map[i] if i in intersection: vp_sum += pos_map[i] else: vp_sum = len( intersection ) current_sum = len( current ) result = vp_sum / (sqrt(current_sum) * sqrt(lneighbor)) return result def items_for_session(self, session): ''' Returns all items in the session Parameters -------- session: Id of a session Returns -------- out : set ''' return self.session_item_map.get(session); def sessions_for_item(self, item_id): ''' Returns all session for an item Parameters -------- item: Id of the item session Returns -------- out : set ''' return self.item_session_map.get( item_id ) if item_id in self.item_session_map else set() def most_recent_sessions( self, sessions, number ): ''' Find the most recent sessions in the given set Parameters -------- sessions: set of session ids Returns -------- out : set ''' sample = set() tuples = list() for session in sessions: time = self.session_time.get( session ) if time is None: print(' EMPTY TIMESTAMP!! ', session) tuples.append((session, time)) tuples = sorted(tuples, key=itemgetter(1), reverse=True) #print 'sorted list ', sortedList cnt = 0 for element in tuples: cnt = cnt + 1 if cnt > number: break sample.add( element[0] ) #print 'returning sample of size ', len(sample) return sample #----------------- # Find a set of neighbors, returns a list of tuples (sessionid: similarity) #----------------- def find_neighbors( self, session_items, input_item_id, session_id, timestamp, user_id): ''' Finds the k nearest neighbors for the given session_id and the current item input_item_id. Parameters -------- session_items: set of item ids input_item_id: int session_id: int Returns -------- out : list of tuple (session_id, similarity) ''' possible_neighbors = self.possible_neighbor_sessions( session_items, input_item_id, session_id, user_id) possible_neighbors = self.calc_similarity( session_items, possible_neighbors, timestamp, user_id) possible_neighbors = sorted( possible_neighbors, reverse=True, key=lambda x: x[1] ) possible_neighbors = possible_neighbors[:self.k] return possible_neighbors def possible_neighbor_sessions(self, session_items, input_item_id, session_id, user_id): ''' Find a set of session to later on find neighbors in. A self.sample_size of 0 uses all sessions in which any item of the current session appears. self.sampling can be performed with the options "recent" or "random". "recent" selects the self.sample_size most recent sessions while "random" just choses randomly. Parameters -------- sessions: set of session ids Returns -------- out : set ''' self.relevant_sessions = self.relevant_sessions | self.sessions_for_item( input_item_id ) if self.sample_size == 0: #use all session as possible neighbors #print('!!!!! 
runnig KNN without a sample size (check config)') return self.relevant_sessions else: #sample some sessions if len(self.relevant_sessions) > self.sample_size: if self.sampling == 'recent': sample = self.most_recent_sessions( self.relevant_sessions, self.sample_size ) elif self.sampling == 'random': sample = random.sample( self.relevant_sessions, self.sample_size ) else: sample = self.relevant_sessions[:self.sample_size] return sample else: return self.relevant_sessions def calc_similarity(self, session_items, sessions, timestamp, user_id): ''' Calculates the configured similarity for the items in session_items and each session in sessions. Parameters -------- session_items: set of item ids sessions: list of session ids Returns -------- out : list of tuple (session_id,similarity) ''' pos_map = None if self.lambda_spw: pos_map = {} length = len( session_items ) pos = 1 for item in session_items: if self.lambda_spw is not None: pos_map[item] = self.session_pos_weight( pos, length, self.lambda_spw ) pos += 1 #print 'nb of sessions to test ', len(sessionsToTest), ' metric: ', self.metric items = set(session_items) neighbors = [] cnt = 0 for session in sessions: cnt = cnt + 1 # get items of the session, look up the cache first n_items = self.items_for_session( session ) similarity = self.cosine(items, set(n_items), pos_map) if self.lambda_snh is not None: sts = self.session_time[session] decay = self.session_time_weight(timestamp, sts, self.lambda_snh) similarity *= decay if self.boost_own_sessions is not None: # user_based similarity = self.apply_boost(session, user_id, similarity) neighbors.append((session, similarity)) return neighbors def session_pos_weight(self, position, length, lambda_spw): diff = position - length return exp( diff / lambda_spw ) def session_time_weight(self, ts_current, ts_neighbor, lambda_snh): diff = ts_current - ts_neighbor return exp( - diff / lambda_snh ) def score_items(self, neighbors, current_session, timestamp): ''' Compute a set of scores for all items given a set of neighbors. 
Parameters -------- neighbors: set of session ids Returns -------- out : list of tuple (item, score) ''' # now we have the set of relevant items to make predictions scores = dict() s_items = set( current_session ) # iterate over the sessions for session in neighbors: # get the items in this session n_items = self.items_for_session( session[0] ) pos_last = {} pos_i_star = None for i in range( len( n_items ) ): if n_items[i] in s_items: pos_i_star = i + 1 pos_last[n_items[i]] = i + 1 n_items = set( n_items ) for item in n_items: if not self.remind and item in s_items: continue old_score = scores.get( item ) new_score = session[1] if self.lambda_inh is not None: new_score = new_score * self.item_pos_weight( pos_last[item], pos_i_star, self.lambda_inh ) if not old_score is None: new_score = old_score + new_score scores.update({item : new_score}) return scores def item_pos_weight(self, pos_candidate, pos_item, lambda_inh): diff = abs( pos_candidate - pos_item ) return exp( - diff / lambda_inh ) def clear(self): self.session = -1 self.session_items = [] self.relevant_sessions = set() self.session_item_map = dict() self.item_session_map = dict() self.session_time = dict() self.session_user_map = dict() # user_based def support_users(self): ''' whether it is a session-based or session-aware algorithm (if returns True, method "predict_with_training_data" must be defined as well) Parameters -------- Returns -------- True : if it is session-aware False : if it is session-based ''' return True def predict_with_training_data(self): ''' (this method must be defined if "support_users is True") whether it also needs to make prediction for training data or not (should we concatenate training and test data for making predictions) Parameters -------- Returns -------- True : e.g. hgru4rec False : e.g. 
uvsknn ''' return False def extend_session_in_fit(self, row, index_user, index_item): if not row[index_user] in self.last_user_items: # create a new list to save the user's last viewed items self.last_user_items[row[index_user]] = [] self.last_user_items[row[index_user]].append(row[index_item]) if len(self.last_user_items[row[index_user]]) > self.extend_session_length: self.last_user_items[row[index_user]] = self.last_user_items[row[index_user]][ -self.extend_session_length:] def extend_session_in_predict_next(self, items, input_user_id): if len(items) < self.extend_session_length: # update the session with items from the users past n = len(self.session_items) addItems = self.extend_session_length - n prev_items = self.last_user_items[input_user_id][-addItems:] items = prev_items + self.session_items # if it is beginning of the session => find relevant sessions for added items if len(self.items_previous) == 0: for item in set(prev_items): self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(item) # not beginning of the session, so we already retrieved neighbours for the extended session elif self.refine_mode: # if the first item that was in the previous step, is not in the current step anymore => refine the self.relevant_sessions if not self.items_previous[0] in items: self.relevant_sessions = set() for item in set(items): self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(item) # update self.items_previous self.items_previous = items # the session is long enough => refine the self.relevant_sessions to just consider current session's items elif self.refine_mode and self.need_refine: self.relevant_sessions = set() for item in set(self.session_items): # then we can continue with just adding related sessions for the current item self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(item) # refined once after reach to the defined length, no need to do refine anymore self.need_refine = False return items def apply_boost(self, session, user_id, similarity): if self.boost_own_sessions > 0.0 and self.session_user_map[session] == user_id: similarity = similarity + (similarity * self.boost_own_sessions) return similarity
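# Usage sketch (illustrative, not part of the original module): how USTAN is typically
# fit on a session-aware log and then queried for next-item scores. The toy DataFrame,
# the IDs, and the helper name `_ustan_usage_example` are hypothetical placeholders;
# in the framework, fitting and prediction are normally driven by the evaluation scripts.
def _ustan_usage_example():
    import pandas as pd

    # Minimal session-aware log using the default column headers.
    train = pd.DataFrame({
        'SessionId': [1, 1, 2, 2, 2],
        'ItemId':    [10, 11, 10, 12, 13],
        'Time':      [1.0, 2.0, 3.0, 4.0, 5.0],
        'UserId':    [7, 7, 8, 8, 8],
    })

    model = USTAN(k=100, sample_size=5000)
    model.fit(train)

    # Score all known items as candidates for the next event of a new session of user 7.
    scores = model.predict_next(session_id=3, input_item_id=10, input_user_id=7,
                                predict_for_item_ids=train['ItemId'].unique(),
                                timestamp=6.0)
    return scores.sort_values(ascending=False)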
class UNARM: ''' UNARM(factors=100, hidden_units=100, epochs=30, lr=0.001, session_key='SessionId', item_key='ItemId', time_key='Time', user_key='UserId') Session-aware extension of NARM. Code based on work by Li et al., Neural Attentive Session-based Recommendation, CIKM 2017. Parameters -------- factors : int Size of the item embedding. (Default value: 100) hidden_units : int Number of hidden units of the GRU layer. (Default value: 100) epochs : int Maximum number of training epochs. (Default value: 30) lr : float Learning rate. (Default value: 0.001) extend_session_length : int If set, extend the current session with up to this many of the user's previously viewed items. (Default value: None) reminders : bool Include reminding items in the (main) recommendation list. (Default value: False) remind_strategy : string Ranking strategy of the reminding list (recency, session_similarity, hybrid). (Default value: 'recency') remind_sessions_num : int Number of the user's last sessions that the reminding items are taken from. (Default value: 6) reminders_num : int Length of the reminding list. (Default value: 3) remind_mode : string Position of the reminding items in the recommendation list (top, end). (Default value: 'end') weight_base : int Base weight used by the 'hybrid' reminder strategy. (Default value: 1) weight_IRec : int Weight of the recency-based reminder score in the 'hybrid' strategy. (Default value: 0) session_key : string Header of the session ID column in the input file. (Default value: 'SessionId') item_key : string Header of the item ID column in the input file. (Default value: 'ItemId') time_key : string Header of the timestamp column in the input file. (Default value: 'Time') user_key : string Header of the user ID column in the input file. (Default value: 'UserId') ''' def __init__(self, factors=100, hidden_units=100, epochs=30, lr=0.001, extend_session_length=None, reminders=False, remind_strategy='recency', remind_sessions_num=6, reminders_num=3, remind_mode='end', weight_base=1, weight_IRec=0, session_key='SessionId', item_key='ItemId', time_key='Time', user_key='UserId'): self.factors = factors self.hidden_units = hidden_units self.epochs = epochs self.lr = lr self.session_key = session_key self.item_key = item_key self.session = -1 self.session_items = list() self.floatX = theano.config.floatX # user_based self.time_key = time_key self.user_key = user_key self.extend_session_length = extend_session_length self.last_user_items = {} self.recent_user_items = {} self.recent_user_sessions = {} self.hasReminders = reminders if self.hasReminders: if remind_strategy == 'hybrid': self.reminder = Reminder( remind_strategy=remind_strategy, remind_sessions_num=remind_sessions_num, weight_base=weight_base, weight_IRec=weight_IRec) else: # basic reminders self.reminder = Reminder( remind_strategy=remind_strategy, remind_sessions_num=remind_sessions_num, reminders_num=reminders_num, remind_mode=remind_mode) def fit(self, data, test=None): ''' Trains the predictor. Parameters -------- data: pandas.DataFrame Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps). It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties). ''' nis = data[self.item_key].nunique() self.itemmap = pd.Series(index=data[self.item_key].unique(), data=range(1, nis + 1)) data = data.merge(self.itemmap.to_frame('ItemIdx'), how='inner', right_index=True, left_on=self.item_key) data.sort_values(['SessionId', 'Time'], inplace=True) self.traindata = self.create_training_data(data) self.dataload = (self.load_data, self.prepare_data) self.layers = {'gru': (self.param_init_gru, self.gru_layer)} self.train_gru(self.factors, self.hidden_units, max_epochs=self.epochs, lrate=self.lr, n_items=nis + 1) def train_gru( self, dim_proj=50, # embedding dimension hidden_units=100, # GRU number of hidden units. patience=5, # Number of epochs to wait before early stop if no progress max_epochs=30, # The maximum number of epochs to run dispFreq=10000, # Display to stdout the training progress every N updates lrate=0.001, # Learning rate n_items=37484, # Vocabulary size encoder='gru', # TODO: can be removed must be gru. saveto='gru_model.npz', # The best model will be saved there is_valid=True, # Whether to compute the validation error during training.
is_save=False, # Save the parameters after every saveFreq updates batch_size=512, # The batch size during training. valid_batch_size=512, # The batch size used for validation/test set. # Parameter for extra option use_dropout=True, # if False slightly faster, but worst test error # This frequently need a bigger model. reload_model=None, # Path to a saved model we want to start from. test_size=-1, # If >0, we keep only this number of test example. ): # Model options model_options = locals().copy() print("model options", model_options) load_data, prepare_data = self.get_dataset() print('Loading data') train, valid = load_data() print('Building model') # This create the initial parameters as numpy ndarrays. # Dict name (string) -> numpy ndarray params = self.init_params(model_options) if reload_model: self.load_params('gru_model.npz', params) # This create Theano Shared Variable from the parameters. # Dict name (string) -> Theano Tensor Shared Variable # params and tparams have different copy of the weights. tparams = self.init_tparams(params) # use_noise is for dropout (use_noise, x, mask, y, f_pred_prob, cost) = self.build_model(tparams, model_options) self.pred_function = f_pred_prob all_params = list(tparams.values()) updates = self.adam(cost, all_params, lrate) train_function = theano.function(inputs=[x, mask, y], outputs=cost, updates=updates) print('Optimization') print("%d train examples" % len(train[0])) print("%d valid examples" % len(valid[0])) history_errs = [] history_vali = [] best_p = None bad_count = 0 uidx = 0 # the number of update done estop = False # early stop try: for eidx in range(max_epochs): start_time = time.time() n_samples = 0 epoch_loss = [] # Get new shuffled index for the training set. kf = self.get_minibatches_idx(len(train[0]), batch_size, shuffle=True) kf_valid = self.get_minibatches_idx(len(valid[0]), valid_batch_size, shuffle=True) for _, train_index in kf: uidx += 1 use_noise.set_value(1.) # Select the random examples for this minibatch y = [train[1][t] for t in train_index] x = [train[0][t] for t in train_index] # Get the data in numpy.ndarray format # This swap the axis! # Return something of shape (minibatch maxlen, n samples) x, mask, y = prepare_data(x, y) n_samples += x.shape[1] loss = train_function(x, mask, y) epoch_loss.append(loss) if np.isnan(loss) or np.isinf(loss): print('bad loss detected: ', loss) return 1., 1., 1. if np.mod(uidx, dispFreq) == 0: print('Epoch ', eidx, 'Update ', uidx, 'Loss ', np.mean(epoch_loss)) if saveto and is_save: print('Saving...') if best_p is not None: params = best_p else: params = self.unzip(tparams) np.savez(saveto, history_errs=history_errs, **params) print('Saving done') if is_valid: use_noise.set_value(0.) 
valid_evaluation = self.pred_evaluation( f_pred_prob, prepare_data, valid, kf_valid) history_errs.append([valid_evaluation]) if best_p is None or valid_evaluation[1] >= np.array( history_vali).max(): best_p = self.unzip(tparams) print('Best perfomance updated!') bad_count = 0 print('Valid Recall@20:', valid_evaluation[0], ' Valid Mrr@20:', valid_evaluation[1]) if len(history_vali) > 10 and valid_evaluation[ 1] <= np.array(history_vali).max(): bad_count += 1 print('===========================>Bad counter: ' + str(bad_count)) print('current validation mrr: ' + str(valid_evaluation[1]) + ' history max mrr:' + str(np.array(history_vali).max())) if bad_count > patience: print('Early Stop!') estop = True history_vali.append(valid_evaluation[1]) end_time = time.time() print('Seen %d samples' % n_samples) print(('This epoch took %.1fs' % (end_time - start_time)), file=sys.stderr) if estop: break except KeyboardInterrupt: print("Training interupted") if best_p is not None: self.zipp(best_p, tparams) else: best_p = self.unzip(tparams) use_noise.set_value(0.) valid_evaluation = self.pred_evaluation(f_pred_prob, prepare_data, valid, kf_valid) print('=================Best performance=================') print('Valid Recall@20:', valid_evaluation[0], ' Valid Mrr@20:', valid_evaluation[1]) print('==================================================') if saveto and is_save: np.savez('Best_performance', valid_evaluation=valid_evaluation, history_errs=history_errs, **best_p) self.params = params self.tparams = tparams return valid_evaluation def create_training_data(self, data): index_session = data.columns.get_loc(self.session_key) index_item = data.columns.get_loc('ItemIdx') index_item_original = data.columns.get_loc('ItemId') index_user = data.columns.get_loc('UserId') out_seqs = [] labs = [] session = -1 session_items = [] for row in data.itertuples(index=False): # add last viewed items (by the user) to the last_user_items dictionary if self.extend_session_length is not None: # user_based self.extend_session_model_in_loop(row, index_user, index_item_original) # reminders if self.hasReminders: # user_based # for 'session_similarity' or 'recency' self.reminder.reminders_fit_in_loop(row, index_user, index_session, index_item_original) # cache items of sessions if row[index_session] != session: session = row[index_session] session_items = list() session_items.append(row[index_item]) if len(session_items) > 1: out_seqs += [session_items[:-1]] labs += [session_items[-1]] # reminders if self.hasReminders: # user_based self.reminder.reminders_fit(data, self.user_key, self.item_key, self.time_key) return out_seqs, labs def prepare_data(self, seqs, labels): """Create the matrices from the datasets. This pad each sequence to the same lenght: the lenght of the longuest sequence or maxlen. if maxlen is set, we will cut all sequence to this maximum lenght. This swap the axis! """ # x: a list of sentences lengths = [len(s) for s in seqs] n_samples = len(seqs) maxlen = np.max(lengths) x = np.zeros((maxlen, n_samples)).astype('int64') x_mask = np.ones((maxlen, n_samples)).astype(self.floatX) for idx, s in enumerate(seqs): x[:lengths[idx], idx] = s x_mask *= (1 - (x == 0)) return x, x_mask, labels def load_data(self, valid_portion=0.1, maxlen=19, sort_by_len=False): '''Loads the dataset :type path: String :param path: The path to the dataset (here RSC2015) :type n_items: int :param n_items: The number of items. :type valid_portion: float :param valid_portion: The proportion of the full train set used for the validation set. 
:type maxlen: None or positive int :param maxlen: the max sequence length we use in the train/valid set. :type sort_by_len: bool :name sort_by_len: Sort by the sequence lenght for the train, valid and test set. This allow faster execution as it cause less padding per minibatch. Another mechanism must be used to shuffle the train set at each epoch. ''' ############# # LOAD DATA # ############# train_set = self.traindata if maxlen: new_train_set_x = [] new_train_set_y = [] for x, y in zip(train_set[0], train_set[1]): if len(x) < maxlen: new_train_set_x.append(x) new_train_set_y.append(y) else: new_train_set_x.append(x[:maxlen]) new_train_set_y.append(y) train_set = (new_train_set_x, new_train_set_y) del new_train_set_x, new_train_set_y # split training set into validation set train_set_x, train_set_y = train_set n_samples = len(train_set_x) sidx = np.arange(n_samples, dtype='int32') np.random.shuffle(sidx) n_train = int(np.round(n_samples * (1. - valid_portion))) valid_set_x = [train_set_x[s] for s in sidx[n_train:]] valid_set_y = [train_set_y[s] for s in sidx[n_train:]] train_set_x = [train_set_x[s] for s in sidx[:n_train]] train_set_y = [train_set_y[s] for s in sidx[:n_train]] train_set = (train_set_x, train_set_y) valid_set = (valid_set_x, valid_set_y) valid_set_x, valid_set_y = valid_set train_set_x, train_set_y = train_set def len_argsort(seq): return sorted(range(len(seq)), key=lambda x: len(seq[x])) if sort_by_len: sorted_index = len_argsort(valid_set_x) valid_set_x = [valid_set_x[i] for i in sorted_index] valid_set_y = [valid_set_y[i] for i in sorted_index] train = (train_set_x, train_set_y) valid = (valid_set_x, valid_set_y) return train, valid def get_minibatches_idx(self, n, minibatch_size, shuffle=False): """ Used to shuffle the dataset at each iteration. """ idx_list = np.arange(n, dtype="int32") if shuffle: np.random.shuffle(idx_list) minibatches = [] minibatch_start = 0 for i in range(n // minibatch_size): minibatches.append(idx_list[minibatch_start:minibatch_start + minibatch_size]) minibatch_start += minibatch_size if minibatch_start != n: # Make a minibatch out of what is left minibatches.append(idx_list[minibatch_start:]) return zip(range(len(minibatches)), minibatches) def get_dataset(self): return self.dataload[0], self.dataload[1] def predict_next(self, session_id, input_item_id, input_user_id, predict_for_item_ids, timestamp=0, skip=False, mode_type='view'): ''' Gives predicton scores for a selected set of items on how likely they be the next item in the session. Parameters -------- session_id : int or string The session IDs of the event. input_item_id : int or string The item ID of the event. predict_for_item_ids : 1D array IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set. Returns -------- out : pandas.Series Prediction scores for selected items on how likely to be the next item of this session. Indexed by the item IDs. 
''' if (self.session != session_id): #new session self.session = session_id self.session_items = list() if mode_type == 'view': self.session_items.append(input_item_id) if skip: return x = [self.itemmap[self.session_items].values] y = x x, mask, y = self.prepare_data(x, y) preds = self.pred_function(x, mask) series = pd.Series(data=preds[0][1:], index=self.itemmap.index) if self.hasReminders: # user_based if self.reminder.remind_strategy == 'hybrid': series = self.reminder.reminders_predict_next( input_user_id, series, self.item_key, self.time_key, input_timestamp=timestamp) else: # basic reminders series = self.reminder.reminders_predict_next( input_user_id, series, self.item_key, self.time_key) return series def zipp(self, params, tparams): """ When we reload the model. Needed for the GPU stuff. """ for kk, vv in params.items(): tparams[kk].set_value(vv) def unzip(self, zipped): """ When we pickle the model. Needed for the GPU stuff. """ new_params = OrderedDict() for kk, vv in zipped.items(): new_params[kk] = vv.get_value() return new_params def dropout_layer(self, state_before, use_noise, trng, drop_p=0.5): retain = 1. - drop_p proj = T.switch(use_noise, (state_before * trng.binomial( state_before.shape, p=retain, n=1, dtype=state_before.dtype)), state_before * retain) return proj def _p(self, pp, name): return '%s_%s' % (pp, name) def init_params(self, options): """ Global (not GRU) parameter. For the embeding and the classifier. """ params = OrderedDict() # embedding params['Wemb'] = self.init_weights( (options['n_items'], options['dim_proj'])) params = self.get_layer(options['encoder'])[0]( options, params, prefix=options['encoder']) # attention params['W_encoder'] = self.init_weights( (options['hidden_units'], options['hidden_units'])) params['W_decoder'] = self.init_weights( (options['hidden_units'], options['hidden_units'])) params['bl_vector'] = self.init_weights((1, options['hidden_units'])) # classifier # params['U'] = init_weights((2*options['hidden_units'], options['n_items'])) # params['b'] = np.zeros((options['n_items'],)).astype(config.floatX) params['bili'] = self.init_weights( (options['dim_proj'], 2 * options['hidden_units'])) return params def load_params(self, path, params): pp = np.load(path) for kk, vv in params.items(): if kk not in pp: raise Warning('%s is not in the archive' % kk) params[kk] = pp[kk] return params def init_tparams(self, params): tparams = OrderedDict() for kk, pp in params.items(): tparams[kk] = theano.shared(params[kk], name=kk) return tparams def get_layer(self, name): fns = self.layers[name] return fns def init_weights(self, shape): sigma = np.sqrt(2. 
/ shape[0]) return self.numpy_floatX(np.random.randn(*shape) * sigma) def ortho_weight(self, ndim): W = np.random.randn(ndim, ndim) u, s, v = np.linalg.svd(W) return u.astype(self.floatX) def param_init_gru(self, options, params, prefix='gru'): """ Init the GRU parameter: :see: init_params """ Wxrz = np.concatenate([ self.init_weights((options['dim_proj'], options['hidden_units'])), self.init_weights((options['dim_proj'], options['hidden_units'])), self.init_weights((options['dim_proj'], options['hidden_units'])) ], axis=1) params[self._p(prefix, 'Wxrz')] = Wxrz Urz = np.concatenate([ self.ortho_weight(options['hidden_units']), self.ortho_weight(options['hidden_units']) ], axis=1) params[self._p(prefix, 'Urz')] = Urz Uh = self.ortho_weight(options['hidden_units']) params[self._p(prefix, 'Uh')] = Uh b = np.zeros((3 * options['hidden_units'], )) params[self._p(prefix, 'b')] = b.astype(self.floatX) return params def gru_layer(self, tparams, state_below, options, prefix='gru', mask=None): nsteps = state_below.shape[0] if state_below.ndim == 3: n_samples = state_below.shape[1] else: n_samples = 1 assert mask is not None def _slice(_x, n, dim): if _x.ndim == 3: return _x[:, :, n * dim:(n + 1) * dim] return _x[:, n * dim:(n + 1) * dim] def _step(m_, x_, h_): preact = T.dot(h_, tparams[self._p(prefix, 'Urz')]) preact += x_[:, 0:2 * options['hidden_units']] z = T.nnet.hard_sigmoid(_slice(preact, 0, options['hidden_units'])) r = T.nnet.hard_sigmoid(_slice(preact, 1, options['hidden_units'])) h = T.tanh( T.dot((h_ * r), tparams[self._p(prefix, 'Uh')]) + _slice(x_, 2, options['hidden_units'])) h = (1.0 - z) * h_ + z * h h = m_[:, None] * h + (1. - m_)[:, None] * h_ return h state_below = (T.dot(state_below, tparams[self._p(prefix, 'Wxrz')]) + tparams[self._p(prefix, 'b')]) hidden_units = options['hidden_units'] rval, updates = theano.scan(_step, sequences=[mask, state_below], outputs_info=T.alloc( self.numpy_floatX(0.), n_samples, hidden_units), name=self._p(prefix, '_layers'), n_steps=nsteps) return rval def adam(self, loss, all_params, learning_rate=0.001, b1=0.9, b2=0.999, e=1e-8, gamma=1 - 1e-8): """ ADAM update rules Default values are taken from [Kingma2014] References: [Kingma2014] Kingma, Diederik, and Jimmy Ba. "Adam: A Method for Stochastic Optimization." arXiv preprint arXiv:1412.6980 (2014). http://arxiv.org/pdf/1412.6980v4.pdf """ updates = OrderedDict() all_grads = theano.grad(loss, all_params) alpha = learning_rate t = theano.shared(np.float32(1).astype(self.floatX)) b1_t = b1 * gamma**( t - 1) #(Decay the first moment running average coefficient) for theta_previous, g in zip(all_params, all_grads): m_previous = theano.shared( np.zeros(theta_previous.get_value().shape, dtype=self.floatX)) v_previous = theano.shared( np.zeros(theta_previous.get_value().shape, dtype=self.floatX)) m = b1_t * m_previous + ( 1 - b1_t) * g # (Update biased first moment estimate) v = b2 * v_previous + ( 1 - b2) * g**2 # (Update biased second raw moment estimate) m_hat = m / (1 - b1**t ) # (Compute bias-corrected first moment estimate) v_hat = v / ( 1 - b2**t ) # (Compute bias-corrected second raw moment estimate) theta = theta_previous - (alpha * m_hat) / (T.sqrt(v_hat) + e ) #(Update parameters) updates[m_previous] = m updates[v_previous] = v updates[theta_previous] = theta updates[t] = t + 1. return updates def build_model(self, tparams, options): trng = RandomStreams(SEED) # Used for dropout. 
use_noise = theano.shared(self.numpy_floatX(0.)) x = T.matrix('x', dtype='int64') mask = T.matrix('mask', dtype=self.floatX) y = T.vector('y', dtype='int64') n_timesteps = x.shape[0] n_samples = x.shape[1] emb = tparams['Wemb'][x.flatten()].reshape( [n_timesteps, n_samples, options['dim_proj']]) if options['use_dropout']: emb = self.dropout_layer(emb, use_noise, trng, drop_p=0.25) proj = self.get_layer(options['encoder'])[1](tparams, emb, options, prefix=options['encoder'], mask=mask) def compute_alpha(state1, state2): tmp = T.nnet.hard_sigmoid( T.dot(tparams['W_encoder'], state1.T) + T.dot(tparams['W_decoder'], state2.T)) alpha = T.dot(tparams['bl_vector'], tmp) res = T.sum(alpha, axis=0) return res last_h = proj[-1] sim_matrix, _ = theano.scan(fn=compute_alpha, sequences=proj, non_sequences=proj[-1]) att = T.nnet.softmax(sim_matrix.T * mask.T) * mask.T p = att.sum(axis=1)[:, None] weight = att / p atttention_proj = (proj * weight.T[:, :, None]).sum(axis=0) proj = T.concatenate([atttention_proj, last_h], axis=1) if options['use_dropout']: proj = self.dropout_layer(proj, use_noise, trng, drop_p=0.5) ytem = T.dot(tparams['Wemb'], tparams['bili']) pred = T.nnet.softmax(T.dot(proj, ytem.T)) # pred = T.nnet.softmax(T.dot(proj, tparams['U']) + tparams['b']) f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob') # f_weight = theano.function([x, mask], weight, name='f_weight') off = 1e-8 if pred.dtype == 'float16': off = 1e-6 cost = -T.log(pred[T.arange(n_samples), y] + off).mean() return use_noise, x, mask, y, f_pred_prob, cost def pred_evaluation(self, f_pred_prob, prepare_data, data, iterator): """ Compute recall@20 and mrr@20 f_pred_prob: Theano fct computing the prediction prepare_data: usual prepare_data for that dataset. """ recall = 0.0 mrr = 0.0 evalutation_point_count = 0 # pred_res = [] # att = [] for _, valid_index in iterator: x, mask, y = prepare_data([data[0][t] for t in valid_index], np.array(data[1])[valid_index]) preds = f_pred_prob(x, mask) # weights = f_weight(x, mask) targets = y ranks = (preds.T > np.diag(preds.T[targets])).sum(axis=0) + 1 rank_ok = (ranks <= 20) # pred_res += list(rank_ok) recall += rank_ok.sum() mrr += (1.0 / ranks[rank_ok]).sum() evalutation_point_count += len(ranks) # att.append(weights) recall = self.numpy_floatX(recall) / evalutation_point_count mrr = self.numpy_floatX(mrr) / evalutation_point_count eval_score = (recall, mrr) # ff = open('/storage/lijing/mydataset/res_attention_correct.pkl', 'wb') # pickle.dump(pred_res, ff) # ff.close() # ff2 = open('/storage/lijing/mydataset/attention_weights.pkl', 'wb') # pickle.dump(att, ff2) # ff2.close() return eval_score def numpy_floatX(self, data): return np.asarray(data, dtype=self.floatX) def clear(self): if hasattr(self, 'tparams'): for kk, vv in self.tparams.items(): if len(self.params[kk].shape) == 1: self.tparams[kk].set_value([]) else: self.tparams[kk].set_value([[]]) def support_users(self): ''' whether it is a session-based or session-aware algorithm (if returns True, method "predict_with_training_data" must be defined as well) Parameters -------- Returns -------- True : if it is session-aware False : if it is session-based ''' return True def predict_with_training_data(self): ''' (this method must be defined if "support_users is True") whether it also needs to make prediction for training data or not (should we concatenate training and test data for making predictions) Parameters -------- Returns -------- True : e.g. hgru4rec False : e.g. 
uvsknn ''' return False def extend_session_model_in_loop(self, row, index_user, index_item): if not row[index_user] in self.last_user_items: self.last_user_items[row[index_user]] = [ ] # create a new list to save the user's last viewed items self.last_user_items[row[index_user]].append(row[index_item]) if len(self.last_user_items[ row[index_user]]) > self.extend_session_length: self.last_user_items[row[index_user]] = self.last_user_items[ row[index_user]][-self.extend_session_length:] def predict_for_extended_model( self, input_user_id ): # will be called in "evaluation_user_based" and "evaluation_user_based_multiple" prev_items = [] # we add extra items from the user profile at the beginning of the session self.session_items = list() if self.extend_session_length is not None and input_user_id in self.last_user_items: # user_based prev_items = self.last_user_items[input_user_id] return prev_items
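# Worked sketch (illustrative, not part of the original module) of the Adam update rule
# used in UNARM.adam() above (Kingma & Ba, 2014), written in plain NumPy. It mirrors the
# bias-corrected moment estimates of the Theano implementation but, for clarity, omits
# the extra gamma**(t-1) decay that UNARM.adam() applies to b1. The helper name
# `_adam_step_numpy` is a hypothetical placeholder, not framework API.
def _adam_step_numpy(theta, grad, m, v, t, lr=0.001, b1=0.9, b2=0.999, eps=1e-8):
    import numpy as np

    m = b1 * m + (1.0 - b1) * grad          # update biased first moment estimate
    v = b2 * v + (1.0 - b2) * grad ** 2     # update biased second raw moment estimate
    m_hat = m / (1.0 - b1 ** t)             # bias-corrected first moment estimate
    v_hat = v / (1.0 - b2 ** t)             # bias-corrected second raw moment estimate
    theta = theta - lr * m_hat / (np.sqrt(v_hat) + eps)  # parameter update
    return theta, m, v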
class UVMContextKNN: ''' VMContextKNN( k, sample_size=1000, sampling='recent', similarity='cosine', weighting='div', dwelling_time=False, last_n_days=None, last_n_clicks=None, extend=False, weighting_score='div_score', weighting_time=False, normalize=True, session_key = 'SessionId', item_key= 'ItemId', time_key= 'Time') Parameters ----------- k : int Number of neighboring session to calculate the item scores from. (Default value: 100) sample_size : int Defines the length of a subset of all training sessions to calculate the nearest neighbors from. (Default value: 500) sampling : string String to define the sampling method for sessions (recent, random). (default: recent) similarity : string String to define the method for the similarity calculation (jaccard, cosine, binary, tanimoto). (default: jaccard) weighting : string Decay function to determine the importance/weight of individual actions in the current session (linear, same, div, log, quadratic). (default: div) weighting_score : string Decay function to lower the score of candidate items from a neighboring sessions that were selected by less recently clicked items in the current session. (linear, same, div, log, quadratic). (default: div_score) weighting_time : boolean Experimental function to give less weight to items from older sessions (default: False) dwelling_time : boolean Experimental function to use the dwelling time for item view actions as a weight in the similarity calculation. (default: False) last_n_days : int Use only data from the last N days. (default: None) last_n_clicks : int Use only the last N clicks of the current session when recommending. (default: None) extend : bool Add evaluated sessions to the maps. normalize : bool Normalize the scores in the end. session_key : string Header of the session ID column in the input file. (default: 'SessionId') item_key : string Header of the item ID column in the input file. (default: 'ItemId') time_key : string Header of the timestamp column in the input file. (default: 'Time') user_key : string Header of the user ID column in the input file. (default: 'UserId') extend_session_length: int extend the current user's session extending_mode: string how extend the user session (default: lastViewed) lastViewed: extend the current user's session with the his/her last viewed items #TODO: now it saves just X last items, and they might be exactly the same: can try as well: save 5 distinctive items score_based: higher score: if the items appeared in more previous sessions AND more recently #TODO boost_own_sessions: double to increase the impact of (give weight more weight to) the sessions which belong to the user. (default: None) the value will be added to 1.0. For example for boost_own_sessions=0.2, weight will be 1.2 past_neighbors: bool Include the neighbours of the past user's similar sessions (to the current session) as neighbours (default: False) reminders: bool Include reminding items in the (main) recommendation list. (default: False) remind_strategy: string Ranking strategy of the reminding list (recency, session_similarity). (default: recency) remind_sessions_num: int Number of the last user's sessions that the possible items for reminding are taken from (default: 6) reminders_num: int length of the reminding list (default: 3) remind_mode: string The postion of the remining items in recommendation list (top, end). 
(default: end) ''' def __init__(self, k, sample_size=1000, sampling='recent', similarity='cosine', weighting='div', dwelling_time=False, last_n_days=None, last_n_clicks=None, weighting_score='div', weighting_time=False, normalize=True, idf_weighting=False, idf_weighting_session=False, remind=True, push_reminders=False, add_reminders=False, extend=False, extending_mode='lastViewed', extend_session_length=None, refine_mode=True, boost_own_sessions=None, past_neighbors=False, reminders=False, remind_strategy='recency', remind_sessions_num=6, reminders_num=3, remind_mode='end', weight_base=1, weight_IRec=0, weight_SSim=0, session_key='SessionId', item_key='ItemId', time_key='Time', user_key='UserId'): self.k = k self.sample_size = sample_size self.sampling = sampling self.weighting = weighting self.dwelling_time = dwelling_time self.weighting_score = weighting_score self.weighting_time = weighting_time self.similarity = similarity self.session_key = session_key self.item_key = item_key self.time_key = time_key self.user_key = user_key # user_based self.idf_weighting = idf_weighting self.idf_weighting_session = idf_weighting_session self.normalize = normalize self.last_n_days = last_n_days self.last_n_clicks = last_n_clicks self.remind = remind # True/False. If False: items form the current session will be excluded self.push_reminders = push_reminders self.add_reminders = add_reminders self.extend = extend self.extending_mode = extending_mode # user_based self.extend_session_length = extend_session_length self.boost_own_sessions = boost_own_sessions self.past_neighbors = past_neighbors self.refine_mode = refine_mode # reminders self.hasReminders = reminders if self.hasReminders: if remind_strategy == 'hybrid': self.reminder = Reminder( remind_strategy=remind_strategy, remind_sessions_num=remind_sessions_num, weight_base=weight_base, weight_IRec=weight_IRec, weight_SSim=weight_SSim) else: # basic reminders self.reminder = Reminder( remind_strategy=remind_strategy, remind_sessions_num=remind_sessions_num, reminders_num=reminders_num, remind_mode=remind_mode) # updated while recommending self.session = -1 self.session_items = [] self.relevant_sessions = set() # user_based self.items_previous = [] self.last_user_items = {} # to extend the session model self.recent_user_items = {} # to remind self.recent_user_sessions = {} # to remind self.user_item_intensity = dict( ) # to remind (for 'session_similarity') # cache relations once at startup (in fit) self.session_item_map = dict() self.item_session_map = dict() self.session_time = dict() self.min_time = -1 self.session_user_map = dict() # user_based self.sim_time = 0 def fit(self, data, items=None): ''' Trains the predictor. Parameters -------- data: pandas.DataFrame Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps). It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties). 
''' if self.last_n_days != None: max_time = dt.fromtimestamp(data[self.time_key].max()) date_threshold = max_time.date() - td(self.last_n_days) stamp = dt.combine(date_threshold, dt.min.time()).timestamp() train = data[data[self.time_key] >= stamp] else: train = data self.num_items = train[self.item_key].max() # get the position of the columns index_session = train.columns.get_loc(self.session_key) index_item = train.columns.get_loc(self.item_key) index_time = train.columns.get_loc(self.time_key) index_user = train.columns.get_loc(self.user_key) # user_based session = -1 session_items = set() time = -1 user = -1 # user_based # cnt = 0 prev_s_id = -1 for row in train.itertuples(index=False): # cache items of sessions if row[index_session] != session: if len(session_items) > 0: self.session_item_map.update({session: session_items}) # cache the last time stamp of the session self.session_time.update({session: time}) self.session_user_map.update({session: user}) # user_based if time < self.min_time: self.min_time = time user = row[index_user] # user_based session = row[index_session] session_items = set() time = row[index_time] session_items.add(row[index_item]) # cache sessions involving an item map_is = self.item_session_map.get(row[index_item]) if map_is is None: map_is = set() self.item_session_map.update({row[index_item]: map_is}) map_is.add(row[index_session]) # add last viewed items (by the user) to the last_user_items dictionary if self.extend_session_length is not None: # user_based self.extend_session_in_fit(row, index_user, index_item) # reminders if self.hasReminders: # user_based # for 'session_similarity' or 'recency' self.reminder.reminders_fit_in_loop(row, index_user, index_session, index_item) # reminders # save item_intensity in the last N session for each user if self.hasReminders: # user_based self.reminder.reminders_fit(train, self.user_key, self.item_key, self.time_key) # Add the last tuple self.session_item_map.update({session: session_items}) self.session_time.update({session: time}) self.session_user_map.update({session: user}) # user_based if self.idf_weighting or self.idf_weighting_session: self.idf = pd.DataFrame() self.idf['idf'] = train.groupby(self.item_key).size() self.idf['idf'] = np.log(train[self.session_key].nunique() / self.idf['idf']) self.idf = self.idf['idf'].to_dict() def predict_next(self, session_id, input_item_id, input_user_id, predict_for_item_ids=None, skip=False, mode_type='view', timestamp=0): ''' Gives predicton scores for a selected set of items on how likely they be the next item in the session. Parameters -------- session_id : int or string The session IDs of the event. input_item_id : int or string The item ID of the event. Must be in the set of item IDs of the training set. predict_for_item_ids : 1D array IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set. Returns -------- out : pandas.Series Prediction scores for selected items on how likely to be the next item of this session. Indexed by the item IDs. ''' # gc.collect() # process = psutil.Process(os.getpid()) # print( 'cknn.predict_next: ', process.memory_info().rss, ' memory used') if (self.session != session_id): # new session if (self.extend): # add evaluated sessions to the maps. 
item_set = set(self.session_items) self.session_item_map[self.session] = item_set for item in item_set: map_is = self.item_session_map.get(item) if map_is is None: map_is = set() self.item_session_map.update({item: map_is}) map_is.add(self.session) ts = time.time() self.session_time.update({self.session: ts}) self.session_user_map.update({self.session: input_user_id}) # user_based self.last_ts = -1 self.session = session_id self.session_items = list() self.dwelling_times = list() self.relevant_sessions = set() self.items_previous = [] # user_based self.need_refine = True # user_based if mode_type == 'view': self.session_items.append(input_item_id) if self.dwelling_time: if self.last_ts > 0: self.dwelling_times.append(timestamp - self.last_ts) self.last_ts = timestamp if skip: return items = self.session_items if self.last_n_clicks is None else self.session_items[ -self.last_n_clicks:] # we add extra items form the user profile as long as the session is not long enough! if self.extend_session_length is not None and input_user_id in self.last_user_items: # user_based items = self.extend_session_in_predict_next(items, input_user_id) neighbors = self.find_neighbors(items, input_item_id, session_id, self.dwelling_times, timestamp, input_user_id) scores = self.score_items(neighbors, items, timestamp) # Create things in the format .. predictions = np.zeros(len(predict_for_item_ids)) mask = np.in1d(predict_for_item_ids, list(scores.keys())) predict_for_items = predict_for_item_ids[mask] values = [scores[x] for x in predict_for_items] predictions[mask] = values series = pd.Series(data=predictions, index=predict_for_item_ids) if self.hasReminders: # user_based if self.reminder.remind_strategy == 'hybrid': if self.reminder.w_SSim == 0: series = self.reminder.reminders_predict_next( input_user_id, series, self.item_key, self.time_key, input_timestamp=timestamp) else: past_user_sessions = self.calc_similarity( items, self.reminder.recent_user_sessions[input_user_id], self.dwelling_times, timestamp, input_user_id) series = self.reminder.reminders_predict_next( input_user_id, series, self.item_key, self.time_key, past_user_sessions=past_user_sessions, session_item_map=self.session_item_map, input_timestamp=timestamp) else: # basic reminders if self.reminder.remind_strategy == 'session_similarity': past_user_sessions = self.calc_similarity( items, self.reminder.recent_user_sessions[input_user_id], self.dwelling_times, timestamp, input_user_id) series = self.reminder.reminders_predict_next( input_user_id, series, self.item_key, self.time_key, past_user_sessions=past_user_sessions, session_item_map=self.session_item_map) if self.reminder.remind_strategy == 'recency': series = self.reminder.reminders_predict_next( input_user_id, series, self.item_key, self.time_key) if self.push_reminders: # give more score to the items that belongs to the current session if self.extend_session_length is not None and self.need_refine: session_items_series = pd.Series(items) else: session_items_series = pd.Series(self.session_items) session_count = session_items_series.groupby( session_items_series).count() + 1 # multiply the score of the item in the current session by its number of time in the current session series[ session_count. 
index] *= session_count # TODO: contains the same item several times if self.add_reminders: # force the last 3 items of the current session to be in the top 20 recommendable items if self.extend_session_length is not None and self.need_refine: # scores of the items that belong to the current session session_series = pd.Series(index=items, data=series[items]) else: # scores of the items that belong to the current session session_series = pd.Series(index=self.session_items, data=series[self.session_items]) session_series = session_series[session_series > 0] # keep them if their scores > 0 if len(session_series) > 0: session_series = session_series.iloc[:3] # keep the first 3 items # sort the predictions (sort recommendable items according to their scores) series.sort_values(ascending=False, inplace=True) session_series = session_series[session_series < series.iloc[19 - 3]] # TODO: 19-3 series[session_series.index] = series.iloc[19 - 3] + 1e-4 # TODO: 1e-4 if self.normalize: series = series / series.max() return series def item_pop(self, sessions): ''' Returns a dict(item,score) of the item popularity for the given list of sessions (only a set of ids) Parameters -------- sessions: set Returns -------- out : dict ''' result = dict() max_pop = 0 for session, weight in sessions: items = self.items_for_session(session) for item in items: count = result.get(item) if count is None: result.update({item: 1}) else: result.update({item: count + 1}) if (result.get(item) > max_pop): max_pop = result.get(item) for key in result: result.update({key: (result[key] / max_pop)}) return result def jaccard(self, first, second): ''' Calculates the jaccard index for two sessions Parameters -------- first: Id of a session second: Id of a session Returns -------- out : float value ''' sc = time.clock() intersection = len(first & second) union = len(first | second) res = intersection / union self.sim_time += (time.clock() - sc) return res def cosine(self, first, second): ''' Calculates the cosine similarity for two sessions Parameters -------- first: Id of a session second: Id of a session Returns -------- out : float value ''' li = len(first & second) la = len(first) lb = len(second) result = li / (sqrt(la) * sqrt(lb)) return result def tanimoto(self, first, second): ''' Calculates the tanimoto similarity for two sessions Parameters -------- first: Id of a session second: Id of a session Returns -------- out : float value ''' li = len(first & second) la = len(first) lb = len(second) result = li / (la + lb - li) return result def binary(self, first, second): ''' Calculates a binary overlap similarity for two sessions Parameters -------- first: Id of a session second: Id of a session Returns -------- out : float value ''' a = len(first & second) b = len(first) c = len(second) result = (2 * a) / ((2 * a) + b + c) return result def vec(self, first, second, map): ''' Calculates the position-weighted dot-product similarity
for 2 sessions Parameters -------- first: set of item ids of the current session second: set of item ids of a neighboring session Returns -------- out : float value ''' a = first & second total = 0 for i in a: total += map[i] result = total / len(map) return result def items_for_session(self, session): ''' Returns all items in the session Parameters -------- session: Id of a session Returns -------- out : set ''' return self.session_item_map.get(session) def vec_for_session(self, session): ''' Returns the vector representation of the session Parameters -------- session: Id of a session Returns -------- out : dict ''' return self.session_vec_map.get(session) def sessions_for_item(self, item_id): ''' Returns all sessions for an item Parameters -------- item_id: Id of the item Returns -------- out : set ''' return self.item_session_map.get( item_id) if item_id in self.item_session_map else set() def most_recent_sessions(self, sessions, number): ''' Find the most recent sessions in the given set Parameters -------- sessions: set of session ids Returns -------- out : set ''' sample = set() tuples = list() for session in sessions: ts = self.session_time.get(session) if ts is None: print(' EMPTY TIMESTAMP!! ', session) tuples.append((session, ts)) tuples = sorted(tuples, key=itemgetter(1), reverse=True) # print('sorted list ', tuples) cnt = 0 for element in tuples: cnt = cnt + 1 if cnt > number: break sample.add(element[0]) # print('returning sample of size ', len(sample)) return sample def possible_neighbor_sessions(self, session_items, input_item_id, session_id, user_id): ''' Find a set of sessions to later find neighbors in. A self.sample_size of 0 uses all sessions in which any item of the current session appears. self.sampling can be performed with the options "recent" or "random". "recent" selects the self.sample_size most recent sessions while "random" just chooses randomly. Parameters -------- session_items: set of item ids input_item_id: int session_id: int user_id: int Returns -------- out : set ''' # add relevant sessions for the current item self.relevant_sessions = self.relevant_sessions | self.sessions_for_item( input_item_id) if self.past_neighbors: # user-based self.retrieve_past_neighbors(user_id) if self.sample_size == 0: # use all sessions as possible neighbors print('!!!!! running KNN without a sample size (check config)') possible_neighbors = self.relevant_sessions else: # sample some sessions if len(self.relevant_sessions) > self.sample_size: if self.sampling == 'recent': sample = self.most_recent_sessions(self.relevant_sessions, self.sample_size) elif self.sampling == 'random': sample = random.sample(list(self.relevant_sessions), self.sample_size) else: sample = list(self.relevant_sessions)[:self.sample_size] possible_neighbors = sample else: possible_neighbors = self.relevant_sessions return possible_neighbors def calc_similarity(self, session_items, sessions, dwelling_times, timestamp, user_id): ''' Calculates the configured similarity for the items in session_items and each session in sessions.
Parameters -------- session_items: set of item ids sessions: list of session ids Returns -------- out : list of tuple (session_id,similarity) ''' pos_map = {} length = len(session_items) count = 1 for item in session_items: if self.weighting is not None: pos_map[item] = getattr(self, self.weighting)(count, length) count += 1 else: pos_map[item] = 1 if self.dwelling_time: dt = dwelling_times.copy() dt.append(0) dt = pd.Series(dt, index=session_items) dt = dt / dt.max() # dt[session_items[-1]] = dt.mean() if len(session_items) > 1 else 1 dt[session_items[-1]] = 1 # print(dt) for i in range(len(dt)): pos_map[session_items[i]] *= dt.iloc[i] # print(pos_map) if self.idf_weighting_session: max = -1 for item in session_items: pos_map[item] = self.idf[item] if item in self.idf else 0 # if pos_map[item] > max: # max = pos_map[item] # for item in session_items: # pos_map[item] = pos_map[item] / max # print 'nb of sessions to test ', len(sessionsToTest), ' metric: ', self.metric items = set(session_items) neighbors = [] cnt = 0 for session in sessions: cnt = cnt + 1 # get items of the session, look up the cache first n_items = self.items_for_session(session) sts = self.session_time[session] # dot product similarity = self.vec(items, n_items, pos_map) if similarity > 0: if self.weighting_time: diff = timestamp - sts days = round(diff / 60 / 60 / 24) decay = pow(7 / 8, days) similarity *= decay # print("days:",days," => ",decay) if self.boost_own_sessions is not None: # user_based similarity = self.apply_boost(session, user_id, similarity) neighbors.append((session, similarity)) return neighbors # ----------------- # Find a set of neighbors, returns a list of tuples (sessionid: similarity) # ----------------- def find_neighbors(self, session_items, input_item_id, session_id, dwelling_times, timestamp, user_id): ''' Finds the k nearest neighbors for the given session_id and the current item input_item_id. Parameters -------- session_items: set of item ids input_item_id: int session_id: int Returns -------- out : list of tuple (session_id, similarity) ''' possible_neighbors = self.possible_neighbor_sessions( session_items, input_item_id, session_id, user_id) # user_based possible_neighbors = self.calc_similarity(session_items, possible_neighbors, dwelling_times, timestamp, user_id) # user_based possible_neighbors = sorted(possible_neighbors, reverse=True, key=lambda x: x[1]) possible_neighbors = possible_neighbors[:self.k] return possible_neighbors def score_items(self, neighbors, current_session, timestamp): ''' Compute a set of scores for all items given a set of neighbors. 
Parameters -------- neighbors: set of session ids Returns -------- out : list of tuple (item, score) ''' # now we have the set of relevant items to make predictions scores = dict() item_set = set(current_session) # iterate over the sessions for session in neighbors: # get the items in this session items = self.items_for_session(session[0]) step = 1 for item in reversed(current_session): if item in items: decay = getattr(self, self.weighting_score + '_score')(step) break step += 1 for item in items: if not self.remind and item in item_set: # d=0* (exclude items form the current session) continue # dont want to remind user the item that is already is his current (extended) session old_score = scores.get(item) new_score = session[1] new_score = new_score if not self.idf_weighting else new_score + ( new_score * self.idf[item] * self.idf_weighting) new_score = new_score * decay if not old_score is None: new_score = old_score + new_score scores.update({item: new_score}) return scores def linear_score(self, i): return 1 - (0.1 * i) if i <= 100 else 0 def same_score(self, i): return 1 def div_score(self, i): return 1 / i def log_score(self, i): return 1 / (log10(i + 1.7)) def quadratic_score(self, i): return 1 / (i * i) def linear(self, i, length): return 1 - (0.1 * (length - i)) if i <= 10 else 0 def same(self, i, length): return 1 def div(self, i, length): return i / length def log(self, i, length): return 1 / (log10((length - i) + 1.7)) def quadratic(self, i, length): return (i / length)**2 def clear(self): self.session = -1 self.session_items = [] self.relevant_sessions = set() self.session_item_map = dict() self.item_session_map = dict() self.session_time = dict() self.session_user_map = dict() # user_based def support_users(self): ''' whether it is a session-based or session-aware algorithm (if returns True, method "predict_with_training_data" must be defined as well) Parameters -------- Returns -------- True : if it is session-aware False : if it is session-based ''' return True def predict_with_training_data(self): ''' (this method must be defined if "support_users is True") whether it also needs to make prediction for training data or not (should we concatenate training and test data for making predictions) Parameters -------- Returns -------- True : e.g. hgru4rec False : e.g. 
uvsknn ''' return False def extend_session_in_fit(self, row, index_user, index_item): if not row[index_user] in self.last_user_items: # create a new list to save the user's last viewed items self.last_user_items[row[index_user]] = [] self.last_user_items[row[index_user]].append(row[index_item]) if len(self.last_user_items[ row[index_user]]) > self.extend_session_length: self.last_user_items[row[index_user]] = self.last_user_items[ row[index_user]][-self.extend_session_length:] def extend_session_in_predict_next(self, items, input_user_id): if len(items) < self.extend_session_length: # update the session with items from the users past n = len(self.session_items) addItems = self.extend_session_length - n prev_items = self.last_user_items[input_user_id][-addItems:] items = prev_items + self.session_items # if it is beginning of the session => find relevant sessions for added items if len(self.items_previous) == 0: for item in set(prev_items): self.relevant_sessions = self.relevant_sessions | self.sessions_for_item( item) # not beginning of the session, so we already retrieved neighbours for the extended session elif self.refine_mode: # if the first item that was in the previous step, is not in the current step anymore => refine the self.relevant_sessions if not self.items_previous[0] in items: self.relevant_sessions = set() for item in set(items): self.relevant_sessions = self.relevant_sessions | self.sessions_for_item( item) # update self.items_previous self.items_previous = items # the session is long enough => refine the self.relevant_sessions to just consider current session's items elif self.refine_mode and self.need_refine: self.relevant_sessions = set() for item in set(self.session_items): # then we can continue with just adding related sessions for the current item self.relevant_sessions = self.relevant_sessions | self.sessions_for_item( item) # refined once after reach to the defined length, no need to do refine anymore self.need_refine = False return items def apply_boost(self, session, user_id, similarity): if self.boost_own_sessions > 0.0 and self.session_user_map[ session] == user_id: similarity = similarity + (similarity * self.boost_own_sessions) return similarity def retrieve_past_neighbors(self, user_id): for neighbor_sid in self.relevant_sessions: if self.session_user_map[neighbor_sid] == user_id: for item in self.items_for_session(neighbor_sid): self.relevant_sessions = self.relevant_sessions | self.sessions_for_item( item)
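# ----------------------------------------------------------------------
# Minimal usage sketch for USTAN (illustrative only, kept as a comment
# because it relies on the fit() routine and configuration attributes
# defined elsewhere in this module). The toy DataFrame, parameter values
# and column names below are hypothetical; the keyword arguments follow
# the names used inside predict_next above.
#
#   train = pd.DataFrame({
#       'SessionId': [1, 1, 1, 2, 2],
#       'ItemId':    [10, 11, 12, 10, 13],
#       'Time':      [100.0, 110.0, 120.0, 200.0, 210.0],
#       'UserId':    [7, 7, 7, 8, 8],
#   })
#   model = USTAN(k=100, sample_size=5000, sampling='recent')
#   model.fit(train)
#   scores = model.predict_next(session_id=3, input_item_id=10,
#                               input_user_id=7,
#                               predict_for_item_ids=train['ItemId'].unique(),
#                               timestamp=300.0)
#   # scores is a pandas.Series of prediction scores indexed by item id
# ----------------------------------------------------------------------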
class USequentialRules: ''' Code based on work by Kamehkhosh et al., A Comparison of Frequent Pattern Techniques and a Deep Learning Method for Session-Based Recommendation, TempRec Workshop at ACM RecSys 2017. USequentialRules(steps=10, weighting='div', pruning=20) Parameters -------- steps : int Number of preceding items in a session from which rules to the current item are created. (Default value: 10) weighting : string Weighting function for the distance between two items within a session (linear, same, div, log, quadratic). (Default value: 'div') pruning : float Prunes the rules per item: a value >= 1 keeps only the top-N rules, a value in (0,1) drops that fraction of the rules, and 0 disables pruning. (Default value: 20) session_key : string Header of the session ID column in the input file. (default: 'SessionId') item_key : string Header of the item ID column in the input file. (default: 'ItemId') time_key : string Header of the timestamp column in the input file. (default: 'Time') user_key : string Header of the user ID column in the input file. (default: 'UserId') boost_own_sessions: double To increase the impact of (give more weight to) the sessions which belong to the user. (default: None) The value will be added to 1.0; for example, for boost_own_sessions=0.2 the weight will be 1.2. reminders: bool Include reminding items in the (main) recommendation list. (default: False) remind_strategy: string Ranking strategy of the reminding list (default: recency) remind_sessions_num: int Number of the last user's sessions that the possible items for reminding are taken from (default: 6) reminders_num: int Length of the reminding list (default: 3) remind_mode: string The position of the reminding items in the recommendation list (top, end). (default: end) ''' def __init__(self, steps=10, weighting='div', pruning=20, last_n_days=None, idf_weight=False, last_in_session=False, session_weighting='div', boost_own_sessions=None, reminders=False, remind_strategy='recency', remind_sessions_num=6, reminders_num=3, remind_mode='end', weight_base=1, weight_IRec=0, session_key='SessionId', item_key='ItemId', time_key='Time', user_key='UserId'): self.steps = steps self.pruning = pruning self.weighting = weighting self.session_weighting = session_weighting self.last_n_days = last_n_days self.idf_weight = idf_weight self.last_in_session = last_in_session self.session_key = session_key self.item_key = item_key self.time_key = time_key self.session = -1 self.session_items = [] # user_based self.user_key = user_key self.boost_own_sessions = boost_own_sessions self.hasReminders = reminders if self.hasReminders: if remind_strategy == 'hybrid': self.reminder = Reminder( remind_strategy=remind_strategy, remind_sessions_num=remind_sessions_num, weight_base=weight_base, weight_IRec=weight_IRec) else: # basic reminders self.reminder = Reminder( remind_strategy=remind_strategy, remind_sessions_num=remind_sessions_num, reminders_num=reminders_num, remind_mode=remind_mode) def fit(self, data, test=None): ''' Trains the predictor. Parameters -------- data: pandas.DataFrame Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps). It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties).
''' if self.last_n_days != None: max_time = dt.fromtimestamp(data[self.time_key].max()) date_threshold = max_time.date() - td(self.last_n_days) stamp = dt.combine(date_threshold, dt.min.time()).timestamp() train = data[data[self.time_key] >= stamp] else: train = data if self.idf_weight: self.idf = self.compute_idf(data, item_key=self.item_key, session_key=self.session_key) cur_session = -1 last_items = [] rules = dict() # In SR: rule-set is a dic like: {item_a: {item_b: score}, item_b: {item_c: score, item_d: score, item_a: score}} # In user-based SR: rule-set is a dic like: {item_a: {item_b: [score, {userId1}]}, item_b: {item_c: [score, {userId_1, userId_2}], item_d: [score, {userId_2, userId_3}], item_a: [score, {userId_4}]}} # fist element of the list is : score, the rest is a SET of user ids who had this rule in their past sessions # get the position of the columns index_session = train.columns.get_loc(self.session_key) index_item = train.columns.get_loc(self.item_key) index_user = train.columns.get_loc(self.user_key) # user_based for row in train.itertuples(index=False): session_id, item_id, user_id = row[index_session], row[ index_item], row[index_user] if session_id != cur_session: cur_session = session_id last_items = [] else: for i in range( 1, self.steps + 1 if len(last_items) >= self.steps else len(last_items) + 1): prev_item = last_items[-i] if not prev_item in rules: rules[prev_item] = dict() if not item_id in rules[prev_item]: userSet = set() rules[prev_item][item_id] = [0, userSet] if not user_id in rules[prev_item][item_id][ 1]: # in userSet rules[prev_item][item_id][1].add(user_id) weight = getattr(self, self.weighting)(i) if self.idf_weight: if self.idf_weight == 1: weight *= self.idf[prev_item] elif self.idf_weight == 2: weight += self.idf[prev_item] rules[prev_item][item_id][0] += weight last_items.append(item_id) # reminders if self.hasReminders: # user_based # for 'session_similarity' or 'recency' self.reminder.reminders_fit_in_loop(row, index_user, index_session, index_item) # prev_s_id = self.reminder.reminders_fit_in_loop(row, index_user, index_session, index_item, prev_s_id) if self.pruning > 0: self.prune(rules) self.rules = rules # reminders if self.hasReminders: # user_based self.reminder.reminders_fit(train, self.user_key, self.item_key, self.time_key) # print( 'Size of map: ', asizeof.asizeof(self.rules)) def linear(self, i): return 1 - (0.1 * i) if i <= 100 else 0 def same(self, i): return 1 def div(self, i): return 1 / i def log(self, i): return 1 / (log10(i + 1.7)) def quadratic(self, i): return 1 / (i * i) def predict_next(self, session_id, input_item_id, input_user_id, predict_for_item_ids, skip=False, mode_type='view', timestamp=0): ''' Gives predicton scores for a selected set of items on how likely they be the next item in the session. Parameters -------- session_id : int or string The session IDs of the event. input_item_id : int or string The item ID of the event. predict_for_item_ids : 1D array IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set. Returns -------- out : pandas.Series Prediction scores for selected items on how likely to be the next item of this session. Indexed by the item IDs. 
''' if session_id != self.session: self.session_items = [] self.session = session_id if mode_type == 'view': self.session_items.append(input_item_id) if skip: return preds = np.zeros(len(predict_for_item_ids)) # useless: add extend_session_length to make predictions if input_item_id in self.rules: for key in self.rules[input_item_id]: # preds[predict_for_item_ids == key] = self.rules[input_item_id][key] preds[predict_for_item_ids == key] = self.rules[input_item_id][key][0] if self.boost_own_sessions is not None and self.boost_own_sessions > 0.0 and input_user_id in self.rules[ input_item_id][key][ 1]: # if the rule also belongs to the same user_id, then boost its score! preds[predict_for_item_ids == key] = preds[ predict_for_item_ids == key] + self.rules[ input_item_id][key][0] * self.boost_own_sessions if self.last_in_session: for i in range(2, self.last_in_session + 2): if len(self.session_items) >= i: item = self.session_items[-i] if item in self.rules: for key in self.rules[item]: preds[predict_for_item_ids == key] += self.rules[item][key][0] * getattr( self, self.session_weighting)(i) else: break # test # for i in range(2,4): # if len(self.session_items) >= i : # item = self.session_items[-i] # for key in self.rules[ item ]: # preds[ predict_for_item_ids == key ] += self.rules[item][key] * (1/i) series = pd.Series(data=preds, index=predict_for_item_ids) series = series / series.max() if self.hasReminders: # user_based if self.reminder.remind_strategy == 'hybrid': series = self.reminder.reminders_predict_next( input_user_id, series, self.item_key, self.time_key, input_timestamp=timestamp) else: # basic reminders series = self.reminder.reminders_predict_next( input_user_id, series, self.item_key, self.time_key) return series def prune(self, rules): ''' Prunes the mined rules: for each antecedent item, only the strongest rules are kept (the top-N rules if self.pruning >= 1, otherwise the given fraction of rules is dropped). Parameters -------- rules : dict of dicts The rules mined from the training data ''' for k1 in rules: tmp = rules[k1] if self.pruning < 1: keep = len(tmp) - int(len(tmp) * self.pruning) elif self.pruning >= 1: keep = self.pruning counter = col.Counter(tmp) rules[k1] = dict() for k2, v in counter.most_common(keep): rules[k1][k2] = v def compute_idf(self, train, item_key="ItemId", session_key="SessionId"): idf = pd.DataFrame() idf['idf'] = train.groupby(item_key).size() idf['idf'] = np.log(train[session_key].nunique() / idf['idf']) idf['idf'] = (idf['idf'] - idf['idf'].min()) / (idf['idf'].max() - idf['idf'].min()) idf = idf['idf'].to_dict() return idf def clear(self): self.rules = {} def support_users(self): ''' whether it is a session-based or session-aware algorithm (if returns True, method "predict_with_training_data" must be defined as well) Parameters -------- Returns -------- True : if it is session-aware False : if it is session-based ''' return True def predict_with_training_data(self): ''' (this method must be defined if "support_users" is True) whether it also needs to make predictions for the training data or not (should we concatenate training and test data for making predictions) Parameters -------- Returns -------- True : e.g. hgru4rec False : e.g. uvsknn ''' return False
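# ----------------------------------------------------------------------
# Minimal usage sketch for USequentialRules (illustrative only). The toy
# DataFrame below is hypothetical; column names match the defaults of the
# class (SessionId, ItemId, Time, UserId), and the guard keeps the example
# from running when the module is imported.
# ----------------------------------------------------------------------
if __name__ == '__main__':
    import numpy as np
    import pandas as pd

    # two toy sessions from two users
    toy_train = pd.DataFrame({
        'SessionId': [1, 1, 1, 2, 2, 2],
        'ItemId': [10, 11, 12, 10, 12, 13],
        'Time': [100.0, 110.0, 120.0, 200.0, 210.0, 220.0],
        'UserId': [7, 7, 7, 8, 8, 8],
    })

    model = USequentialRules(steps=10, weighting='div', pruning=20)
    model.fit(toy_train)

    # score all known items as possible successors of item 10 in a new session
    item_ids = toy_train['ItemId'].unique()
    scores = model.predict_next(session_id=3, input_item_id=10,
                                input_user_id=7,
                                predict_for_item_ids=item_ids,
                                timestamp=300.0)
    print(scores.sort_values(ascending=False))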