def compute_time_decay(self, df, decay_column):
    """Apply exponential time decay to a column and aggregate per user-item pair.

    Note that this works in place: ``df[decay_column]`` is overwritten with
    the decayed values, and ``self.time_now`` is set to the latest timestamp
    found in ``df`` when it has not been set yet.

    Args:
        df (pd.DataFrame): DataFrame of users and items; must also contain
            the ``self.col_timestamp`` column
        decay_column (str): column to decay

    Returns:
        DataFrame: decayed values summed per (user, item) pair, with the
        grouping keys restored as regular columns
    """
    # Without an explicit reference time, fall back to the latest timestamp
    if self.time_now is None:
        self.time_now = df[self.col_timestamp].max()

    # One decay weight per row, derived from that row's timestamp
    decay_weights = exponential_decay(
        value=df[self.col_timestamp],
        max_val=self.time_now,
        half_life=self.time_decay_half_life,
    )
    df[decay_column] *= decay_weights

    # The summed, time-decayed ratings per user-item pair form the affinity
    per_user_item = df.groupby([self.col_user, self.col_item])
    return per_user_item.sum().reset_index()
def test_exponential_decay():
    """Check exponential_decay against precomputed values for half_life=2."""
    # Inputs 5 and 6 (at and past max_val=5) are both expected to map to 1.0
    want = np.array([0.25, 0.35355339, 0.5, 0.70710678, 1.0, 1.0])
    got = exponential_decay(
        value=np.array([1, 2, 3, 4, 5, 6]),
        max_val=5,
        half_life=2,
    )
    assert np.allclose(got, want, atol=TOL)
def test_exponential_decay():
    """Verify exponential_decay output for the inputs 1..6 with max_val=5, half_life=2."""
    inputs = np.array([1, 2, 3, 4, 5, 6])
    # Reference values; the last two entries (inputs 5 and 6) are both 1.0
    reference = np.array([0.25, 0.35355339, 0.5, 0.70710678, 1.0, 1.0])

    decayed = exponential_decay(value=inputs, max_val=5, half_life=2)

    is_close = np.allclose(decayed, reference, atol=TOL)
    assert is_close
def fit(self, df):
    """Main fit method for SAR.

    Populates ``self.seen_items``, ``self.user_affinity``,
    ``self.item_frequencies`` and ``self.item_similarity`` from the given
    interactions. The affinity computation works on a copy of the user,
    item and rating columns.

    Args:
        df (pd.DataFrame): User item rating dataframe

    Raises:
        TypeError: if the rating column does not have a numeric dtype
        ValueError: if ``self.similarity_type`` is not a supported type
    """
    # generate continuous indices if this hasn't been done
    if self.index2item is None:
        self.set_index(df)

    logger.info("Collecting user affinity matrix")
    rating_dtype = df[self.col_rating].dtype
    if not np.issubdtype(rating_dtype, np.number):
        raise TypeError("Rating column data type must be numeric")

    # copy the DataFrame to avoid modification of the input
    temp_df = df[[self.col_user, self.col_item, self.col_rating]].copy()

    if self.time_decay_flag:
        logger.info("Calculating time-decayed affinities")
        # if time_now is None use the latest time; an explicit identity check
        # is used so that a reference time of 0 is not mistaken for "unset"
        if self.time_now is None:
            self.time_now = df[self.col_timestamp].max()

        # apply time decay to each rating
        temp_df[self.col_rating] *= exponential_decay(
            value=df[self.col_timestamp],
            max_val=self.time_now,
            half_life=self.time_decay_half_life,
        )

        # group time decayed ratings by user-item and take the sum as the user-item affinity
        temp_df = temp_df.groupby([self.col_user, self.col_item]).sum().reset_index()
    else:
        # without time decay use the latest user-item rating in the dataset as the affinity score
        logger.info("De-duplicating the user-item counts")
        temp_df = temp_df.drop_duplicates([self.col_user, self.col_item], keep="last")

    logger.info("Creating index columns")
    # map users and items according to the two dicts. Add the two new columns to temp_df.
    temp_df.loc[:, self.col_item_id] = temp_df[self.col_item].map(self.item2index)
    temp_df.loc[:, self.col_user_id] = temp_df[self.col_user].map(self.user2index)

    # retain seen items for removal at prediction time
    self.seen_items = temp_df[[self.col_user_id, self.col_item_id]].values

    # affinity matrix
    logger.info("Building user affinity sparse matrix")
    self.user_affinity = self.compute_affinity_matrix(
        temp_df, self.n_users, self.n_items
    )

    # calculate item co-occurrence
    logger.info("Calculating item co-occurrence")
    item_cooccurrence = self.compute_coocurrence_matrix(
        temp_df, self.n_users, self.n_items
    )

    # free up some space
    del temp_df

    self.item_frequencies = item_cooccurrence.diagonal()

    # Jaccard and lift are cast to the rating dtype only when it is a floating
    # type: integer ratings pass the numeric check above, and casting
    # fractional similarities to an integer dtype would truncate them.
    cast_to_rating_dtype = np.issubdtype(rating_dtype, np.floating)

    logger.info("Calculating item similarity")
    # compare by equality, not identity: an equal similarity type value that
    # is a different object must still be recognised
    if self.similarity_type == COOCCUR:
        logger.info("Using co-occurrence based similarity")
        self.item_similarity = item_cooccurrence
    elif self.similarity_type == JACCARD:
        logger.info("Using jaccard based similarity")
        self.item_similarity = jaccard(item_cooccurrence)
        if cast_to_rating_dtype:
            self.item_similarity = self.item_similarity.astype(rating_dtype)
    elif self.similarity_type == LIFT:
        logger.info("Using lift based similarity")
        self.item_similarity = lift(item_cooccurrence)
        if cast_to_rating_dtype:
            self.item_similarity = self.item_similarity.astype(rating_dtype)
    else:
        raise ValueError("Unknown similarity type: {}".format(self.similarity_type))

    # free up some space
    del item_cooccurrence

    logger.info("Done training")
def fit(self, df):
    """Main fit method for SAR.

    Populates ``self.user_affinity``, ``self.item_similarity`` and
    ``self.scores`` from the given interactions. When ``self.remove_seen``
    is set, the scores of user-item pairs present in ``df`` are set to
    ``-np.inf``.

    Args:
        df (pd.DataFrame): User item rating dataframe

    Raises:
        TypeError: if the rating column does not have a floating point dtype
        ValueError: if ``self.similarity_type`` is not a supported type
    """
    # Generate continuous indices if this hasn't been done
    if self.index2item is None:
        self.set_index(df)

    logger.info("Collecting user affinity matrix")
    if not np.issubdtype(df[self.col_rating].dtype, np.floating):
        raise TypeError("Rating column data type must be floating point")

    # Copy the DataFrame to avoid modification of the input
    temp_df = df[[self.col_user, self.col_item, self.col_rating]].copy()

    if self.time_decay_flag:
        logger.info("Calculating time-decayed affinities")
        # if time_now is None use the latest time; an explicit identity check
        # is used so that a reference time of 0 is not mistaken for "unset"
        if self.time_now is None:
            self.time_now = df[self.col_timestamp].max()

        # apply time decay to each rating
        temp_df[self.col_rating] *= exponential_decay(
            value=df[self.col_timestamp],
            max_val=self.time_now,
            half_life=self.time_decay_half_life,
        )

        # group time decayed ratings by user-item and take the sum as the user-item affinity
        temp_df = temp_df.groupby([self.col_user, self.col_item]).sum().reset_index()
    else:
        # without time decay use the latest user-item rating in the dataset as the affinity score
        logger.info("De-duplicating the user-item counts")
        temp_df = temp_df.drop_duplicates([self.col_user, self.col_item], keep="last")

    logger.info("Creating index columns")
    # Map users and items according to the two dicts. Add the two new columns to temp_df.
    temp_df.loc[:, self.col_item_id] = temp_df[self.col_item].map(self.item2index)
    temp_df.loc[:, self.col_user_id] = temp_df[self.col_user].map(self.user2index)

    seen_items = None
    if self.remove_seen:
        # retain seen items for removal at prediction time
        seen_items = temp_df[[self.col_user_id, self.col_item_id]].values

    # Affinity matrix
    logger.info("Building user affinity sparse matrix")
    self.user_affinity = self.compute_affinity_matrix(
        temp_df, self.n_users, self.n_items
    )

    # Calculate item co-occurrence
    logger.info("Calculating item co-occurrence")
    item_cooccurrence = self.compute_coocurrence_matrix(
        temp_df, self.n_users, self.n_items
    )

    # Free up some space
    del temp_df

    logger.info("Calculating item similarity")
    if self.similarity_type == sar.SIM_COOCCUR:
        self.item_similarity = item_cooccurrence
    elif self.similarity_type == sar.SIM_JACCARD:
        logger.info("Calculating jaccard")
        self.item_similarity = jaccard(item_cooccurrence)
    elif self.similarity_type == sar.SIM_LIFT:
        logger.info("Calculating lift")
        self.item_similarity = lift(item_cooccurrence)
    else:
        raise ValueError(
            "Unknown similarity type: {0}".format(self.similarity_type)
        )

    # Free up some space. For co-occurrence similarity this only drops the
    # local name; the matrix stays referenced by self.item_similarity.
    del item_cooccurrence

    # Calculate raw scores with a matrix multiplication
    logger.info("Calculating recommendation scores")
    self.scores = self.user_affinity.dot(self.item_similarity)

    # Remove items in the train set so recommended items are always novel
    if self.remove_seen:
        logger.info("Removing seen items")
        self.scores[seen_items[:, 0], seen_items[:, 1]] = -np.inf

    logger.info("Done training")