def _train(self):
    self.init_latent_factor()
    lr = self.learning_rate
    rr = self.regularization_rate
    epochs = self.epochs
    clock = Timer()
    rmat_array = self.rmat.toarray()
    for epoch in range(epochs):
        self.log.info("epoch {} started: ".format(epoch))
        for uid in range(rmat_array.shape[0]):
            user = self.users[uid]
            ratings = rmat_array[uid, :]
            user_items = self.items[np.argwhere(ratings != 0).flatten()]
            select_samples = self.select_negatives(user_items)
            for item, rui in select_samples.items():
                err = rui - self.predict(user, item)
                # uid already positions `user` in self.users; only the item
                # position needs a lookup
                iid = self.items.get_loc(item)
                user_latent = self.user_p[uid, :]
                movie_latent = self.item_q[iid, :]
                # gradient descent on the regularized squared error
                self.user_p[uid, :] += lr * (err * movie_latent - rr * user_latent)
                self.item_q[iid, :] += lr * (err * user_latent - rr * movie_latent)
        e0 = clock.restart()
        loss = self.loss()
        e1 = clock.restart()
        self.log.info("loss: {}".format(loss))
        self.log.info("time elapsed: {}, {}".format(e0, e1))
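# A minimal, self-contained sketch of the SGD step in _train above for a single
# (user, item) pair: p_u += lr * (err * q_i - rr * p_u), and symmetrically for
# q_i. Shapes, hyperparameters, and the target rating are illustrative
# assumptions, not the class defaults.
import numpy as np

rng = np.random.default_rng(0)
p_u = rng.normal(scale=0.1, size=8)   # one user's latent factors
q_i = rng.normal(scale=0.1, size=8)   # one item's latent factors
lr, rr = 0.01, 0.02                   # learning rate, regularization rate
r_ui = 4.0                            # observed rating

for _ in range(200):
    err = r_ui - p_u.dot(q_i)             # prediction error
    p_u += lr * (err * q_i - rr * p_u)    # user-factor update
    q_i += lr * (err * p_u - rr * q_i)    # item-factor update (uses the new p_u,
                                          # matching the in-place views above)
print(p_u.dot(q_i))  # approaches r_ui as training converges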
def eval(self, x_val):
    """
    evaluate test data with RMSE and MAE
    :param x_val: dataframe with 'user', 'item', 'rating' columns
    :return: RMSE, MAE
    """
    clock = Timer()
    self.log.info("start evaluating with %d test samples ...", x_val.shape[0])
    group = x_val.groupby('user')
    frames = []
    for user, df in tqdm(group):
        actual = df[['item', 'rating']].set_index('item')['rating']
        pred = self.predict_for_user(user, items=actual.index)
        frames.append(pd.DataFrame({'pred': pred, 'actual': actual}))
    # DataFrame.append was removed in pandas 2.0; collect and concat once instead
    df_summary = pd.concat(frames)
    rmse = RMSE(df_summary.pred, df_summary.actual)
    mae = MAE(df_summary.pred, df_summary.actual)
    e0 = clock.restart()
    self.log.info("rmse: %.3f, mae: %.3f", rmse, mae)
    self.log.info("evaluation takes %.3f", e0)
    return rmse, mae
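# A quick sanity check of the two metrics on toy arrays, assuming RMSE and MAE
# follow their standard definitions (NumPy stand-ins, not the project's
# RMSE/MAE helpers):
import numpy as np

pred = np.array([3.5, 4.0, 2.0])
actual = np.array([4.0, 4.0, 1.0])
print(np.sqrt(np.mean((pred - actual) ** 2)))  # RMSE ~= 0.645
print(np.mean(np.abs(pred - actual)))          # MAE = 0.5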
def _save(self):
    c = Timer()
    # cache trained parameter
    with open(self.filename, 'wb') as outfile:
        pickle.dump(self.user_sim_matrix, outfile)
    e = c.restart()
    self.log.info("saving user_sim_matrix to %s takes %.3f", self.filename, e)
def predict_for_user(self, user, items=None, ratings=None):
    if items is not None:
        items = np.array(items)
    else:
        items = self.items.values

    valid_mask = self.items.get_indexer(items) >= 0
    min_threshold = self.min_threshold
    min_nn = self.min_nn
    max_nn = self.max_nn
    item_sim = self.item_sim_matrix

    result = dict()
    if np.sum(~valid_mask) > 0:
        self.log.debug("user %s: %s are not valid", user, items[~valid_mask])
        for e in items[~valid_mask]:
            result[e] = np.nan
    items = items[valid_mask]

    upos = self.users.get_loc(user)
    item_bias = None
    if self.bias is not None:
        item_bias = self.bias.get_item_bias()

    assert self.rmat.getformat() == 'csr'
    item_scores = _get_xs(self.rmat, upos)
    # narrow down to items the user has rated; flatten so the argsort below
    # runs over a 1-d index array
    valid_item_index = np.argwhere(item_scores != 0).flatten()

    for item in items:
        ipos = self.items.get_loc(item)
        clock = Timer()
        # indices of rated items, ordered by descending similarity to this item
        sorted_idx = np.argsort(item_sim[ipos, valid_item_index])[::-1]
        item_idx = valid_item_index[sorted_idx]
        e0 = clock.restart()
        # similarities must exceed min_threshold
        if min_threshold is not None:
            item_idx = item_idx[item_sim[ipos, item_idx] > min_threshold]
        if len(item_idx) < min_nn:
            self.log.debug("item %s does not have enough neighbors (%s < %s)",
                           item, len(item_idx), min_nn)
            result[item] = np.nan
            continue

        item_idx = item_idx[:max_nn]
        e1 = clock.restart()
        score = _nn_score(item_scores, item_sim, ipos, item_idx, item_bias)
        e2 = clock.restart()
        # print(e0, e1, e2)
        result[item] = score

    df = pd.Series(result)
    return df
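# A self-contained illustration of the neighbor selection above: sort one row
# of an item-item similarity matrix descending, apply a similarity threshold,
# and keep the top-k. All values below are made up for the example.
import numpy as np

item_sim_row = np.array([1.0, 0.8, 0.05, 0.6, 0.3])  # similarities to item 0
rated_idx = np.array([1, 2, 3, 4])                    # items the user rated
min_threshold, max_nn = 0.1, 2

order = np.argsort(item_sim_row[rated_idx])[::-1]     # descending similarity
neighbors = rated_idx[order]
neighbors = neighbors[item_sim_row[neighbors] > min_threshold]  # drop weak sims
neighbors = neighbors[:max_nn]                        # cap the neighborhood
print(neighbors)  # [1 3]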
def predict_for_user_numba(self, user, items=None, ratings=None):
    """
    Does not seem to improve performance. Can anyone make it better?
    :param user: user id
    :param items: a list of item ids
    :param ratings:
    :return:
    """
    min_threshold = self.min_threshold
    min_nn = self.min_nn
    max_nn = self.max_nn
    clock = Timer()
    rmat_c = self.rmat.tocsc()
    e0 = clock.restart()
    if items is not None:
        items = np.array(items)
    else:
        items = self.items.values
    e1 = clock.restart()
    items_idx = self.items.get_indexer(items)
    e2 = clock.restart()
    upos = self.users.get_loc(user)
    e3 = clock.restart()
    usims = self.user_sim_matrix[upos, :]
    e4 = clock.restart()
    result = _score(rmat_c, usims, items_idx, min_threshold, min_nn, max_nn)
    e5 = clock.restart()
    # print(e0, e1, e2, e3, e4, e5)
    return pd.Series(result, index=items)
def get_data(filename, columns, delimiter='::'):
    """
    :param filename: path of data source
    :param columns: column name for each column
    :param delimiter: delimiter to split a line
    :return: dataframe
    """
    log = LogUtil.getLogger('get_data')
    clock = Timer()
    with open(filename, 'r') as infile:
        data = infile.readlines()
    df = pd.DataFrame([row.rstrip().split(delimiter) for row in data],
                      columns=columns)
    e0 = clock.restart()
    log.info("loading data from %s with columns %s takes %.3f secs",
             filename, columns, e0)
    return df
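# A runnable sketch of the parsing step above on in-memory MovieLens-style
# lines (the '::' delimiter and column names match the ml-1m ratings format;
# the sample rows are made up):
import pandas as pd

lines = ["1::1193::5::978300760\n", "1::661::3::978302109\n"]
columns = ['user', 'item', 'rating', 'timestamp']
df = pd.DataFrame([row.rstrip().split('::') for row in lines], columns=columns)
print(df)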
def fit(self, origin_data):
    clock = Timer()
    self.process_data(origin_data)
    e0 = clock.restart()
    self.log.info('loading init data takes %.3f secs...', e0)

    flag = self.filename is not None
    if flag and path.exists(self.filename):
        # a cached model exists: load it and return self so fit() can be chained
        self._load()
        return self

    _ = clock.restart()
    self.log.info('start training ...')
    self._train()
    e2 = clock.restart()
    self.log.info('training takes %.3f secs ...', e2)
    if flag:
        self._save()
    return self
def train_test_split(ratings, frac=0.1, group='user', seed=1):
    """
    split data into train and test sets by frac;
    if group is provided, sample frac within each group
    """
    log = LogUtil.getLogger('train_test_split')
    log.info("start splitting test and train data ...")
    clock = Timer()
    if group:
        ratings_test = ratings.groupby(group).apply(
            lambda x: x.sample(frac=frac, random_state=seed))
        ratings_test.index = ratings_test.index.droplevel(group)
    else:
        ratings_test = ratings.sample(frac=frac, random_state=seed)
    # anti-join: keep only the rows that are not in the test set
    ratings_train = pd.merge(ratings, ratings_test, indicator=True,
                             how='outer').query('_merge=="left_only"').drop(
        '_merge', axis=1)
    e0 = clock.restart()
    log.info("splitting test and train data takes %.3f secs", e0)
    return ratings_train, ratings_test
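# A small demonstration of the indicator-based anti-join used above to carve
# the training set out of the full ratings (toy data):
import pandas as pd

ratings = pd.DataFrame({'user': [1, 1, 2, 2], 'item': [10, 20, 10, 30],
                        'rating': [5, 3, 4, 2]})
test = ratings.iloc[[1, 3]]
train = pd.merge(ratings, test, indicator=True,
                 how='outer').query('_merge=="left_only"').drop('_merge', axis=1)
print(train)  # keeps rows (1, 10, 5) and (2, 10, 4)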
def predict_for_user(self, user, items=None, ratings=None):
    if items is not None:
        items = np.array(items)
    else:
        items = self.items.values

    clock = Timer()
    uidx = self.users.get_loc(user)
    Xt = self.user_components[[uidx], :]
    # transform Xt back to its original space: Xt.dot(Vt)
    pred = self.svd.inverse_transform(Xt)
    e0 = clock.restart()
    pred = pred.flatten()
    e1 = clock.restart()
    df = scores_to_series(pred, self.items, items)
    e2 = clock.restart()
    if self.bias is not None:
        bias_scores = self.bias.predict_for_user(user, items)
        df += bias_scores
    e3 = clock.restart()
    # print(e0, e1, e2, e3)
    return df
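# A self-contained sketch of the reconstruction step above with scikit-learn's
# TruncatedSVD: project a user's rating row into the latent space, then map it
# back via inverse_transform (i.e. Xt.dot(Vt)). The toy matrix is made up.
import numpy as np
from sklearn.decomposition import TruncatedSVD

rmat = np.array([[5.0, 3.0, 0.0, 1.0],
                 [4.0, 0.0, 0.0, 1.0],
                 [1.0, 1.0, 0.0, 5.0]])
svd = TruncatedSVD(n_components=2, random_state=0)
user_components = svd.fit_transform(rmat)      # users in latent space
pred = svd.inverse_transform(user_components[[0], :]).flatten()
print(pred)  # dense score estimates for user 0 over all items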
"item %s does not have enough neighbors (%s < %s)", item, len(item_idx), min_nn) result[item] = np.nan continue item_idx = item_idx[:max_nn] e1 = clock.restart() score = _nn_score(item_scores, item_sim, ipos, item_idx, item_bias) e2 = clock.restart() # print(e0, e1, e2) result[item] = score df = pd.Series(result) return df if __name__ == '__main__': LogUtil.configLog() ratings, users, movies = load_movielen_data() bias = Bias() itemcf = ItemCF(min_threshold=0.1, min_nn=5, bias=bias, popularity=0.5) print(itemcf.get_params()) itemcf.fit(ratings) user = 1 movies = list(movies.item.astype(int)) clock = Timer() for i in range(10): df = itemcf.predict_for_user(user, movies) print(clock.restart()) print(df.describe())
if __name__ == '__main__':
    LogUtil.configLog()
    ratings, users, movies = load_movielen_data()
    bias = Bias()
    model = BiasedSVD(n_iter=40, n_factor=20, bias=bias)
    print(model.get_params())
    model.fit(ratings)
    user = 1
    movies = list(movies.item.astype(int))
    clock = Timer()
    for i in range(10):
        df = model.predict_for_user(user, movies)
        # print(clock.restart())
    print(df.describe())
def _load(self):
    c = Timer()
    with open(self.filename, 'rb') as infile:
        self.user_sim_matrix = pickle.load(infile)
    e = c.restart()
    self.log.info("loading user_sim_matrix from %s takes %.3f", self.filename, e)
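# A minimal round trip mirroring _save/_load above (the file name here is
# arbitrary for the example):
import pickle
import numpy as np

sim = np.eye(3)
with open('user_sim_matrix.pkl', 'wb') as outfile:
    pickle.dump(sim, outfile)
with open('user_sim_matrix.pkl', 'rb') as infile:
    restored = pickle.load(infile)
assert (restored == sim).all()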