Example #1
    def _train(self):
        self.init_latent_factor()
        lr = self.learning_rate
        rr = self.regularization_rate
        epochs = self.epochs
        clock = Timer()
        rmat_array = self.rmat.toarray()

        for epoch in range(epochs):
            self.log.info("epoch {} started: ".format(epoch))
            for uid in range(rmat_array.shape[0]):
                user = self.users[uid]
                ratings = rmat_array[uid, :]
                user_items = self.items[np.argwhere(ratings != 0).flatten()]
                select_samples = self.select_negatives(user_items)
                for item, rui in select_samples.items():
                    err = rui - self.predict(user, item)
                    iid = self.items.get_loc(item)
                    # copy: row slicing returns a view, and the user row is
                    # updated below before the item update reads it
                    user_latent = self.user_p[uid, :].copy()
                    movie_latent = self.item_q[iid, :]

                    # gradient descent
                    self.user_p[uid, :] += lr * (err * movie_latent -
                                                 rr * user_latent)
                    self.item_q[iid, :] += lr * (err * user_latent -
                                                 rr * movie_latent)

            e0 = clock.restart()
            loss = self.loss()
            e1 = clock.restart()
            self.log.info("loss: {}".format(loss))
            self.log.info("time elapsed: {}, {}".format(e0, e1))
Example #2
    def eval(self, x_val):
        """
        evaluate test data with RMSE and MAE
        :param x_val: is dataframe  with 'user', 'item', 'rating'
        :return: RMSE, MAE
        """
        clock = Timer()
        self.log.info("start evaluating with %d test samples ...",
                      x_val.shape[0])
        group = x_val.groupby('user')
        frames = []
        for user, df in tqdm(group):
            actual = df[['item', 'rating']].set_index('item')['rating']
            pred = self.predict_for_user(user, items=actual.index)
            frames.append(pd.DataFrame({'pred': pred, 'actual': actual}))
        # DataFrame.append was removed in pandas 2.0; collect and concat instead
        df_summary = pd.concat(frames)

        rmse = RMSE(df_summary.pred, df_summary.actual)
        mae = MAE(df_summary.pred, df_summary.actual)

        e0 = clock.restart()
        self.log.info("rmse: %.3f, mae: %.3f", rmse, mae)
        self.log.info("evaluation takes %.3f", e0)
        return rmse, mae
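
RMSE and MAE are not defined in this snippet; a minimal sketch of what they presumably compute, skipping items that received no prediction (the signatures are an assumption):

import numpy as np

def RMSE(pred, actual):
    # root-mean-square error over non-NaN predictions
    mask = pred.notna()
    return float(np.sqrt(np.mean((pred[mask] - actual[mask]) ** 2)))

def MAE(pred, actual):
    # mean absolute error over non-NaN predictions
    mask = pred.notna()
    return float(np.mean(np.abs(pred[mask] - actual[mask])))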
Example #3
    def _save(self):
        c = Timer()
        # cache trained parameter
        with open(self.filename, 'wb') as outfile:
            pickle.dump(self.user_sim_matrix, outfile)
        e = c.restart()

        self.log.info("saving user_sim_matrix to %s takes %.3f", self.filename, e)
Example #4
    def predict_for_user(self, user, items=None, ratings=None):
        if items is not None:
            items = np.array(items)
        else:
            items = self.items.values
        valid_mask = self.items.get_indexer(items) >= 0

        min_threshold = self.min_threshold
        min_nn = self.min_nn
        max_nn = self.max_nn
        item_sim = self.item_sim_matrix
        result = dict()
        if np.sum(~valid_mask) > 0:
            self.log.debug("user %s: %s are not valid", user,
                           items[~valid_mask])
            for e in items[~valid_mask]:
                result[e] = np.nan

        items = items[valid_mask]
        upos = self.users.get_loc(user)
        item_bias = None
        if self.bias is not None:
            item_bias = self.bias.get_item_bias()

        assert self.rmat.getformat() == 'csr'
        item_scores = _get_xs(self.rmat, upos)
        # narrow down to the items this user has rated; .flatten() keeps the
        # index 1-D so the fancy indexing below stays 1-D as well
        valid_item_index = np.argwhere(item_scores != 0).flatten()
        for item in items:
            ipos = self.items.get_loc(item)

            clock = Timer()
            # neighbor indices sorted by descending similarity to this item
            sorted_idx = np.argsort(item_sim[ipos, valid_item_index])[::-1]
            item_idx = valid_item_index[sorted_idx]
            e0 = clock.restart()

            # similarities must exceed min_threshold
            if min_threshold is not None:
                item_idx = item_idx[item_sim[ipos, item_idx] > min_threshold]
            if len(item_idx) < min_nn:
                self.log.debug(
                    "item %s does not have enough neighbors (%s < %s)", item,
                    len(item_idx), min_nn)
                result[item] = np.nan
                continue

            item_idx = item_idx[:max_nn]
            e1 = clock.restart()
            score = _nn_score(item_scores, item_sim, ipos, item_idx, item_bias)
            e2 = clock.restart()
            # print(e0, e1, e2)
            result[item] = score

        df = pd.Series(result)
        return df
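
_nn_score itself is not shown. It presumably computes the standard similarity-weighted average over the selected neighbors; a hedged sketch (the body, including the bias handling, is an assumption):

import numpy as np

def _nn_score(item_scores, item_sim, ipos, item_idx, item_bias=None):
    # hypothetical: similarity-weighted average of the user's neighbor ratings
    sims = item_sim[ipos, item_idx]
    ratings = item_scores[item_idx]
    if item_bias is not None:
        ratings = ratings - item_bias[item_idx]   # de-bias neighbor ratings
    score = sims.dot(ratings) / np.abs(sims).sum()
    if item_bias is not None:
        score += item_bias[ipos]                  # restore the target item's bias
    return score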
Example #5
    def predict_for_user_numba(self, user, items=None, ratings=None):
        """
        Doesn't not seem to improve the performance !!!
        who can make it better ?
        :param user: user id
        :param items: a list of item ids
        :param ratings:
        :return:
        """

        min_threshold = self.min_threshold
        min_nn = self.min_nn
        max_nn = self.max_nn
        clock = Timer()
        rmat_c = self.rmat.tocsc()
        e0 = clock.restart()
        if items is not None:
            items = np.array(items)
        else:
            items = self.items.values
        e1 = clock.restart()
        items_idx = self.items.get_indexer(items)
        e2 = clock.restart()
        upos = self.users.get_loc(user)
        e3 = clock.restart()
        usims = self.user_sim_matrix[upos, :]
        e4 = clock.restart()
        result = _score(rmat_c, usims, items_idx, min_threshold, min_nn, max_nn)
        e5 = clock.restart()
        # print(e0, e1, e2, e3, e4, e5)

        return pd.Series(result, index=items)
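
_score is the numba-compiled kernel and is not shown here. A plain-Python sketch of what it presumably does, scoring each item from the most similar users who rated it (the body is an assumption, not the compiled code):

import numpy as np

def _score_py(rmat_c, usims, items_idx, min_threshold, min_nn, max_nn):
    # hypothetical pure-Python equivalent of the numba kernel
    result = np.full(len(items_idx), np.nan)
    for k, iidx in enumerate(items_idx):
        if iidx < 0:                          # item unknown to the model
            continue
        col = rmat_c.getcol(iidx)             # CSC column: users who rated it
        upos, ratings = col.indices, col.data
        sims = usims[upos]
        if min_threshold is not None:
            keep = sims > min_threshold
            ratings, sims = ratings[keep], sims[keep]
        if len(sims) < min_nn:
            continue                          # not enough neighbors
        top = np.argsort(sims)[::-1][:max_nn]
        result[k] = sims[top].dot(ratings[top]) / np.abs(sims[top]).sum()
    return result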
Example #6
def get_data(filename, columns, delimiter='::'):
    """
    :param filename: path of data source
    :param columns: column names for the resulting dataframe
    :param delimiter: delimiter to split a line
    :return: dataframe
    """
    log = LogUtil.getLogger('get_data')
    clock = Timer()
    with open(filename, 'r') as infile:
        data = infile.readlines()
        df = pd.DataFrame([row.rstrip().split(delimiter) for row in data],
                          columns=columns)

    e0 = clock.restart()
    log.info("loading data from %s with columns %s takes %.3f secs  ",
             filename, columns, e0)
    return df
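
Typical usage against the MovieLens 1M dumps, which are ::-delimited (the paths are illustrative; every column comes back as a string, since rows are split from raw text):

ratings = get_data('ml-1m/ratings.dat',
                   ['user', 'item', 'rating', 'timestamp'])
ratings['rating'] = ratings['rating'].astype(float)   # cast from string
print(ratings.head())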
Example #7
    def fit(self, origin_data):
        clock = Timer()
        self.process_data(origin_data)
        e0 = clock.restart()
        self.log.info('loading init data takes %.3f secs...', e0)
        flag = self.filename is not None
        if flag and path.exists(self.filename):
            self._load()
            # return self in both branches so calls can be chained
            return self

        _ = clock.restart()
        self.log.info('start training ...')
        self._train()
        e2 = clock.restart()
        self.log.info('training takes %.3f secs ...', e2)
        if flag:
            self._save()
        return self
Example #8
def train_test_split(ratings, frac=0.1, group='user', seed=1):
    """
        split data into train and test by frac
        if group is provide, split date into train and test by frac in each group
    """
    log = LogUtil.getLogger('train_test_split')
    log.info("start splitting test and train data ...")
    clock = Timer()
    if group:
        ratings_test = ratings.groupby(group).apply(
            lambda x: x.sample(frac=frac, random_state=seed))
        ratings_test.index = ratings_test.index.droplevel(group)
    else:
        ratings_test = ratings.sample(frac=frac, random_state=seed)

    # anti-join: keep the rows of ratings that did not land in ratings_test
    ratings_train = pd.merge(ratings,
                             ratings_test,
                             indicator=True,
                             how='outer').query('_merge == "left_only"').drop(
                                 '_merge', axis=1)

    e0 = clock.restart()
    log.info("splitting test and train data takes %.3f secs", e0)
    return ratings_train, ratings_test
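
Usage sketch (note the anti-join above assumes rating rows are unique; duplicated rows would be dropped together):

ratings_train, ratings_test = train_test_split(ratings, frac=0.1,
                                               group='user', seed=1)
# roughly 10% of each user's ratings end up in the test set
print(len(ratings_test) / len(ratings))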
Example #9
    def predict_for_user(self, user, items=None, ratings=None):
        if items is not None:
            items = np.array(items)
        else:
            items = self.items.values
        clock = Timer()
        uidx = self.users.get_loc(user)
        Xt = self.user_components[[uidx], :]
        # transform Xt back to the original rating space: Xt.dot(Vt)
        pred = self.svd.inverse_transform(Xt)
        e0 = clock.restart()
        pred = pred.flatten()
        e1 = clock.restart()
        df = scores_to_series(pred, self.items, items)
        e2 = clock.restart()
        if self.bias is not None:
            bias_scores = self.bias.predict_for_user(user, items)
            df += bias_scores
        e3 = clock.restart()
        # print(e0, e1, e2, e3)
        return df
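
scores_to_series is not shown; it presumably indexes the dense score vector by the model's item order and reindexes onto the requested items, leaving NaN for unknown ones (a sketch under that assumption):

import pandas as pd

def scores_to_series(scores, model_items, items):
    # hypothetical helper: align raw scores with the requested items
    s = pd.Series(scores, index=model_items)
    return s.reindex(items)                 # unknown items become NaN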
Example #10
                    "item %s does not have enough neighbors (%s < %s)", item,
                    len(item_idx), min_nn)
                result[item] = np.nan
                continue

            item_idx = item_idx[:max_nn]
            e1 = clock.restart()
            score = _nn_score(item_scores, item_sim, ipos, item_idx, item_bias)
            e2 = clock.restart()
            # print(e0, e1, e2)
            result[item] = score

        df = pd.Series(result)
        return df


if __name__ == '__main__':
    LogUtil.configLog()
    ratings, users, movies = load_movielen_data()
    bias = Bias()
    itemcf = ItemCF(min_threshold=0.1, min_nn=5, bias=bias, popularity=0.5)
    print(itemcf.get_params())
    itemcf.fit(ratings)
    user = 1
    movies = list(movies.item.astype(int))
    clock = Timer()
    for i in range(10):
        df = itemcf.predict_for_user(user, movies)
        print(clock.restart())

    print(df.describe())
Example #11
    def predict_for_user(self, user, items=None, ratings=None):
        if items is not None:
            items = np.array(items)
        else:
            items = self.items.values
        clock = Timer()
        uidx = self.users.get_loc(user)
        Xt = self.user_components[[uidx], :]
        # transform Xt back to the original rating space: Xt.dot(Vt)
        pred = self.svd.inverse_transform(Xt)
        e0 = clock.restart()
        pred = pred.flatten()
        e1 = clock.restart()
        df = scores_to_series(pred, self.items, items)
        e2 = clock.restart()
        if self.bias is not None:
            bias_scores = self.bias.predict_for_user(user, items)
            df += bias_scores
        e3 = clock.restart()
        # print(e0, e1, e2, e3)
        return df


if __name__ == '__main__':

    LogUtil.configLog()

    ratings, users, movies = load_movielen_data()
    bias = Bias()
    model = BiasedSVD(n_iter=40, n_factor=20, bias=bias)
    print(model.get_params())
    model.fit(ratings)
    user = 1
    movies = list(movies.item.astype(int))
    clock = Timer()
    for i in range(10):
        df = model.predict_for_user(user, movies)
        # print(clock.restart())

    print(df.describe())
Example #12
    def _load(self):
        c = Timer()
        with open(self.filename, 'rb') as infile:
            self.user_sim_matrix = pickle.load(infile)
        e = c.restart()
        self.log.info("loading user_sim_matrix from %s takes %.3f secs",
                      self.filename, e)